In [20]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # to print multiple outputs from the same cell
import math
import utils
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from operator import index
from collections import defaultdict
from scipy.stats import pearsonr
from datetime import datetime, timedelta

# Indicator Creation

In this notebook we will create new interesting features which we believe to be relevant for clustering purposes. The indicators we will try to create in this notebook are:
1. Account age in days (From account creation till the release of this dataset) (datetime(2022,9,29,11,0,0))
2. Account tweets number
3. Account highest daily tweet count
5. Account average twitter text length
6. Account tweet number over periods of time (e.g. Covid-19)
7. Account tweet hashtag average
8. Account discussion creation score (Tweet to retweet ratio)
9. Account average number of mentions per tweet
10. Account average tweet text length
11. Account average number of special characters in tweets
12. Account's Entropy
13. Number of Tweets
14. Number of likes and comments
15. Ratio between number of tweets and number of likes

In [21]:
users_df = pd.read_csv("dataset/users_dataset_cleaned.csv")
merged_df = pd.read_csv("dataset/merged_dataset.csv")

  merged_df = pd.read_csv("dataset/merged_dataset.csv")


## 1. Account age in days
Creating the account age in days feature and adding it to the dataframe.

In [28]:
users_df["created_at"] = pd.to_datetime(users_df["created_at"], errors="coerce") # Even though this is already done in the previous notebook, this information is turned to string when converting the merged_df to csv
release_of_dataset = datetime(2022,9,29,11,0,0)
users_df["account_age_in_days"] = (release_of_dataset - users_df['created_at']).dt.days

users_df.head()

# Keeping info in merged_df too for further calculations
merged_df["account_created"] = pd.to_datetime(merged_df["account_created"], errors="coerce")
merged_df["account_age_in_days"] = (release_of_dataset - merged_df['account_created']).dt.days

# 2. Account number of tweets

Creating the account average tweets per day and adding it to the dataframe.

In [23]:
users_df = users_df.merge(merged_df.groupby('user_id').size().reset_index(name='counts'), how="left")
users_df.rename(columns={"counts": "number_of_tweets"}, inplace=True)
users_df.head()

Unnamed: 0,user_id,name,lang,bot,created_at,statuses_count,account_age_in_days,number_of_tweets
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,1314,
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,1311,
2,137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,3,2709,
3,466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,2080,
4,2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085,1198,


# 2. Account average tweets per day

Creating the account average tweets per day and adding it to the dataframe.

In [33]:
# list_of_average_tweets_per_day = []

# for elem in merged_df["statuses_count"]:
#     try:
#         tweets_made = elem
#         age_in_days = merged_df["account_age_in_days"][elem]
#         average_tweets_per_day = int(tweets_made)/int(age_in_days)
#         list_of_average_tweets_per_day.append(average_tweets_per_day)
#     except KeyError:
#         list_of_average_tweets_per_day.append(np.nan)

merged_df["account_average_tweets_per_day"] = merged_df["statuses_count"]/merged_df["account_age_in_days"]

# merged_df["account_average_tweets_per_day"] = list_of_average_tweets_per_day
merged_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3455680 entries, 0 to 3455679
Data columns (total 17 columns):
 #   Column                          Dtype         
---  ------                          -----         
 0   user_id                         object        
 1   name                            object        
 2   lang                            object        
 3   bot                             float64       
 4   account_created                 datetime64[ns]
 5   statuses_count                  float64       
 6   tweet_id                        float64       
 7   retweet_count                   float64       
 8   reply_count                     float64       
 9   favorite_count                  float64       
 10  num_hashtags                    float64       
 11  num_urls                        float64       
 12  num_mentions                    float64       
 13  tweet_created                   object        
 14  text                            object        
 15

Unnamed: 0,user_id,name,lang,bot,account_created,statuses_count,tweet_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,tweet_created,text,account_age_in_days,account_average_tweets_per_day


Idea, isn't better to add this attribute to the users_df ? Because it's a user caractherization, and having it in the merged_df is gonna be redundant

In [34]:
# list_of_average_tweets_per_day = []

# for elem in users_df["statuses_count"]:
#     try:
#         tweets_made = elem
#         age_in_days = users_df["account_age_in_days"][elem]
#         average_tweets_per_day = int(tweets_made)/int(age_in_days)
#         list_of_average_tweets_per_day.append(average_tweets_per_day)
#     except KeyError:
#         list_of_average_tweets_per_day.append(np.nan)

users_df["account_average_tweets_per_day"] = users_df["statuses_count"]/users_df["account_age_in_days"]

# users_df["account_average_tweets_per_day"] = list_of_average_tweets_per_day
users_df.head()

Unnamed: 0,user_id,name,lang,bot,created_at,statuses_count,account_age_in_days,number_of_tweets,account_average_tweets_per_day
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,1314,,0.057839
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,1311,,0.04119
2,137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,3,2709,,0.001107
3,466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,2080,,0.024038
4,2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085,1198,,5.914023


We need to know the tweet age in days, the average number of tweets between the days that the user used twitter

In [38]:
# Creating the tweet age in days (for each) till the release of dataset
merged_df['tweet_created'] = merged_df['tweet_created'].apply(pd.to_datetime)
release_of_dataset = datetime(2022,9,29,11,0,0)

merged_df['tweet_created_just_days'] = merged_df['tweet_created'].dt.date
merged_df["tweet_created_just_days"] = pd.to_datetime(merged_df["tweet_created_just_days"], errors="coerce")
merged_df['tweet_age_in_days'] = (release_of_dataset - merged_df['tweet_created_just_days']).dt.days

merged_df.drop(columns="tweet_created_just_days", inplace=True)
merged_df.head()

Unnamed: 0,user_id,name,lang,bot,account_created,statuses_count,tweet_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,tweet_created,text,account_age_in_days,account_average_tweets_per_day,tweet_age_in_days
0,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.867187e+17,0.0,0.0,0.0,0.0,0.0,1.0,2019-07-11 03:49:06,"@4fri2endly0 ""A business that makes nothing bu...",1314.0,0.057839,1176.0
1,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.795354e+17,0.0,0.0,0.0,0.0,0.0,0.0,2019-06-21 08:05:13,"""Happiness is not a station you arrive at, but...",1314.0,0.057839,1196.0
2,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.517009e+17,0.0,0.0,0.0,0.0,0.0,0.0,2019-04-05 12:41:00,Music flow.,1314.0,0.057839,1273.0
3,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.9192e+17,0.0,0.0,0.0,0.0,0.0,1.0,2019-07-25 12:17:31,@_SimplyKC follow meeee...,1314.0,0.057839,1162.0
4,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.605225e+17,0.0,0.0,0.0,0.0,0.0,0.0,2019-04-29 20:55:05,"""You are the only person on earth who can use ...",1314.0,0.057839,1249.0


Adding indicator for average number of tweets per day in which the account ACTUALLY tweeted - Gianluca 

In [None]:
tmp = merged_df.groupby(["user_id", "tweet_age_in_days"], as_index=False).size()

# computing count
tmp = pd.DataFrame(tmp[["user_id", "tweet_age_in_days", "name"]])
count_df = tmp.rename(columns={"name": "count"})

# computing average
tmp = count_df.groupby(["user_id"], as_index=False).mean()
tmp=pd.DataFrame(tmp[["user_id", "count"]])
avg_df = tmp.rename(columns={"count": "avg", "user_id": "id"})

In [None]:
# adding the computed avg to users dataframe
#users_df = users_df.merge(avg_df, on="id")
users_df.head()

## 3. Account highest daily tweet count

Creating the account highest daily tweet count and adding it too the dataframe.

In [None]:
# Trying to see how the group by num of days works
merged_df.groupby(["user_id", "tweet_age_in_days"], as_index=False).max()

In [None]:
# Need to finish this indicator another time - Tengel
""" 
account_high_score = 0

# Iterate through every user in the users_df and count all duplicated datetimes
for elem in users_df["id"]:
    single_account_tweets_df = pd.DataFrame([merged_df["user_id"] == elem, merged_df[]])
    merged_df.groupby("tweet_created")
"""