In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # to print multiple outputs from the same cell
import math
import utils
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from operator import index
from collections import defaultdict
from scipy.stats import pearsonr
from datetime import datetime, timedelta

# Indicator Creation

In this notebook we will create new interesting features which we believe to be relevant for clustering purposes. The indicators we will try to create in this notebook are:
1. Account age in days (From account creation till the release of this dataset) (datetime(2022,9,29,11,0,0))
2. Account average tweets per day
3. Account highest daily tweet count
4. Account average twitter text length
5. Account tweet hashtag average
6. Account discussion creation score (Tweet to retweet ratio)
7. Account average number of mentions per tweet

In [3]:
users_df = pd.read_csv("dataset/users.csv")
users_df.head(1)
merged_df = pd.read_csv("dataset/merged_dataset.csv")
merged_df.head(1)

Unnamed: 0,id,name,lang,bot,created_at,statuses_count
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76.0


  merged_df = pd.read_csv("dataset/merged_dataset.csv")


Unnamed: 0,user_id,name,lang,bot,account_created,statuses_count,tweet_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,tweet_created,text
0,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.867187e+17,0.0,0.0,0.0,0.0,0.0,1.0,2019-07-11 03:49:06,"@4fri2endly0 ""A business that makes nothing bu..."


In [4]:
merged_df.axes
merged_df.drop(columns="Unnamed: 0", inplace=True) # Dropping the first column of the dataset, consisting of an index copy
merged_df.axes

[RangeIndex(start=0, stop=3455680, step=1),
 Index(['user_id', 'name', 'lang', 'bot', 'account_created', 'statuses_count',
        'tweet_id', 'retweet_count', 'reply_count', 'favorite_count',
        'num_hashtags', 'num_urls', 'num_mentions', 'tweet_created', 'text'],
       dtype='object')]

KeyError: "['Unnamed: 0'] not found in axis"

## 1. Account age in days
Creating the account age in days feature and adding it to the dataframe.

In [33]:
merged_df["account_created"] = pd.to_datetime(merged_df["account_created"], errors="coerce") # Even though this is already done in the previous notebook, this information is turned to string when converting the merged_df to csv

release_of_dataset = datetime(2022,9,29,11,0,0)
#account_age_in_days = []

#for elem in merged_df["account_created"]:
#    age = release_of_dataset - elem
#    age_in_days = age.days
#    account_age_in_days.append(age_in_days)

#merged_df["account_age_in_days"] = account_age_in_days
merged_df['account_age_in_days'] = (release_of_dataset - merged_df['account_created']).dt.days

merged_df.head()

Unnamed: 0,user_id,name,lang,bot,account_created,statuses_count,tweet_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,tweet_created,text,account_age_in_days,account_average_tweets_per_day,tweet_age_in_days
0,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.867187e+17,0.0,0.0,0.0,0.0,0.0,1.0,2019-07-11 03:49:06,"@4fri2endly0 ""A business that makes nothing bu...",1314.0,0.057839,1176.0
1,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.795354e+17,0.0,0.0,0.0,0.0,0.0,0.0,2019-06-21 08:05:13,"""Happiness is not a station you arrive at, but...",1314.0,0.057839,1196.0
2,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.517009e+17,0.0,0.0,0.0,0.0,0.0,0.0,2019-04-05 12:41:00,Music flow.,1314.0,0.057839,1273.0
3,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.9192e+17,0.0,0.0,0.0,0.0,0.0,1.0,2019-07-25 12:17:31,@_SimplyKC follow meeee...,1314.0,0.057839,1162.0
4,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.605225e+17,0.0,0.0,0.0,0.0,0.0,0.0,2019-04-29 20:55:05,"""You are the only person on earth who can use ...",1314.0,0.057839,1249.0


Making the thing on the user_df aswell
In my opinion (Paul) it's better to make it on the users df too, cause it's faster to access if we just have to make plots on this attribute

In [32]:
users_df["created_at"] = pd.to_datetime(users_df["created_at"], errors="coerce") # Even though this is already done in the previous notebook, this information is turned to string when converting the merged_df to csv
release_of_dataset = datetime(2022,9,29,11,0,0)
users_df["account_age_in_days"] = (release_of_dataset - users_df['created_at']).dt.days

users_df.head()

Unnamed: 0,id,name,lang,bot,created_at,statuses_count,account_age_in_days,account_average_tweets_per_day
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76.0,1314,0.057927
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54.0,1311,0.020015
2,137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,3.0,2709,0.001442
3,466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50.0,2080,0.038197
4,2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085.0,1198,3.503956


# 2. Account average tweets per day

Creating the account average tweets per day and adding it to the dataframe.

In [7]:
list_of_average_tweets_per_day = []

for elem in merged_df["statuses_count"]:
    try:
        tweets_made = elem
        age_in_days = merged_df["account_age_in_days"][elem]
        average_tweets_per_day = int(tweets_made)/int(age_in_days)
        list_of_average_tweets_per_day.append(average_tweets_per_day)
    except KeyError:
        list_of_average_tweets_per_day.append(np.nan)

merged_df["account_average_tweets_per_day"] = list_of_average_tweets_per_day
merged_df.head()

Unnamed: 0,user_id,name,lang,bot,account_created,statuses_count,tweet_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,tweet_created,text,account_age_in_days,account_average_tweets_per_day
0,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.867187e+17,0.0,0.0,0.0,0.0,0.0,1.0,2019-07-11 03:49:06,"@4fri2endly0 ""A business that makes nothing bu...",1314.0,0.057839
1,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.795354e+17,0.0,0.0,0.0,0.0,0.0,0.0,2019-06-21 08:05:13,"""Happiness is not a station you arrive at, but...",1314.0,0.057839
2,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.517009e+17,0.0,0.0,0.0,0.0,0.0,0.0,2019-04-05 12:41:00,Music flow.,1314.0,0.057839
3,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.9192e+17,0.0,0.0,0.0,0.0,0.0,1.0,2019-07-25 12:17:31,@_SimplyKC follow meeee...,1314.0,0.057839
4,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.605225e+17,0.0,0.0,0.0,0.0,0.0,0.0,2019-04-29 20:55:05,"""You are the only person on earth who can use ...",1314.0,0.057839


Idea, isn't better to add this attribute to the users_df ? Because it's a user caractherization, and having it in the merged_df is gonna be redundant

In [8]:
list_of_average_tweets_per_day = []

for elem in users_df["statuses_count"]:
    try:
        tweets_made = elem
        age_in_days = users_df["account_age_in_days"][elem]
        average_tweets_per_day = int(tweets_made)/int(age_in_days)
        list_of_average_tweets_per_day.append(average_tweets_per_day)
    except KeyError:
        list_of_average_tweets_per_day.append(np.nan)

users_df["account_average_tweets_per_day"] = list_of_average_tweets_per_day
users_df.head()

Unnamed: 0,id,name,lang,bot,created_at,statuses_count,account_age_in_days,account_average_tweets_per_day
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76.0,1314,0.057927
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54.0,1311,0.020015
2,137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,3.0,2709,0.001442
3,466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50.0,2080,0.038197
4,2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085.0,1198,3.503956


## 3. Account highest daily tweet count

Creating the account highest daily tweet count and adding it too the dataframe.

In [31]:
#users_df["account_created"] = pd.to_datetime(users_df["created_at"], errors="coerce") # Even though this is already done in the previous notebook, this information is turned to string when converting the merged_df to csv
#users_df.info()

# Creating the tweet age in days (for each) till the release of dataset
release_of_dataset = datetime(2022,9,29,11,0,0)

merged_df['tweet_created_just_days'] = merged_df['tweet_created'].dt.date
merged_df["tweet_created_just_days"] = pd.to_datetime(merged_df["tweet_created_just_days"], errors="coerce")
merged_df['tweet_age_in_days'] = (release_of_dataset - merged_df['tweet_created_just_days']).dt.days

merged_df.drop(columns="tweet_created_just_days", inplace=True)
merged_df.head()

Unnamed: 0,user_id,name,lang,bot,account_created,statuses_count,tweet_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,tweet_created,text,account_age_in_days,account_average_tweets_per_day,tweet_age_in_days
0,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.867187e+17,0.0,0.0,0.0,0.0,0.0,1.0,2019-07-11 03:49:06,"@4fri2endly0 ""A business that makes nothing bu...",1314.0,0.057839,1176.0
1,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.795354e+17,0.0,0.0,0.0,0.0,0.0,0.0,2019-06-21 08:05:13,"""Happiness is not a station you arrive at, but...",1314.0,0.057839,1196.0
2,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.517009e+17,0.0,0.0,0.0,0.0,0.0,0.0,2019-04-05 12:41:00,Music flow.,1314.0,0.057839,1273.0
3,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.9192e+17,0.0,0.0,0.0,0.0,0.0,1.0,2019-07-25 12:17:31,@_SimplyKC follow meeee...,1314.0,0.057839,1162.0
4,2353593986,Lamonica Raborn,en,1.0,2019-02-22 18:00:42,76.0,4.605225e+17,0.0,0.0,0.0,0.0,0.0,0.0,2019-04-29 20:55:05,"""You are the only person on earth who can use ...",1314.0,0.057839,1249.0


In [38]:
# Trying to see how the group by num of days works
merged_df.groupby(["user_id", "tweet_age_in_days"], as_index=False).max()

Unnamed: 0,user_id,tweet_age_in_days,name,lang,bot,account_created,statuses_count,tweet_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,tweet_created,text,account_age_in_days,account_average_tweets_per_day
0,1382561,-6410.0,jojoko,en,0.0,2012-03-18 23:24:35,8236.0,3.781420e+17,0.0,0.0,0.0,0.0,0.0,0.0,2040-04-17 06:15:02,Netflix is rebooting Full House!,3846.0,4.976435
1,1382561,-6408.0,jojoko,en,0.0,2012-03-18 23:24:35,8236.0,6.829975e+14,0.0,0.0,0.0,0.0,0.0,0.0,2040-04-15 22:41:21,Iced tea on a spring day in the garden with a ...,3846.0,4.976435
2,1382561,-6376.0,jojoko,en,0.0,2012-03-18 23:24:35,8236.0,3.400646e+14,0.0,0.0,0.0,0.0,0.0,0.0,2040-03-14 18:45:02,@surreal_killer link me,3846.0,4.976435
3,1382561,-6375.0,jojoko,en,0.0,2012-03-18 23:24:35,8236.0,2.916868e+09,0.0,0.0,0.0,0.0,0.0,1.0,2040-03-13 22:10:45,@walkngonawire did you hear Emma Thompson is t...,3846.0,4.976435
4,1382561,-6368.0,jojoko,en,0.0,2012-03-18 23:24:35,8236.0,9.838085e+14,0.0,0.0,0.0,0.0,0.0,1.0,2040-03-06 21:24:33,@Rawbie I don't know :(,3846.0,4.976435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623996,999662546,1599.0,Melody Naylor,en,0.0,2017-12-11 16:46:43,1543.0,3.336750e+17,1.0,0.0,0.0,0.0,0.0,1.0,2018-05-14 20:08:14,RT @laurnaylor: Happy Mother's day mommy! I lo...,1752.0,1.287980
623997,999662546,1633.0,Melody Naylor,en,0.0,2017-12-11 16:46:43,1543.0,3.210779e+17,0.0,0.0,0.0,0.0,0.0,1.0,2018-04-10 01:51:45,@blakeshelton or 3!,1752.0,1.287980
623998,999662546,1652.0,Melody Naylor,en,0.0,2017-12-11 16:46:43,1543.0,3.141978e+17,0.0,0.0,0.0,0.0,0.0,1.0,2018-03-22 02:12:36,@WiseManSi \nME!!!!,1752.0,1.287980
623999,999662546,1675.0,Melody Naylor,en,0.0,2017-12-11 16:46:43,1543.0,3.058401e+17,0.0,0.0,0.0,0.0,0.0,2.0,2018-02-27 00:42:12,@blakeshelton are you and @mirandalambert next...,1752.0,1.287980


In [11]:
# Need to finish this indicator another time - Tengel
""" 
account_high_score = 0

# Iterate through every user in the users_df and count all duplicated datetimes
for elem in users_df["id"]:
    single_account_tweets_df = pd.DataFrame([merged_df["user_id"] == elem, merged_df[]])
    merged_df.groupby("tweet_created")
"""

' \naccount_high_score = 0\n\n# Iterate through every user in the users_df and count all duplicated datetimes\nfor elem in users_df["id"]:\n    single_account_tweets_df = pd.DataFrame([merged_df["user_id"] == elem, merged_df[]])\n    merged_df.groupby("tweet_created")\n'