# **Data Mining Project** 
# Task 4: Time series analysis
Pistolesi Veronica, Poli Francesca

## Libraries

In [None]:
# Mount Google Drive (Colab runtime only) so the dataset paths under
# /content/drive used by the loading cells are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import scipy
import matplotlib
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker

from collections import defaultdict
from scipy.stats import pearsonr
from scipy.stats import entropy
from datetime import datetime

## Data loading

In [None]:
# Load the cleaned tweets. lineterminator='\n' makes the parser split rows on '\n' only.
# NOTE(review): presumably needed because tweet text contains other line-break characters — confirm.
tweets = pd.read_csv('/content/drive/Shareddrives/DataMining/dataset/clean_tweets.csv', lineterminator='\n')

In [None]:
# Use the tweet id as the row index (rebinding instead of mutating in place).
tweets = tweets.set_index('id')

In [None]:
# Summarise column dtypes and memory footprint of the tweets frame.
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11278702 entries, 1 to 11712597
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   user_id         int64  
 1   retweet_count   float64
 2   reply_count     float64
 3   favorite_count  float64
 4   num_hashtags    float64
 5   num_urls        float64
 6   num_mentions    float64
 7   created_at      object 
 8   text            object 
dtypes: float64(6), int64(1), object(2)
memory usage: 860.5+ MB


In [None]:
# Order tweets by creation time. created_at is still a string column at this point;
# its 'YYYY-MM-DD HH:MM:SS' layout sorts chronologically as text.
tweets = tweets.sort_values(by='created_at')

In [None]:
# Per column: does it contain at least one missing value?
tweets.isna().any()

user_id           False
retweet_count     False
reply_count       False
favorite_count    False
num_hashtags      False
num_urls          False
num_mentions      False
created_at         True
text               True
dtype: bool

In [None]:
# Number of tweets without a user_id (grouping by user later requires none).
tweets['user_id'].isna().sum()

0

In [None]:
# Preview the first rows after sorting by created_at.
tweets.head()

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2828882,887281,0.0,0.0,0.0,0.0,0.0,0.0,2012-03-11 22:25:59,Back from eating at Jamon Jamon.
1946198,887281,0.0,0.0,0.0,0.0,0.0,0.0,2012-04-13 23:08:58,going to bed (alone)
384430,887281,0.0,0.0,0.0,0.0,0.0,0.0,2012-04-13 23:22:57,Zzzzzzzzzzzzzzzzz
3342826,887281,0.0,0.0,0.0,0.0,0.0,0.0,2012-04-14 19:09:05,"&lt;a href=""http://tinyurl.com/yxpscm""&gt;I'm ..."
7410260,887281,0.0,0.0,0.0,0.0,0.0,0.0,2012-04-15 17:18:43,Working on my LA Law Wiki


In [None]:
# (rows, columns) of the full tweets frame before any filtering.
tweets.shape

(11278702, 9)

In [None]:
# Load the per-user profile table; only user_id and bot are used further down.
users = pd.read_csv('/content/drive/Shareddrives/DataMining/dataset/users_profile.csv', lineterminator='\n')

In [None]:
# Summarise the columns, non-null counts and dtypes of the users table.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11508 entries, 0 to 11507
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             11508 non-null  int64  
 1   name                11507 non-null  object 
 2   lang                11508 non-null  object 
 3   bot                 11508 non-null  int64  
 4   user_created_at     11508 non-null  object 
 5   statuses_count      11508 non-null  int64  
 6   retweet_count       11508 non-null  float64
 7   reply_count         11508 non-null  float64
 8   favorite_count      11508 non-null  float64
 9   num_hashtags        11508 non-null  float64
 10  num_urls            11508 non-null  float64
 11  num_mentions        11508 non-null  float64
 12  max_retweet_count   11508 non-null  float64
 13  max_favorite_count  11508 non-null  float64
 14  std_retweet_count   11508 non-null  float64
 15  std_favorite_count  11508 non-null  float64
 16  acti

In [None]:
# Keep only the join key and the bot/human label.
users = users.loc[:, ["user_id", "bot"]]

## Data preparation step

In [None]:
# Parse the timestamp strings and keep only the calendar date (the time of day is
# discarded), because scores are aggregated per user and per day.
tweets['created_at'] = pd.to_datetime(tweets['created_at'], format='%Y-%m-%d %H:%M:%S').dt.date

In [None]:
# Keep only the tweets posted in 2019; rows with a missing date never match and are dropped.
# created_at holds datetime.date objects here, so it is converted back to datetime64 to use
# the vectorized .dt.year accessor instead of calling a Python lambda on every row.
tweets = tweets[pd.to_datetime(tweets['created_at']).dt.year == 2019]
tweets.head()

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9511311,2240858066,0.0,0.0,0.0,0.0,0.0,0.0,2019-01-01,tava me sentindo super mal esses dias
7649694,2240858066,0.0,0.0,0.0,0.0,0.0,0.0,2019-01-01,tava me sentindo super mal esses dias
5807283,2240858066,0.0,0.0,0.0,0.0,0.0,0.0,2019-01-01,Ãs vezes tudo o que a gente precisa pra se se...
6362374,2240858066,4.0,0.0,0.0,0.0,0.0,1.0,2019-01-01,RT @myh3ro: TO FAZENDO TODO MUNDO ASSISTIR SHE...
191836,494302461,1.0,0.0,0.0,0.0,0.0,0.0,2019-01-01,"Ã cosÃ¬, mente e cuore sono anarchici nella l..."


In [None]:
# Count the missing dates left after the 2019 filter (expected: 0).
# The previous `.any().sum()` collapsed to a single bool first, so it could only report 0 or 1.
tweets.created_at.isnull().sum()

0

In [None]:
# (rows, columns) of the tweets frame after keeping only 2019.
tweets.shape

(5068559, 9)

## SuccessScore

𝑆𝑢𝑐𝑐𝑒𝑠𝑠𝑆𝑐𝑜𝑟𝑒 = 𝐴𝑐𝑐𝑒𝑝𝑡𝑎𝑛𝑐𝑒𝑆𝑐𝑜𝑟𝑒 / (𝐷𝑖𝑓𝑓𝑢𝑠𝑖𝑜𝑛𝑆𝑐𝑜𝑟𝑒 + 0.1)

where

● 𝐴𝑐𝑐𝑒𝑝𝑡𝑎𝑛𝑐𝑒𝑆𝑐𝑜𝑟𝑒 = 𝑟𝑒𝑡𝑤𝑒𝑒𝑡_𝑐𝑜𝑢𝑛𝑡 + 𝑟𝑒𝑝𝑙𝑦_𝑐𝑜𝑢𝑛𝑡 + 𝑓𝑎𝑣𝑜𝑟𝑖𝑡𝑒_𝑐𝑜𝑢𝑛𝑡

● 𝐷𝑖𝑓𝑓𝑢𝑠𝑖𝑜𝑛𝑆𝑐𝑜𝑟𝑒 = 𝑛𝑢𝑚_ℎ𝑎𝑠ℎ𝑡𝑎𝑔𝑠 + 𝑛𝑢𝑚_𝑚𝑒𝑛𝑡𝑖𝑜𝑛𝑠 + 𝑛𝑢𝑚_𝑢𝑟𝑙𝑠

In [None]:
# Daily totals per user: one row per (user_id, created_at) with the summed counters.
# numeric_only=True keeps the free-text `text` column out of the aggregation explicitly;
# newer pandas versions no longer drop non-numeric columns silently in groupby sums.
tweets_for_score = tweets.groupby(['user_id', 'created_at']).sum(numeric_only=True)

### AcceptanceScore

In [None]:
# AcceptanceScore per (user, day) = retweets + replies + favorites summed over that day's tweets.
# The groupby is run once, on the three needed columns only, instead of aggregating the
# whole frame three separate times.
acceptance_parts = tweets.groupby(['user_id', 'created_at'])[['retweet_count', 'reply_count', 'favorite_count']].sum()
AcceptanceScore = acceptance_parts['retweet_count'] + acceptance_parts['reply_count'] + acceptance_parts['favorite_count']

In [None]:
# Inspect the per-(user, day) AcceptanceScore series.
print(AcceptanceScore)

user_id               created_at
-9223372036854775808  2019-01-24    0.0
                      2019-04-05    0.0
                      2019-04-29    0.0
                      2019-09-28    0.0
                      2019-10-13    0.0
                                   ... 
 9400000000           2019-10-30    1.0
 16000000000          2019-11-10    4.0
 97000000000          2019-05-08    0.0
 687000000000         2019-06-26    0.0
 20000000000000       2019-03-29    0.0
Length: 518093, dtype: float64


In [None]:
# Attach the score as a column. Assigning the Series itself lets pandas align on the
# (user_id, created_at) index; converting to a list relied on both objects happening
# to be in the same row order.
tweets_for_score['AcceptanceScore'] = AcceptanceScore

In [None]:
# Check that the AcceptanceScore column was added.
tweets_for_score.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,AcceptanceScore
user_id,created_at,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-9223372036854775808,2019-01-24,0.0,0.0,0.0,0.0,0.0,1.0,0.0
-9223372036854775808,2019-04-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9223372036854775808,2019-04-29,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9223372036854775808,2019-09-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9223372036854775808,2019-10-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### DiffusionScore + 0.1

In [None]:
# DiffusionScore + 0.1 per (user, day) = hashtags + mentions + urls summed over that day's
# tweets, plus the 0.1 offset used as denominator of SuccessScore.
# The groupby is run once, on the three needed columns only, instead of three full aggregations.
diffusion_parts = tweets.groupby(['user_id', 'created_at'])[['num_hashtags', 'num_mentions', 'num_urls']].sum()
DiffusionScore01 = diffusion_parts['num_hashtags'] + diffusion_parts['num_mentions'] + diffusion_parts['num_urls'] + 0.1

In [None]:
# Inspect the per-(user, day) DiffusionScore + 0.1 series.
print(DiffusionScore01)

user_id               created_at
-9223372036854775808  2019-01-24    1.1
                      2019-04-05    0.1
                      2019-04-29    0.1
                      2019-09-28    0.1
                      2019-10-13    0.1
                                   ... 
 9400000000           2019-10-30    0.1
 16000000000          2019-11-10    0.1
 97000000000          2019-05-08    0.1
 687000000000         2019-06-26    1.1
 20000000000000       2019-03-29    0.1
Length: 518093, dtype: float64


In [None]:
# Attach the denominator as a column. Assigning the Series itself lets pandas align on the
# (user_id, created_at) index rather than relying on positional order via a list.
tweets_for_score['DiffusionScore01'] = DiffusionScore01

In [None]:
# Check that the DiffusionScore01 column was added.
tweets_for_score.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,AcceptanceScore,DiffusionScore01
user_id,created_at,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-9223372036854775808,2019-01-24,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.1
-9223372036854775808,2019-04-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1
-9223372036854775808,2019-04-29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1
-9223372036854775808,2019-09-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1
-9223372036854775808,2019-10-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1


### SuccessScore

In [None]:
# SuccessScore = AcceptanceScore / (DiffusionScore + 0.1), element-wise per (user, day).
# The 0.1 offset is already included in the DiffusionScore01 column.
tweets_for_score['SuccessScore'] = tweets_for_score['AcceptanceScore'].div(tweets_for_score['DiffusionScore01'])

In [None]:
# Check that the SuccessScore column was added.
tweets_for_score.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,AcceptanceScore,DiffusionScore01,SuccessScore
user_id,created_at,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-9223372036854775808,2019-01-24,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.1,0.0
-9223372036854775808,2019-04-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0
-9223372036854775808,2019-04-29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0
-9223372036854775808,2019-09-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0
-9223372036854775808,2019-10-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0


In [None]:
# Distinct SuccessScore values, as a quick look at the spread of the score.
tweets_for_score.SuccessScore.unique()

array([   0.        ,   10.        ,  120.        , ...,  860.32258065,
       4082.85714286, 1866.36363636])

In [None]:
# Number of (user, day) rows and columns in the aggregated frame.
tweets_for_score.shape

(518093, 9)

## TimeSeries preparation

In [None]:
# Keep only the SuccessScore, still indexed by (user_id, created_at).
tweets_scores = tweets_for_score.loc[:, 'SuccessScore']

In [None]:
# Preview the SuccessScore series.
tweets_scores.head()

user_id               created_at
-9223372036854775808  2019-01-24    0.0
                      2019-04-05    0.0
                      2019-04-29    0.0
                      2019-09-28    0.0
                      2019-10-13    0.0
Name: SuccessScore, dtype: float64

In [None]:
# Turn the Series into a one-column DataFrame for the reshaping cells that follow.
tweets_scores = tweets_scores.to_frame()

In [None]:
# Preview the one-column frame.
tweets_scores.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SuccessScore
user_id,created_at,Unnamed: 2_level_1
-9223372036854775808,2019-01-24,0.0
-9223372036854775808,2019-04-05,0.0
-9223372036854775808,2019-04-29,0.0
-9223372036854775808,2019-09-28,0.0
-9223372036854775808,2019-10-13,0.0


In [None]:
# Preview the wide layout: one row per day, one column per user.
# NaN marks the days for which a user has no aggregated row.
tweets_scores.unstack(['user_id'])

Unnamed: 0_level_0,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore,SuccessScore
user_id,-9223372036854775808,0,3,5,6,8,9,17,20,31,...,2711226669,2717999764,2722021425,3000000000,6500000000,9400000000,16000000000,97000000000,687000000000,20000000000000
created_at,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-01-01,,,,,,,,,,,...,,,,,,,,,,
2019-01-02,,,,,,,,,,,...,,,,,,,,,,
2019-01-03,,,,,,,,,,,...,,,,,,,,,,
2019-01-04,,,,,,,,,,,...,,,,,,,,,,
2019-01-05,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-11-11,,,,,,,,,,,...,,,,,,,,,,
2019-11-12,,,,,,,,,,,...,,,,,,,,,,
2019-11-13,,,,,,,,,,,...,,,,,,,,,,
2019-11-14,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Build a complete day x user grid: unstack to wide form, mark every (day, user) pair
# without a score with the sentinel -1, then stack back to long form so that each user
# has a row for each day present in the data.
# NOTE(review): -1 is assumed to lie outside the range of real scores
# (the underlying counts are presumably non-negative) — confirm.
tweets_scores = tweets_scores.unstack(['user_id']).fillna(-1).stack(['user_id'])
display(tweets_scores)

Unnamed: 0_level_0,Unnamed: 1_level_0,SuccessScore
created_at,user_id,Unnamed: 2_level_1
2019-01-01,-9223372036854775808,-1.0
2019-01-01,0,-1.0
2019-01-01,3,-1.0
2019-01-01,5,-1.0
2019-01-01,6,-1.0
...,...,...
2019-11-15,9400000000,-1.0
2019-11-15,16000000000,-1.0
2019-11-15,97000000000,-1.0
2019-11-15,687000000000,-1.0


In [None]:
# The last date in the dataset is 2019-11-15, so the remaining 15 days of November
# and the 31 days of December are missing: 365 - 46 = 319 observed days.
# 2285316 rows / 319 days = 7164 users

In [None]:
# Move created_at out of the index into a regular column; user_id stays as the index.
tweets_scores = tweets_scores.reset_index(level='created_at')

In [None]:
# Order rows by user and then by day, so each user's rows form one chronological run.
tweets_scores = tweets_scores.sort_values(by=['user_id', 'created_at'])

In [None]:
# Number of distinct users before merging with the users dataset.
tweets_scores.index.nunique()

7163

In [None]:
# Show the long-format frame (user_id index, one row per user and day).
tweets_scores

Unnamed: 0_level_0,created_at,SuccessScore
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-9223372036854775808,2019-01-01,-1.0
-9223372036854775808,2019-01-02,-1.0
-9223372036854775808,2019-01-03,-1.0
-9223372036854775808,2019-01-04,-1.0
-9223372036854775808,2019-01-05,-1.0
...,...,...
20000000000000,2019-11-11,-1.0
20000000000000,2019-11-12,-1.0
20000000000000,2019-11-13,-1.0
20000000000000,2019-11-14,-1.0


To use the binary `bot` label later on, we merge `tweets_scores` with the users dataset.

In [None]:
# Attach the bot label to every (user, day) row. The inner join drops users that
# are not present in the users table.
tweets_scores = pd.merge(tweets_scores, users, how='inner', on="user_id")

In [None]:
# Preview the merged frame (user_id, created_at, SuccessScore, bot).
tweets_scores.head()

Unnamed: 0,user_id,created_at,SuccessScore,bot
0,722623,2019-01-01,-1.0,0
1,722623,2019-01-02,-1.0,0
2,722623,2019-01-03,1.549296,0
3,722623,2019-01-04,1626.393443,0
4,722623,2019-01-05,1.818182,0


In [None]:
# (rows, columns) after the merge with the users table.
tweets_scores.shape

(2165053, 4)

In [None]:
# The last date in the dataset is 2019-11-15, so the remaining 15 days of November
# and the 31 days of December are missing: 365 - 46 = 319 observed days.
# 2165053 rows / 319 days = 6787 users

In [None]:
# Number of distinct users after merging with the users dataset.
tweets_scores['user_id'].nunique()

6787

In [None]:
# Re-establish the (user, day) ordering after the merge.
tweets_scores = tweets_scores.sort_values(by=['user_id', 'created_at'])

In [None]:
# Show the merged, sorted frame.
tweets_scores

Unnamed: 0,user_id,created_at,SuccessScore,bot
0,722623,2019-01-01,-1.000000,0
1,722623,2019-01-02,-1.000000,0
2,722623,2019-01-03,1.549296,0
3,722623,2019-01-04,1626.393443,0
4,722623,2019-01-05,1.818182,0
...,...,...,...,...
2165048,2722021425,2019-11-11,-1.000000,0
2165049,2722021425,2019-11-12,-1.000000,0
2165050,2722021425,2019-11-13,-1.000000,0
2165051,2722021425,2019-11-14,-1.000000,0


In [None]:
# (user, day) rows for which a SuccessScore was actually computed, i.e. not the -1 filler.
tweets_scores[tweets_scores['SuccessScore'].ne(-1.0)]

Unnamed: 0,user_id,created_at,SuccessScore,bot
2,722623,2019-01-03,1.549296,0
3,722623,2019-01-04,1626.393443,0
4,722623,2019-01-05,1.818182,0
5,722623,2019-01-06,0.322581,0
6,722623,2019-01-07,11.526718,0
...,...,...,...,...
2164942,2722021425,2019-07-28,4.545455,0
2164943,2722021425,2019-07-29,3.636364,0
2164944,2722021425,2019-07-30,1.463415,0
2164945,2722021425,2019-07-31,2683.636364,0


In [None]:
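# 517656 (user, day) rows carry a computed SuccessScore; these are daily per-user aggregates, not single tweets
# (517656 / 319 observed days ≈ 1623 active users per day in this dataset)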

In [None]:
# 2019-11-15  --> last date on dataset
# 15 (from Nov) + 31 (from Dec) = 46 missing days  [365-46 = 319 actual days]

In [None]:
# First block of 319 rows = the full daily series of the first user.
tweets_scores.iloc[:319]

Unnamed: 0,user_id,created_at,SuccessScore,bot
0,722623,2019-01-01,-1.000000,0
1,722623,2019-01-02,-1.000000,0
2,722623,2019-01-03,1.549296,0
3,722623,2019-01-04,1626.393443,0
4,722623,2019-01-05,1.818182,0
...,...,...,...,...
314,722623,2019-11-11,-1.000000,0
315,722623,2019-11-12,-1.000000,0
316,722623,2019-11-13,-1.000000,0
317,722623,2019-11-14,-1.000000,0


In [None]:
# Next block of 319 rows = the full daily series of the second user.
tweets_scores.iloc[319:638]

Unnamed: 0,user_id,created_at,SuccessScore,bot
319,755746,2019-01-01,-1.0,0
320,755746,2019-01-02,-1.0,0
321,755746,2019-01-03,0.0,0
322,755746,2019-01-04,0.0,0
323,755746,2019-01-05,0.0,0
...,...,...,...,...
633,755746,2019-11-11,-1.0,0
634,755746,2019-11-12,-1.0,0
635,755746,2019-11-13,-1.0,0
636,755746,2019-11-14,-1.0,0


In [None]:
# Number of bot accounts, counted as distinct user_ids rather than as row count / 319,
# so the result does not depend on every user having exactly 319 rows.
tweets_scores.loc[tweets_scores.bot == 1, 'user_id'].nunique()

3875.0

In [None]:
# Number of human accounts, counted as distinct user_ids rather than as row count / 319,
# so the result does not depend on every user having exactly 319 rows.
tweets_scores.loc[tweets_scores.bot == 0, 'user_id'].nunique()

2912.0

In [None]:
# Sanity check: humans + bots must add up to the total number of users.
# Distinct user_ids are counted directly instead of dividing row counts by a hard-coded 319.
tweets_scores.loc[tweets_scores.bot == 0, 'user_id'].nunique() + tweets_scores.loc[tweets_scores.bot == 1, 'user_id'].nunique()

6787.0

In [None]:
# Persist the long-format scores (user_id, created_at, SuccessScore, bot) to the shared drive.
tweets_scores.to_csv('/content/drive/Shareddrives/DataMining/dataset/tweets_scores.csv', index=False)

In [None]:
# Collapse each user's daily scores into one list per user. The lists are chronological
# because the frame was sorted by (user_id, created_at) beforehand.
scores = tweets_scores.groupby('user_id')['SuccessScore'].agg(list)
scores.head()

user_id
722623     [-1.0, -1.0, 1.5492957746478875, 1626.39344262...
755746     [-1.0, -1.0, 0.0, 0.0, 0.0, 0.1639344262295082...
806975     [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....
887281     [-1.0, -1.0, 0.0, -1.0, -1.0, -1.0, -1.0, -1.0...
1382561    [-1.0, -1.0, 60.094007050528795, 0.44806517311...
Name: SuccessScore, dtype: object

In [None]:
# Turn the Series of lists into a one-column DataFrame so it can be merged with users.
scores = scores.to_frame()

In [None]:
# One row per user: the score time series (as a list) together with the bot label.
ts_users = pd.merge(scores, users, on='user_id')

In [None]:
# Preview the per-user time-series table.
ts_users.head()

Unnamed: 0,user_id,SuccessScore,bot
0,722623,"[-1.0, -1.0, 1.5492957746478875, 1626.39344262...",0
1,755746,"[-1.0, -1.0, 0.0, 0.0, 0.0, 0.1639344262295082...",0
2,806975,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",0
3,887281,"[-1.0, -1.0, 0.0, -1.0, -1.0, -1.0, -1.0, -1.0...",0
4,1382561,"[-1.0, -1.0, 60.094007050528795, 0.44806517311...",0


In [None]:
#ts_users['SuccessScore'][0]