In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' # to print multiple outputs from the same cell
import math
import utils
import pandas as pd
from datetime import datetime, timedelta, date

### Let's load the merged dataframe

In [3]:
# Read the cleaned, merged user/tweet table from disk.
# Rows are split on '\n' only (explicit line terminator).
merged_df = pd.read_csv(
    filepath_or_buffer='dataset/merged_dataset_cleaned.csv',
    lineterminator='\n',
)
merged_df

Unnamed: 0,user_id,name,lang,bot,account_created,statuses_count,tweet_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,tweet_created,text
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,486718663223828480,0,0,0,0,0,1,2019-07-11 03:49:06,"@4fri2endly0 ""A business that makes nothing bu..."
1,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,479535357126393856,0,0,0,0,0,0,2019-06-21 08:05:13,"""Happiness is not a station you arrive at, but..."
2,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,451700895491706880,0,0,0,0,0,0,2019-04-05 12:41:00,Music flow.
3,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,491920040405397504,0,0,0,0,0,1,2019-07-25 12:17:31,@_SimplyKC follow meeee...
4,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,460522543628046336,0,0,0,0,0,0,2019-04-29 20:55:05,"""You are the only person on earth who can use ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10460632,933183398,Corvanna,en,0,2017-11-09 23:24:16,5279,591409992605028352,1,0,0,0,0,1,2020-04-25 01:15:04,RT @_MarcussA: Phone dry
10460633,933183398,Corvanna,en,0,2017-11-09 23:24:16,5279,570728918614974464,0,0,0,0,0,0,2020-02-27 23:35:52,@_Freak_x1
10460634,933183398,Corvanna,en,0,2017-11-09 23:24:16,5279,582259121648320512,1,0,0,0,0,0,2020-03-30 19:12:47,Something wrong
10460635,933183398,Corvanna,en,0,2017-11-09 23:24:16,5279,591238256626380800,0,0,0,0,0,0,2020-04-24 13:52:39,always going over my data


### Keep only the tweets posted in 2019

In [4]:
# Year whose tweets are kept for the rest of the analysis.
year = 2019

# Parse the creation timestamps. With errors="coerce", values that cannot be
# parsed become NaT; they then fail the year comparison and are dropped below.
merged_df["tweet_created"] = pd.to_datetime(merged_df["tweet_created"], errors="coerce")

# Boolean mask of the rows whose tweet was created in the selected year.
is_selected_year = merged_df["tweet_created"].dt.year == year

# merged_df is rebound to the filtered subset (original row labels are kept).
merged_df = merged_df.loc[is_selected_year]
merged_df

Unnamed: 0,user_id,name,lang,bot,account_created,statuses_count,tweet_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,tweet_created,text
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,486718663223828480,0,0,0,0,0,1,2019-07-11 03:49:06,"@4fri2endly0 ""A business that makes nothing bu..."
1,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,479535357126393856,0,0,0,0,0,0,2019-06-21 08:05:13,"""Happiness is not a station you arrive at, but..."
2,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,451700895491706880,0,0,0,0,0,0,2019-04-05 12:41:00,Music flow.
3,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,491920040405397504,0,0,0,0,0,1,2019-07-25 12:17:31,@_SimplyKC follow meeee...
4,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,460522543628046336,0,0,0,0,0,0,2019-04-29 20:55:05,"""You are the only person on earth who can use ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10457175,2383025796,Minnie Guadagno,en,1,2019-03-13 02:44:13,42,460078189067239360,0,0,0,0,0,0,2019-04-28 15:29:22,make up money could be going to so many other ...
10457176,2383025796,Minnie Guadagno,en,1,2019-03-13 02:44:13,42,461515583813476416,1,0,1,0,0,1,2019-05-02 14:41:04,@dilligaf_life !
10457177,2383025796,Minnie Guadagno,en,1,2019-03-13 02:44:13,42,982032609,0,0,0,0,0,0,2019-07-12 21:47:31,"@LottoRackz478 ""What we have learned from othe..."
10457178,2383025796,Minnie Guadagno,en,1,2019-03-13 02:44:13,42,459627644548440064,0,0,0,0,0,0,2019-04-27 09:39:04,I Thank God Everyday. I Know Where I Could've ...


As we can see, the dataset contains approximately 5,000,000 tweets posted in 2019.

### Group by user and tweet-creation day, collapsing the values of each group by summing them

In [5]:
# Build one row per (user, calendar day) holding that day's SuccessScore:
#
#   AcceptanceScore = retweet_count + reply_count + favorite_count   (daily sums)
#   DiffusionScore  = num_hashtags + num_urls + num_mentions         (daily sums)
#   SuccessScore    = AcceptanceScore / (DiffusionScore + 0.1)
acceptance_cols = ['retweet_count', 'reply_count', 'favorite_count']
diffusion_cols = ['num_hashtags', 'num_urls', 'num_mentions']

# Group once and reuse the same groupby for the sums and for the per-day counts,
# instead of grouping twice and stitching the results together by index alignment.
tweet_day = merged_df['tweet_created'].dt.to_period('D')
daily_groups = merged_df.groupby(['user_id', tweet_day])

# Select the columns *before* summing. The first positional argument of
# GroupBy.sum() is `numeric_only`, not a column list, so passing the list there
# selected nothing and summed every numeric column (tweet_id, statuses_count, ...).
daily_sums = daily_groups[['bot'] + acceptance_cols + diffusion_cols].sum().reset_index()

acceptance_score = daily_sums[acceptance_cols].sum(axis=1)
diffusion_score = daily_sums[diffusion_cols].sum(axis=1)

temp_df = pd.DataFrame({
    'user_id': daily_sums['user_id'],
    # `bot` was summed over the day's tweets; collapse any positive total back to 1.
    # Done as a plain assignment: an in-place mask() on a selected column is a
    # chained update that is not guaranteed to reach the parent frame.
    'bot': daily_sums['bot'].mask(daily_sums['bot'] > 0, 1),
    # The 0.1 offset keeps the ratio finite on days where DiffusionScore is 0.
    'SuccessScore': acceptance_score / (diffusion_score + 0.1),
    'tweet_created': daily_sums['tweet_created'],
})

# Number of tweets per (user, day); columns: user_id, tweet_created, size.
# Kept under its original name in case cells outside this section read it.
temp_tmp = daily_groups.size().reset_index(name='size')

final_df = temp_df
final_df

Unnamed: 0,user_id,bot,SuccessScore,tweet_created
0,722623,0,1.549296,2019-01-03
1,722623,0,1626.393443,2019-01-04
2,722623,0,1.818182,2019-01-05
3,722623,0,0.322581,2019-01-06
4,722623,0,11.526718,2019-01-07
...,...,...,...,...
512275,2722021425,0,4.545455,2019-07-28
512276,2722021425,0,3.636364,2019-07-29
512277,2722021425,0,1.463415,2019-07-30
512278,2722021425,0,2683.636364,2019-07-31


### Add, for each user, the days of the year on which they have no tweets

In [6]:
# NOTE: disabled draft, kept for reference -- nothing in this cell executes.
# It walks every day of 2019 and, for every user, checks whether final_df
# already has a row for that (user, day) pair.
# As drafted it only prints 'present' for pairs that exist; it never adds the
# missing days. It also filters the whole of final_df once per user per day.
# NOTE(review): final_df.tweet_created holds daily periods (built with
# dt.to_period('D')) while start_date is a datetime.date, so the equality test
# below may never match -- confirm before re-enabling.

# start_date = date(2019, 1, 1)
# end_date = date(2019, 12, 31)
# delta = timedelta(days=1)
# users = merged_df.user_id.unique()

# while start_date <= end_date:

#     for user in users:
#         if not final_df[(final_df.user_id==user) & (final_df.tweet_created==start_date)].empty:
#             print('present')
#     start_date += delta
