In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import calendar
import json
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
import collections
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import itertools
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge,Lasso,ElasticNet
from sklearn.model_selection import GridSearchCV
import datetime, time
import pytz

# 1. Popularity Prediction

#### Initialize variables

In [48]:
hashtags = ['#gopatriots']#,'#nfl','#sb49','#gohawks','#patriots','#superbowl'] using only one tag
df_map = {} # to store dfs for each file

#### Reading files into dataframes and storing

In [55]:
df_columns = ['timestamp','tweet_id','author_id','followers','retweets']
for hashtag in hashtags:
    df = pd.DataFrame(columns=df_columns)
    lno = 0
    print("Processing ",hashtag)
    with open("tweet_data/tweets_"+hashtag+".txt", "r") as file_obj:
        for line in file_obj:
            if(lno%5000 == 0):
                print(lno)
            j = json.loads(line)
            #must need columns
            timestamp =j['citation_date']
            followers =j['author']['followers']
            retweets = j['metrics']['citations']['total']
            #useful for later
            author_id = j['author']['url']
            tweet_id = j['tweet']['id']
            df.loc[lno] = [timestamp,author_id, tweet_id,followers,retweets]
            ##
            #print(df.head())
            lno+=1
        df_map[hashtag] = df.copy()     
        

Processing  #gopatriots
0
5000
10000
15000
20000
25000


## 1.1 Statistics

In [102]:
tweets_hrly = {}
retweets_hrly = {}
followers_hrly = {}
tweets_cnt = {}
retweets_cnt = {}
followers_cnt = {}

for hashtag in hashtags:
    df = df_map[hashtag]
    sec = df['timestamp'].max() - df['timestamp'].min()   
    tweets_cnt[hashtag] = df['tweet_id'].count()
    retweets_cnt[hashtag] = df['retweets'].sum()
    followers_cnt[hashtag] = df['followers'].sum()
    tweets_hrly[hashtag] = (tweets_cnt[hashtag]*3600)/sec
    retweets_hrly[hashtag] = (retweets_cnt[hashtag]*3600)/sec
    followers_hrly[hashtag] = (followers_cnt[hashtag]*3600)/sec    


In [103]:
print('Tweets Count : ',tweets_cnt)
print('Tweets Avg. hourly',tweets_hrly)
print('Retweets Count : ',retweets_cnt)
print('Retweets Avg. hourly',retweets_hrly)

print('Followers Count : ',followers_cnt)
print('Followers Avg. hourly',followers_cnt)

Tweets Count :  {'#gopatriots': 26232}
Tweets Avg. hourly {'#gopatriots': 45.694510573562027}
Retweets Count :  {'#gopatriots': 36727}
Retweets Avg. hourly {'#gopatriots': 63.976147065996216}
Followers Count :  {'#gopatriots': 36774523.0}
Followers Avg. hourly {'#gopatriots': 36774523.0}


## 1.2 Linear Regression

#### Returns the hour number from timestamp

In [65]:
pst_tz = pytz.timezone('US/Pacific') 
datetime.datetime.fromtimestamp(citation date, pst tz)
def getHour(timestamp):
    return int(timestamp/3600)

In [94]:
for hashtag in hashtags:
    df = df_map[hashtag].copy()
    df['timestamp'] = df['timestamp'].apply(lambda x:getHour(x) )
    grouped = df.groupby('timestamp').agg({'followers': ['sum', 'max', 'count'], 'retweets':'sum'})
    grouped.columns = ["_".join(x) for x in grouped.columns.ravel()]
    #grouped.columns=grouped.columns.droplevel()
    grouped.reset_index(inplace=True)
    print(grouped)
    

     timestamp  followers_sum  followers_max  followers_count  retweets_sum
0       394785        10955.0        10955.0                1             4
1       394786         1762.0         1762.0                1             3
2       394788         1168.0          600.0                2             3
3       394789          465.0          431.0                2             3
4       394790         2850.0         1425.0                2             3
5       394791         1101.0         1101.0                1             1
6       394792          712.0          712.0                1             1
7       394793         2895.0         2895.0                1             6
8       394794         4732.0         2874.0                5             5
9       394795         1547.0         1425.0                3             3
10      394796         1868.0         1230.0                3             5
11      394798          158.0          158.0                1             4
12      3947