In [148]:
import pandas as pd
import matplotlib.pyplot as plt
import calendar
import json
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
import collections
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import itertools
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge,Lasso,ElasticNet
from sklearn.model_selection import GridSearchCV
import datetime, time
import pytz
import statsmodels.api as stats_api


# 1. Popularity Prediction

#### Initialize variables

In [48]:
hashtags = ['#gopatriots']#,'#nfl','#sb49','#gohawks','#patriots','#superbowl'] using only one tag
df_map = {} # to store dfs for each file

#### Reading files into dataframes and storing

In [55]:
df_columns = ['timestamp','tweet_id','author_id','followers','retweets']
for hashtag in hashtags:
    df = pd.DataFrame(columns=df_columns)
    lno = 0
    print("Processing ",hashtag)
    with open("tweet_data/tweets_"+hashtag+".txt", "r") as file_obj:
        for line in file_obj:
            if(lno%5000 == 0):
                print(lno)
            j = json.loads(line)
            #must need columns
            timestamp =j['citation_date']
            followers =j['author']['followers']
            retweets = j['metrics']['citations']['total']
            #useful for later
            author_id = j['author']['url']
            tweet_id = j['tweet']['id']
            df.loc[lno] = [timestamp,author_id, tweet_id,followers,retweets]
            ##
            #print(df.head())
            lno+=1
        df_map[hashtag] = df.copy()     
        

Processing  #gopatriots
0
5000
10000
15000
20000
25000


## 1.1 Statistics

In [102]:
tweets_hrly = {}
retweets_hrly = {}
followers_hrly = {}
tweets_cnt = {}
retweets_cnt = {}
followers_cnt = {}

for hashtag in hashtags:
    df = df_map[hashtag]
    sec = df['timestamp'].max() - df['timestamp'].min()   
    tweets_cnt[hashtag] = df['tweet_id'].count()
    retweets_cnt[hashtag] = df['retweets'].sum()
    followers_cnt[hashtag] = df['followers'].sum()
    tweets_hrly[hashtag] = (tweets_cnt[hashtag]*3600)/sec
    retweets_hrly[hashtag] = (retweets_cnt[hashtag]*3600)/sec
    followers_hrly[hashtag] = (followers_cnt[hashtag]*3600)/sec    


In [103]:
print('Tweets Count : ',tweets_cnt)
print('Tweets Avg. hourly',tweets_hrly)
print('Retweets Count : ',retweets_cnt)
print('Retweets Avg. hourly',retweets_hrly)

print('Followers Count : ',followers_cnt)
print('Followers Avg. hourly',followers_cnt)

Tweets Count :  {'#gopatriots': 26232}
Tweets Avg. hourly {'#gopatriots': 45.694510573562027}
Retweets Count :  {'#gopatriots': 36727}
Retweets Avg. hourly {'#gopatriots': 63.976147065996216}
Followers Count :  {'#gopatriots': 36774523.0}
Followers Avg. hourly {'#gopatriots': 36774523.0}


## 1.2 Linear Regression

#### K-fold cross validation from our previous project

In [143]:
def perform_10fold(X,y,regressor, print_ = False,shuffle_= True):
    kf = KFold(n_splits=10,shuffle=shuffle_, random_state=0)
    i = 1
    bestModel = None
    tr_e = 0
    ts_e = 0
    min_ts_e = 10
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regressor.fit(X_train,y_train)
        train_preds = regressor.predict(X_train)
        test_preds = regressor.predict(X_test)
        test_error = mean_squared_error(y_test,test_preds)
        tr_e += mean_squared_error(y_train,train_preds)
        ts_e += test_error
        if(print_):
            print("Fold : ",i)
            print("Training RMSE : ",np.sqrt(mean_squared_error(y_train,train_preds)))
            print("Test RMSE : ",np.sqrt(test_error))
        if(test_error<min_ts_e):
            min_ts_e = test_error
            bestModel = regressor
        i = i+1
    return np.sqrt(tr_e/10),np.sqrt(ts_e/10), bestModel

#### Returns the hour number from timestamp

In [122]:
pst_tz = pytz.timezone('US/Mountain')
def getHourofDay(timestamp):
    print(timestamp)
    return datetime.datetime.fromtimestamp(timestamp, pst_tz).hour

    
def getHour(timestamp):
    return int(timestamp/3600)

In [140]:
X_map = {}
Y_map = {}
for hashtag in hashtags:
    df = df_map[hashtag].copy()
    df['timestamp'] = df['timestamp'].apply(lambda x:getHour(x) )
    grouped = df.groupby('timestamp').agg({'followers': ['sum', 'max'],'tweet_id':'count', 'retweets':'sum'})
    grouped.columns = ["_".join(x) for x in grouped.columns.ravel()]
    #grouped.columns=grouped.columns.droplevel()
    grouped.reset_index(inplace=True)
    grouped['hour_of_day','tweet'] = grouped.apply(lambda x: getHourofDay(x['timestamp']*3600), axis=1)
    X_map[hashtag] = grouped.drop(['timestamp','tweet_id_count'],axis=1)
    Y_map[hashtag] = grouped['tweet_id_count']
    #print(grouped)
    

In [155]:
for hashtag in hashtags:
    model = stats_api.OLS(Y_map['#gopatriots'],X_map['#gopatriots']).fit()
    print(model.summary())
    

                            OLS Regression Results                            
Dep. Variable:         tweet_id_count   R-squared:                       0.990
Model:                            OLS   Adj. R-squared:                  0.990
Method:                 Least Squares   F-statistic:                 1.129e+04
Date:                Tue, 06 Mar 2018   Prob (F-statistic):               0.00
Time:                        17:16:38   Log-Likelihood:                -2216.2
No. Observations:                 447   AIC:                             4440.
Df Residuals:                     443   BIC:                             4457.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
followers_sum           