In [43]:
import numpy as np
import pandas as pd
import itertools
import json
import pickle
import string
import csv
import datetime
import pytz


from statsmodels.regression.linear_model import RegressionResults
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import warnings
import nltk

In [52]:
def lr_train(df):
    """Fit a linear regression that predicts the next row's tweet count.

    The target for row ``i`` is ``df['tweets']`` of row ``i + 1``. The last
    row has no successor, so its target is its own tweet count.

    Side effect: a ``future_tweets`` column holding that target is added to
    (or overwritten on) ``df`` in place; ``print_prediction`` drops this
    column before predicting. Every other column of ``df`` is used as a
    feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweets`` column and at least one row.

    Returns
    -------
    LinearRegression
        The fitted estimator.
    """
    # Build the target on a standalone Series and write the final slot by
    # position. The previous `df.future_tweets[-1] = ...` form was chained
    # assignment with an integer key on a label-indexed Series, which does
    # not update `df` under pandas Copy-on-Write.
    future = df['tweets'].shift(-1)
    future.iloc[-1] = df['tweets'].iloc[-1]
    df['future_tweets'] = future

    target = future.to_numpy()
    features = df.drop(columns=['future_tweets'])

    regr = LinearRegression()
    regr.fit(features.to_numpy(), target)
    return regr

def mlp_train(df, random_state=None):
    """Fit an ``MLPRegressor`` that predicts the next row's tweet count.

    The target for row ``i`` is ``df['tweets']`` of row ``i + 1``. The last
    row has no successor, so its target is its own tweet count.

    Side effect: a ``future_tweets`` column holding that target is added to
    (or overwritten on) ``df`` in place; ``print_prediction`` drops this
    column before predicting. Every other column of ``df`` is used as a
    feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweets`` column and at least one row.
    random_state : int or None, optional
        Forwarded to ``MLPRegressor``. The default ``None`` keeps the
        previous unseeded behaviour; pass an integer for repeatable fits.

    Returns
    -------
    MLPRegressor
        The fitted estimator.
    """
    # Build the target on a standalone Series and write the final slot by
    # position. The previous `df.future_tweets[-1] = ...` form was chained
    # assignment with an integer key on a label-indexed Series, which does
    # not update `df` under pandas Copy-on-Write.
    future = df['tweets'].shift(-1)
    future.iloc[-1] = df['tweets'].iloc[-1]
    df['future_tweets'] = future

    target = future.to_numpy()
    features = df.drop(columns=['future_tweets'])

    # NOTE(review): features are passed unscaled; StandardScaler is imported
    # by the notebook but not applied here - confirm whether that is intended.
    mlp = MLPRegressor(random_state=random_state)
    mlp.fit(features.to_numpy(), target)
    return mlp

def rf_train(df, random_state=None):
    """Fit a ``RandomForestRegressor`` that predicts the next row's tweet count.

    The target for row ``i`` is ``df['tweets']`` of row ``i + 1``. The last
    row has no successor, so its target is its own tweet count.

    Side effect: a ``future_tweets`` column holding that target is added to
    (or overwritten on) ``df`` in place; ``print_prediction`` drops this
    column before predicting. Every other column of ``df`` is used as a
    feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweets`` column and at least one row.
    random_state : int or None, optional
        Forwarded to ``RandomForestRegressor``. The default ``None`` keeps
        the previous unseeded behaviour; pass an integer for repeatable fits.

    Returns
    -------
    RandomForestRegressor
        The fitted estimator.
    """
    # Build the target on a standalone Series and write the final slot by
    # position. The previous `df.future_tweets[-1] = ...` form was chained
    # assignment with an integer key on a label-indexed Series, which does
    # not update `df` under pandas Copy-on-Write.
    future = df['tweets'].shift(-1)
    future.iloc[-1] = df['tweets'].iloc[-1]
    df['future_tweets'] = future

    target = future.to_numpy()
    features = df.drop(columns=['future_tweets'])

    rf = RandomForestRegressor(random_state=random_state)
    rf.fit(features.to_numpy(), target)
    return rf

def cv_score(df, model, cv=2):
    """Return the mean cross-validation score of ``model`` on ``df``.

    Features and target are built the same way as in the training helpers:
    the target for row ``i`` is ``df['tweets']`` of row ``i + 1`` (the last
    row reuses its own tweet count) and every other column is a feature.
    Scoring uses ``cross_val_score`` with the estimator's default scorer.

    Side effect: a ``future_tweets`` column holding the target is added to
    (or overwritten on) ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweets`` column and at least one row.
    model : estimator
        Passed straight to ``cross_val_score``.
    cv : int, optional
        Cross-validation setting forwarded to ``cross_val_score``.
        Defaults to 2, the value that used to be hard-coded.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    # Build the target on a standalone Series and write the final slot by
    # position. The previous `df.future_tweets[-1] = ...` form was chained
    # assignment with an integer key on a label-indexed Series, which does
    # not update `df` under pandas Copy-on-Write.
    future = df['tweets'].shift(-1)
    future.iloc[-1] = df['tweets'].iloc[-1]
    df['future_tweets'] = future

    target = future.to_numpy()
    features = df.drop(columns=['future_tweets'])

    scores = cross_val_score(model, features.to_numpy(), target, cv=cv)
    # Averaging over however many folds came back replaces the old `/ 2`.
    return np.mean(scores)

In [45]:
# Raw input files: one JSON document per line.
filenames = ['ECE219_tweet_test/sample0_period1.txt',
            'ECE219_tweet_test/sample0_period2.txt',
            'ECE219_tweet_test/sample0_period3.txt',
            'ECE219_tweet_test/sample1_period1.txt',
            'ECE219_tweet_test/sample1_period2.txt',
            'ECE219_tweet_test/sample1_period3.txt',
            'ECE219_tweet_test/sample2_period1.txt',
            'ECE219_tweet_test/sample2_period2.txt',
            'ECE219_tweet_test/sample2_period3.txt'
            ]

# CSV written for each input file, in the same order as `filenames`.
output_filenames = ['sample0_pre_active.txt','sample0_active.txt','sample0_post_active.txt',
                   'sample1_pre_active.txt','sample1_active.txt','sample1_post_active.txt',
                   'sample2_pre_active.txt','sample2_active.txt','sample2_post_active.txt']

headers = ['citation_date_raw','author_nick_names','tweets','retweets','followers','followers_max']

# Flatten every JSON record into one CSV row. `tweets` is a constant 1 per
# record and the author's follower count is written to both `followers` and
# `followers_max`.
for source_path, output_path in zip(filenames, output_filenames):
    with open(source_path, 'r', encoding='utf-8') as reader:
        # newline='' is what the csv module asks for on files it writes, so
        # the '\n' line terminator is not translated by the platform.
        with open(output_path, 'w', encoding='utf-8', newline='') as writer:
            csv_writer = csv.writer(writer, lineterminator='\n')
            csv_writer.writerow(headers)
            for line in reader:
                if not line.strip():
                    # A blank line would make json.loads raise.
                    continue
                data = json.loads(line)
                author = data['author']
                response = [data['citation_date'], author['nick'], 1,
                            data['metrics']['citations']['total'],
                            author['followers'], author['followers']]
                csv_writer.writerow(response)

In [46]:
def _load_period(csv_path):
    """Read one comma-separated file into a DataFrame."""
    return pd.read_csv(csv_path, delimiter=',')


# Sample 0
preactive0 = _load_period('sample0_pre_active.txt')
active0 = _load_period('sample0_active.txt')
postactive0 = _load_period('sample0_post_active.txt')

# Sample 1
preactive1 = _load_period('sample1_pre_active.txt')
active1 = _load_period('sample1_active.txt')
postactive1 = _load_period('sample1_post_active.txt')

# Sample 2
preactive2 = _load_period('sample2_pre_active.txt')
active2 = _load_period('sample2_active.txt')
postactive2 = _load_period('sample2_post_active.txt')

In [47]:
def parse_date_pst(df):
    """Add Pacific-time date/time columns derived from ``citation_date_raw``.

    ``citation_date_raw`` is parsed as seconds since the Unix epoch
    (``unit='s'``). The frame is modified in place and nothing is returned.

    Columns added:
        citation_date      timezone-naive timestamp as parsed
        citation_date_pst  the same instant, localized as UTC and converted
                           to ``America/Los_Angeles``
        date               ``%Y%m%d`` string of ``citation_date_pst``
        hour, minute       hour and minute of ``citation_date_pst``
    """
    # Parse once and convert the whole column through the .dt accessor; the
    # earlier version parsed twice and converted row by row with .apply.
    parsed = pd.to_datetime(df['citation_date_raw'], unit='s')
    pacific = parsed.dt.tz_localize('UTC').dt.tz_convert('America/Los_Angeles')

    df['citation_date'] = parsed
    df['citation_date_pst'] = pacific
    df['date'] = pacific.dt.strftime('%Y%m%d')
    df['hour'] = pacific.dt.hour
    df['minute'] = pacific.dt.minute
    
# parse_date_pst adds its columns to each frame in place, so the return
# value is not captured. Frames are visited sample by sample.
for tweet_frame in (
    preactive0, active0, postactive0,
    preactive1, active1, postactive1,
    preactive2, active2, postactive2,
):
    parse_date_pst(tweet_frame)

In [48]:
# Per-window reduction applied to every frame.
_WINDOW_AGG = {'hour':'max','tweets':'sum','retweets':'sum','followers':'sum','followers_max':'max'}


def _aggregate_windows(frame, freq):
    """Bin ``frame`` on ``citation_date_pst`` at ``freq`` and reduce each bin."""
    window = pd.Grouper(key='citation_date_pst', freq=freq)
    return frame.groupby(window).agg(_WINDOW_AGG)


# Pre- and post-active frames use 60-minute bins; active frames use 5-minute bins.
preactive0_agg = _aggregate_windows(preactive0, '60Min')
active0_agg = _aggregate_windows(active0, '5Min')
postactive0_agg = _aggregate_windows(postactive0, '60Min')
preactive1_agg = _aggregate_windows(preactive1, '60Min')
active1_agg = _aggregate_windows(active1, '5Min')
postactive1_agg = _aggregate_windows(postactive1, '60Min')
preactive2_agg = _aggregate_windows(preactive2, '60Min')
active2_agg = _aggregate_windows(active2, '5Min')
postactive2_agg = _aggregate_windows(postactive2, '60Min')

In [49]:
# Cumulative training sets: *0 uses sample 0, *1 adds sample 1, *2 adds
# sample 2. DataFrame.dropna() returns a new frame, so its result has to be
# kept; the bare `frame.dropna()` statements used before discarded it and
# left any NaN rows in place. Going through dropna() for the *0 frames also
# makes them copies, so the training helpers (which add a column in place)
# no longer modify the *_agg frames.
preactive0_comb_agg = preactive0_agg.dropna()
preactive1_comb_agg = pd.concat([preactive0_agg, preactive1_agg]).dropna()
preactive2_comb_agg = pd.concat([preactive0_agg, preactive1_agg, preactive2_agg]).dropna()

active0_comb_agg = active0_agg.dropna()
active1_comb_agg = pd.concat([active0_agg, active1_agg]).dropna()
active2_comb_agg = pd.concat([active0_agg, active1_agg, active2_agg]).dropna()

postactive0_comb_agg = postactive0_agg.dropna()
postactive1_comb_agg = pd.concat([postactive0_agg, postactive1_agg]).dropna()
postactive2_comb_agg = pd.concat([postactive0_agg, postactive1_agg, postactive2_agg]).dropna()

# Last expression of the cell: display the largest post-active set.
postactive2_comb_agg

Unnamed: 0_level_0,hour,tweets,retweets,followers,followers_max
citation_date_pst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-02-04 00:00:00-08:00,0,59,74,3564378.0,3329958.0
2015-02-04 01:00:00-08:00,1,48,103,146905.0,59219.0
2015-02-04 02:00:00-08:00,2,94,99,99613.0,16753.0
2015-02-04 03:00:00-08:00,3,45,61,119754.0,27669.0
2015-02-04 04:00:00-08:00,4,77,141,505238.0,257549.0
2015-02-04 05:00:00-08:00,5,87,131,164136.0,27199.0
2015-02-05 20:00:00-08:00,20,58,1285,10709633.0,9677129.0
2015-02-05 21:00:00-08:00,21,87,98,1846822.0,1458675.0
2015-02-05 22:00:00-08:00,22,43,43,48886.0,5022.0
2015-02-05 23:00:00-08:00,23,27,30,73474.0,32613.0


In [50]:
warnings.filterwarnings("ignore")


def _fit_lr_and_report(frame):
    """Fit a linear regression on ``frame``, print its CV score, return the model."""
    fitted = lr_train(frame)
    print(cv_score(frame, fitted))
    return fitted


# One model per period and cumulative sample set; scores print in this order.
preactive0_lr = _fit_lr_and_report(preactive0_comb_agg)
preactive1_lr = _fit_lr_and_report(preactive1_comb_agg)
preactive2_lr = _fit_lr_and_report(preactive2_comb_agg)

active0_lr = _fit_lr_and_report(active0_comb_agg)
active1_lr = _fit_lr_and_report(active1_comb_agg)
active2_lr = _fit_lr_and_report(active2_comb_agg)

postactive0_lr = _fit_lr_and_report(postactive0_comb_agg)
postactive1_lr = _fit_lr_and_report(postactive1_comb_agg)
postactive2_lr = _fit_lr_and_report(postactive2_comb_agg)

-3964.4305665692905
-789.5342950093461
-27.204891143622277
-77.56607905811335
-335.9576275574258
-119.76309265937948
-55260.466320882675
-34.112228923066155
-1.3901796354207405


In [53]:
warnings.filterwarnings("ignore")

# Training frames keyed by the prefix of the variable that receives the model.
mlp_training_sets = [
    ('preactive0', preactive0_comb_agg),
    ('preactive1', preactive1_comb_agg),
    ('preactive2', preactive2_comb_agg),
    ('active0', active0_comb_agg),
    ('active1', active1_comb_agg),
    ('active2', active2_comb_agg),
    ('postactive0', postactive0_comb_agg),
    ('postactive1', postactive1_comb_agg),
    ('postactive2', postactive2_comb_agg),
]

# Fit, then immediately score, each network so the train/score sequence is
# the same as when the calls were written out one by one.
mlp_by_label = {}
for set_label, training_frame in mlp_training_sets:
    mlp_by_label[set_label] = mlp_train(training_frame)
    print(cv_score(training_frame, mlp_by_label[set_label]))

preactive0_mlp = mlp_by_label['preactive0']
preactive1_mlp = mlp_by_label['preactive1']
preactive2_mlp = mlp_by_label['preactive2']

active0_mlp = mlp_by_label['active0']
active1_mlp = mlp_by_label['active1']
active2_mlp = mlp_by_label['active2']

postactive0_mlp = mlp_by_label['postactive0']
postactive1_mlp = mlp_by_label['postactive1']
postactive2_mlp = mlp_by_label['postactive2']

-11098357515.685926
-728848.9074233344
-6394958.677974194
-294826.2677092392
-7270613.401476458
-435.8861432219985
-4962846.376971026
-72584825.58007556
-13624441.00803786


In [54]:
warnings.filterwarnings("ignore")

# Each forest is scored right after it is fitted. The score calls used to
# reference `*_rf_model` names that are not assigned anywhere in this
# notebook, which raises NameError on a fresh kernel; the fitted estimators
# are the `*_rf` variables.
preactive0_rf = rf_train(preactive0_comb_agg)
print(cv_score(preactive0_comb_agg, preactive0_rf))

preactive1_rf = rf_train(preactive1_comb_agg)
print(cv_score(preactive1_comb_agg, preactive1_rf))

preactive2_rf = rf_train(preactive2_comb_agg)
print(cv_score(preactive2_comb_agg, preactive2_rf))

active0_rf = rf_train(active0_comb_agg)
print(cv_score(active0_comb_agg, active0_rf))

active1_rf = rf_train(active1_comb_agg)
print(cv_score(active1_comb_agg, active1_rf))

active2_rf = rf_train(active2_comb_agg)
print(cv_score(active2_comb_agg, active2_rf))

postactive0_rf = rf_train(postactive0_comb_agg)
print(cv_score(postactive0_comb_agg, postactive0_rf))

postactive1_rf = rf_train(postactive1_comb_agg)
print(cv_score(postactive1_comb_agg, postactive1_rf))

postactive2_rf = rf_train(postactive2_comb_agg)
print(cv_score(postactive2_comb_agg, postactive2_rf))

-307.87850098944585
-39.83524784406326
-11.539645599794712
-16.489164564092192
-14.031248975511156
-3.2605204895680844
-15.487583274414497
-1.2461372641353388
0.2882031061193261


In [57]:
def print_prediction(df, model):
    """Predict the tweet count that follows the last row of ``df``.

    A synthetic feature row is built for the step after the last row:

    * ``tweets``, ``retweets``, ``followers`` and ``followers_max`` are
      extrapolated linearly from the last two rows
      (``2 * last - previous``);
    * ``hour`` is the last row's hour plus one, wrapped to 0 after 23;
    * any other column keeps the last row's value.

    That single row is passed to ``model.predict`` and the prediction is
    returned. Despite the name, nothing is printed. ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame with at least two rows. A ``future_tweets`` column, if
        present, is ignored.
    model : fitted estimator
        Must expose ``predict`` taking a 2-D array whose columns are in the
        same order as ``df`` without ``future_tweets``.

    Returns
    -------
    The model's prediction for the synthetic row.

    Raises
    ------
    IndexError
        If ``df`` has fewer than two rows.
    """
    # errors='ignore' lets this work on frames that never went through a
    # training helper (those are what add 'future_tweets').
    features = df.drop(columns=['future_tweets'], errors='ignore')

    # Positional access: the frame is indexed by timestamps, so integer keys
    # such as [-1] must go through .iloc.
    last = features.iloc[-1]
    previous = features.iloc[-2]

    # Start from a copy of the last row. DataFrame.append, which the earlier
    # version used to add this row to the frame, was removed in pandas 2.0.
    next_row = last.copy()

    # Both branches of the old if/else reduced to 2 * last - previous.
    for column in ('tweets', 'retweets', 'followers', 'followers_max'):
        next_row[column] = 2 * last[column] - previous[column]

    # Wrap around midnight so the feature stays within 0-23.
    # NOTE(review): the hour advances by one regardless of the frame's bin
    # width, although the active-period frames are aggregated in 5-minute
    # bins - confirm this is the intended horizon.
    next_row['hour'] = (last['hour'] + 1) % 24

    # Only the synthetic row's prediction was ever returned, so only that
    # row is scored.
    return model.predict(next_row.to_numpy(dtype=float).reshape(1, -1))[0]

In [59]:
warnings.filterwarnings("ignore")

# Each training frame paired with the random forest fitted on it;
# predictions print in this order.
forest_inputs = (
    (preactive0_comb_agg, preactive0_rf),
    (preactive1_comb_agg, preactive1_rf),
    (preactive2_comb_agg, preactive2_rf),
    (active0_comb_agg, active0_rf),
    (active1_comb_agg, active1_rf),
    (active2_comb_agg, active2_rf),
    (postactive0_comb_agg, postactive0_rf),
    (postactive1_comb_agg, postactive1_rf),
    (postactive2_comb_agg, postactive2_rf),
)

for history_frame, forest in forest_inputs:
    print(print_prediction(history_frame, forest))


116.8
846.0
98.1
1145.4
924.2
212.1
77.0
35.7
33.339999999999996
