# Step1:  extracting features from raw data

In [141]:
# generate raw combine dataset
files = ['tweet_data/tweets_#gohawks.txt', 'tweet_data/tweets_#gopatriots.txt', 'tweet_data/tweets_#nfl.txt', 'tweet_data/tweets_#patriots.txt', 'tweet_data/tweets_#sb49.txt', 'tweet_data/tweets_#superbowl.txt']

with open('tweet_data/tweets_#combine.txt', 'w') as target:
    for file in files:
        with open(file, 'r') as cur_file:
                for line in cur_file:
                    target.write(line)

In [142]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime, time
import pytz


# --------------------- preprocessing ----------------------- #
# define paths
files = ['tweet_data/tweets_#gohawks.txt', 'tweet_data/tweets_#gopatriots.txt', 'tweet_data/tweets_#nfl.txt', 'tweet_data/tweets_#patriots.txt', 'tweet_data/tweets_#sb49.txt', 'tweet_data/tweets_#superbowl.txt', 'tweet_data/tweets_#combine.txt']

# calculate statistics of each hashtag
def cal_statistics(file):
    date = []
    time = []
    tweet_count = []
    followers_count = []
    retweet_count = []
    url_count = []
    author_time = {} # name+nick : date : set(time)
    authors_count = [] 
    mentions_count = []
    rank_score = []
    hashtag_count = []
    # extract data
    with open(file, 'r') as cur_file:
        for line in cur_file:
            data = json.loads(line)
            # date and time
            timestamp = data['citation_date']
            pst_tz = pytz.timezone('US/Pacific')
            timestamp = str(datetime.datetime.fromtimestamp(int(timestamp), pst_tz))
            date_split = timestamp[0:10].split('-')
            cur_date = int(date_split[0]+date_split[1]+date_split[2])
            date.append(cur_date)
            cur_time = int(timestamp[11:13])
            time.append(cur_time)
            
            tweet_count.append(1)
            followers_count.append(data['author']['followers'])
            retweet_count.append(data['metrics']['citations']['total'])
            url_count.append(len(data['tweet']['entities']['urls']))
            
            # unique authors
            author_name = data['author']['name']+'+'+data['author']['nick']
            if author_name in author_time:
                ori_ = author_time[author_name]
                if cur_date in ori_:
                    ori_times = ori_[cur_date] # set
                    if cur_time in ori_times:
                        authors_count.append(0)
                    else:
                        authors_count.append(1)
                        ori_times.add(cur_time)
                else:
                    authors_count.append(1)
                    new_times = set()
                    new_times.add(cur_time)
                    ori_[cur_date] = new_times
            else:
                authors_count.append(1)
                new_times = set()
                new_times.add(cur_time)
                new_dates = {}
                new_dates[cur_date] = new_times
                author_time[author_name] = new_dates
                
            mentions_count.append(len(data['tweet']['entities']['user_mentions']))
            rank_score.append(data['metrics']['ranking_score'])
            hashtag_count.append(data['title'].count('#'))
        df = pd.DataFrame({
            'tweet' : tweet_count,
            'date' : date,
            'time' : time,
            'followers' : followers_count,
            'retweets' : retweet_count,
            'urls' : url_count,
            'authors' : authors_count,
            'mentions' : mentions_count,
            'ranking score' : rank_score,
            'hashtags' : hashtag_count
        }, columns = ['tweet', 'date', 'time', 'followers', 'retweets', 'urls', 'authors', 'mentions', 'ranking score', 'hashtags'])
        df.to_csv('extracted_data/Q1.3_'+file[18:-4]+'.csv', index = False)

# extract data from each hashtag
for file in files:
    cal_statistics(file)


# Step2: aggregating data from step1

In [3]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime, time
import pytz
import statsmodels.api as sm
from statsmodels.regression.linear_model import RegressionResults


# define paths
files = ['extracted_data/Q1.3_#gohawks.csv', 'extracted_data/Q1.3_#gopatriots.csv', 'extracted_data/Q1.3_#nfl.csv', 'extracted_data/Q1.3_#patriots.csv', 'extracted_data/Q1.3_#sb49.csv', 'extracted_data/Q1.3_#superbowl.csv', 'extracted_data/Q1.3_#combine.csv']

def load_and_process(file):
    # process and groupby data
    data = pd.read_csv(file)
    data.columns = ['tweet', 'date', 'time', 'followers', 'retweets', 'urls', 'authors', 'mentions', 'ranking score', 'hashtags']
    df = data.groupby(['date', 'time']).agg({'tweet' : np.sum, 'retweets' : np.sum, 'followers' : np.sum, 'urls' : np.sum, 'authors' : np.sum, 'mentions' : np.sum, 'ranking score' : np.sum, 'hashtags' : np.sum})
    
    # fill up non-exists hours with all zero data
    app_rows = []
    for i in range(1,len(df.index)):  
        pre_date = df.index[i-1][0]
        pre_hour = int(df.index[i-1][1])
        cur_date = df.index[i][0]
        cur_hour = int(df.index[i][1])
        if (cur_hour < pre_hour):
            cur_hour = cur_hour + 24
        hour_diff = cur_hour - pre_hour
        while (hour_diff > 1):
            pre_hour = pre_hour + 1
            if (pre_hour > 23):
                pre_date = cur_date
                app_rows.append({'tweet':0,'date':pre_date,'time':pre_hour-24,'followers':0,'retweets':0,'urls':0,'authors':0,'mentions':0,'ranking score':0,'hashtags':0})
            else:
                app_rows.append({'tweet':0,'date':pre_date,'time':pre_hour,'followers':0,'retweets':0,'urls':0,'authors':0,'mentions':0,'ranking score':0,'hashtags':0})
            hour_diff = cur_hour - pre_hour
    for row in app_rows:
        data = data.append(row, ignore_index=True)
    
    df = data.groupby(['date', 'time']).agg({'date' : pd.Series.unique, 'time' : pd.Series.unique, 'tweet' : np.sum, 'retweets' : np.sum, 'followers' : np.sum, 'urls' : np.sum, 'authors' : np.sum, 'mentions' : np.sum, 'ranking score' : np.sum, 'hashtags' : np.sum})
    df.to_csv('extracted_data/Q1.4_'+file[20:-4]+'.csv', index=False)
    return df


  from pandas.core import datetools


# Step3: seperating data according to date and time

1. Before Feb. 1, 8:00 a.m.
2. Between Feb. 1, 8:00 a.m. and 8:00 p.m.
3. After Feb. 1, 8:00 p.m.

In [4]:
from IPython.display import display
def seperate(df):
    periods = []
    periods.append(df.query('date < 20150201 or (date == 20150201 and time < 8)'))
    periods.append(df.query('date == 20150201 and time >= 8 and time <= 20'))
    periods.append(df.query('date > 20150201 or (date == 20150201 and time > 20)'))
    return periods


# Step4: using 3 models to train and predict
For each hashtag, report the average cross-validation errors for the 3 different models.
Note that you should do the 90-10% splitting for each model within its specific time
window.
<br><br>Your evaluated error should be of the form |Npredicted - Nreal|.
<br>MAE (mean of 10 absolute errors) for each piece and each model
<br><br>\- 6 hashtags
<br>&emsp;&emsp;\- 3 time pieces 
<br>&emsp;&emsp;&emsp;&emsp;\- **3 models**
<br>&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;\- **10 folds**
<br>&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;\- **average cross-validation error**

In [7]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn import linear_model


def regression_analysis(file, periods):
    # input: dataframes of 3 time pieces of a hashtag and the file name
    titles = ['Before', 'Between', 'After']
    res = {}
    res[str(file[20:-4])] = {}
    
    for i in range(len(periods)):
        print('================' + str(file[20:-4]) + ' ' + titles[i] + '================')
        res[str(file[20:-4])][titles[i]] = {}
        period = periods[i]
        input_arr = []
        for index in period.index:
            input_arr.append(period.loc[index, 'tweet':'hashtags'].values)
        input_arr.pop()
        output_arr = period.loc[period.index[1]:, 'tweet'].values
        errors = three_models_ten_folds_errors(input_arr, output_arr)
        for key in errors:
            print(key + ' average error: ' + str(errors[key]))
            res[str(file[20:-4])][titles[i]][key] = errors[key]
    return res


def three_models_ten_folds_errors(input_arr, output_arr):
    ave_error = {}
    ave_error['LR'] = 0
    ave_error['SVM'] = 0
    ave_error['NN'] = 0
    for model in ave_error:
        MAE = []
        kf = KFold(n_splits=10, shuffle=False)
        for train_index, test_index in kf.split(input_arr):
            train_in = [input_arr[i] for i in train_index]
            test_in = [input_arr[i] for i in test_index]
            train_out = [output_arr[i] for i in train_index]
            test_out = [output_arr[i] for i in test_index]
            test_pre = fit_predict(model, train_in, train_out, test_in)
            MAE.append(mean_absolute_error(test_out, test_pre))
        ave_error[model] = np.mean(MAE)
    return ave_error


def fit_predict(model, train_in, train_out, test_in):
    if model == 'LR':
        tr_in = []
        for i in range(len(train_in)):
            tr_in.append(train_in[i][:])
            np.append(tr_in[len(tr_in) - 1], 1)
        te_in = []
        for i in range(len(test_in)):
            te_in.append(test_in[i][:])
            np.append(te_in[len(te_in) - 1], 1)
        reg = sm.OLS(train_out, tr_in)
        results = reg.fit()
        return results.predict(te_in)
    elif model == 'SVM':
        reg = svm.SVC(gamma=6)
        reg.fit(train_in, train_out)
        return reg.predict(test_in)
    elif model == 'NN':
        reg = MLPRegressor(hidden_layer_sizes=(10, ), activation='relu')
        reg.fit(train_in, train_out)
        return reg.predict(test_in)

for file in files:
    df = load_and_process(file)
    periods = seperate(df)
    res = regression_analysis(file, periods)
    display(res)
    res_ = res[file[20:-4]]
    titles = ['Before', 'Between', 'After']
    res_LR = []
    res_NN = []
    res_SVM = []
    for i in range(3):
        cur_res = res_[titles[i]]
        res_LR.append(cur_res['LR'])
        res_NN.append(cur_res['NN'])
        res_SVM.append(cur_res['SVM'])
        
    df = pd.DataFrame({
        file[20:-4] : titles,
        'Linear Regression' : res_LR,
        'Neural Network' : res_NN,
        'SVM' : res_SVM
    }, columns = [file[20:-4], 'Linear Regression', 'Neural Network', 'SVM'])
    display(df)





LR average error: 375.53193355
SVM average error: 255.007610994
NN average error: 8314.90389076
LR average error: 5027.97300083
SVM average error: 6778.95
NN average error: 180303.544592
LR average error: 25.5056065135
SVM average error: 32.0916666667
NN average error: 357.586269571


{'#gohawks': {'After': {'LR': 25.505606513540648,
   'NN': 357.58626957098897,
   'SVM': 32.091666666666661},
  'Before': {'LR': 375.53193354986973,
   'NN': 8314.903890763384,
   'SVM': 255.00761099365755},
  'Between': {'LR': 5027.9730008306415,
   'NN': 180303.54459220267,
   'SVM': 6778.9499999999998}}}

Unnamed: 0,#gohawks,Linear Regression,Neural Network,SVM
0,Before,375.531934,8314.903891,255.007611
1,Between,5027.973001,180303.544592,6778.95
2,After,25.505607,357.58627,32.091667


LR average error: 17.744468454
SVM average error: 14.4570824524
NN average error: 182.638252803
LR average error: 569.792956022
SVM average error: 2007.05
NN average error: 73859.1943002
LR average error: 2.74936825056
SVM average error: 4.73269230769
NN average error: 28.2679534253


{'#gopatriots': {'After': {'LR': 2.7493682505608059,
   'NN': 28.267953425301243,
   'SVM': 4.7326923076923091},
  'Before': {'LR': 17.74446845398672,
   'NN': 182.63825280271351,
   'SVM': 14.45708245243129},
  'Between': {'LR': 569.79295602207526,
   'NN': 73859.194300229399,
   'SVM': 2007.05}}}

Unnamed: 0,#gopatriots,Linear Regression,Neural Network,SVM
0,Before,17.744468,182.638253,14.457082
1,Between,569.792956,73859.1943,2007.05
2,After,2.749368,28.267953,4.732692


LR average error: 119.813957842
SVM average error: 189.861680761
NN average error: 5042.33379005
LR average error: 5224.18796855
SVM average error: 6023.1
NN average error: 308582.744264
LR average error: 108.621711387
SVM average error: 592.968681319
NN average error: 87831.2782756


{'#nfl': {'After': {'LR': 108.62171138652306,
   'NN': 87831.278275555203,
   'SVM': 592.96868131868121},
  'Before': {'LR': 119.81395784162035,
   'NN': 5042.3337900474535,
   'SVM': 189.86168076109939},
  'Between': {'LR': 5224.187968552249,
   'NN': 308582.74426441302,
   'SVM': 6023.1000000000004}}}

Unnamed: 0,#nfl,Linear Regression,Neural Network,SVM
0,Before,119.813958,5042.33379,189.861681
1,Between,5224.187969,308582.744264,6023.1
2,After,108.621711,87831.278276,592.968681


LR average error: 252.338981366
SVM average error: 291.941173362
NN average error: 13027.0943522
LR average error: 92370.6873622
SVM average error: 27055.25
NN average error: 198878.623067
LR average error: 65.207327554
SVM average error: 149.131868132
NN average error: 6103.75925147


{'#patriots': {'After': {'LR': 65.207327554049215,
   'NN': 6103.7592514723819,
   'SVM': 149.13186813186812},
  'Before': {'LR': 252.33898136556914,
   'NN': 13027.094352197206,
   'SVM': 291.9411733615222},
  'Between': {'LR': 92370.687362200508,
   'NN': 198878.62306661086,
   'SVM': 27055.25}}}

Unnamed: 0,#patriots,Linear Regression,Neural Network,SVM
0,Before,252.338981,13027.094352,291.941173
1,Between,92370.687362,198878.623067,27055.25
2,After,65.207328,6103.759251,149.131868


LR average error: 46.7453583267
SVM average error: 106.868181818
NN average error: 4718.9917164
LR average error: 93722.963115
SVM average error: 50906.6
NN average error: 3788424.72778
LR average error: 99.3238449435
SVM average error: 323.721978022
NN average error: 461428.016389


{'#sb49': {'After': {'LR': 99.323844943525302,
   'NN': 461428.01638916787,
   'SVM': 323.72197802197809},
  'Before': {'LR': 46.745358326665198,
   'NN': 4718.9917164028393,
   'SVM': 106.86818181818182},
  'Between': {'LR': 93722.963115048158,
   'NN': 3788424.7277802574,
   'SVM': 50906.599999999999}}}

Unnamed: 0,#sb49,Linear Regression,Neural Network,SVM
0,Before,46.745358,4718.992,106.868182
1,Between,93722.963115,3788425.0,50906.6
2,After,99.323845,461428.0,323.721978


LR average error: 362.654304676
SVM average error: 451.030761099
NN average error: 69978.7658238
LR average error: 264251.105849
SVM average error: 183828.15
NN average error: 3357777.9188
LR average error: 167.808673838
SVM average error: 857.338461538
NN average error: 912270.030481


{'#superbowl': {'After': {'LR': 167.80867383798019,
   'NN': 912270.03048084606,
   'SVM': 857.33846153846173},
  'Before': {'LR': 362.65430467583508,
   'NN': 69978.765823806549,
   'SVM': 451.03076109936575},
  'Between': {'LR': 264251.10584921815,
   'NN': 3357777.9187985784,
   'SVM': 183828.14999999999}}}

Unnamed: 0,#superbowl,Linear Regression,Neural Network,SVM
0,Before,362.654305,69978.77,451.030761
1,Between,264251.105849,3357778.0,183828.15
2,After,167.808674,912270.0,857.338462


LR average error: 732.172940919
SVM average error: 1460.00761099
NN average error: 83068.193724
LR average error: 160531.399686
SVM average error: 201545.0
NN average error: 7754142.37802
LR average error: 436.631025448
SVM average error: 8042.51593407
NN average error: 1421776.71545


{'#combine': {'After': {'LR': 436.63102544760557,
   'NN': 1421776.7154475492,
   'SVM': 8042.5159340659329},
  'Before': {'LR': 732.17294091872293,
   'NN': 83068.193724029916,
   'SVM': 1460.0076109936574},
  'Between': {'LR': 160531.39968610316,
   'NN': 7754142.3780219201,
   'SVM': 201545.0}}}

Unnamed: 0,#combine,Linear Regression,Neural Network,SVM
0,Before,732.172941,83068.19,1460.007611
1,Between,160531.399686,7754142.0,201545.0
2,After,436.631025,1421777.0,8042.515934
