# Step1: preprocessing test data from raw data

In [5]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime, time
import pytz


# --------------------- preprocessing test data (raw -> csv) ----------------------- #

# define paths
files_raw = ['test_data/sample1_period1.txt', 'test_data/sample2_period2.txt', 'test_data/sample3_period3.txt', 
             'test_data/sample4_period1.txt', 'test_data/sample5_period1.txt', 'test_data/sample6_period2.txt', 
             'test_data/sample7_period3.txt', 'test_data/sample8_period1.txt', 'test_data/sample9_period2.txt',
             'test_data/sample10_period3.txt']

# calculate statistics of each file
def cal_statistics(file):
    date = []
    time = []
    tweet_count = []
    followers_count = []
    retweet_count = []
    url_count = []
    author_time = {} # name+nick : date : set(time)
    authors_count = [] 
    mentions_count = []
    rank_score = []
    hashtag_count = []
    # extract data
    with open(file, 'r') as cur_file:
        for line in cur_file:
            data = json.loads(line)
            # date and time
            timestamp = data['firstpost_date']
            pst_tz = pytz.timezone('US/Pacific')
            timestamp = str(datetime.datetime.fromtimestamp(int(timestamp), pst_tz))
            date_split = timestamp[0:10].split('-')
            cur_date = int(date_split[0]+date_split[1]+date_split[2])
            date.append(cur_date)
            cur_time = int(timestamp[11:13])
            time.append(cur_time)
            
            tweet_count.append(1)
            followers_count.append(data['author']['followers'])
            retweet_count.append(data['metrics']['citations']['total'])
            url_count.append(len(data['tweet']['entities']['urls']))
            
            # unique authors
            author_name = data['author']['name']+'+'+data['author']['nick']
            if author_name in author_time:
                ori_ = author_time[author_name]
                if cur_date in ori_:
                    ori_times = ori_[cur_date] # set
                    if cur_time in ori_times:
                        authors_count.append(0)
                    else:
                        authors_count.append(1)
                        ori_times.add(cur_time)
                else:
                    authors_count.append(1)
                    new_times = set()
                    new_times.add(cur_time)
                    ori_[cur_date] = new_times
            else:
                authors_count.append(1)
                new_times = set()
                new_times.add(cur_time)
                new_dates = {}
                new_dates[cur_date] = new_times
                author_time[author_name] = new_dates
                
            mentions_count.append(len(data['tweet']['entities']['user_mentions']))
            rank_score.append(data['metrics']['ranking_score'])
            hashtag_count.append(data['title'].count('#'))
        df = pd.DataFrame({
            'tweet' : tweet_count,
            'date' : date,
            'time' : time,
            'followers' : followers_count,
            'retweets' : retweet_count,
            'urls' : url_count,
            'authors' : authors_count,
            'mentions' : mentions_count,
            'ranking score' : rank_score,
            'hashtags' : hashtag_count
        }, columns = ['tweet', 'date', 'time', 'followers', 'retweets', 'urls', 'authors', 'mentions', 'ranking score', 'hashtags'])
        df.to_csv('extracted_data/Q1.5_'+file[10:-4]+'.csv', index = False)

# extract data from each file
for file in files_raw:
    cal_statistics(file)
print ('Raw test data has been done!')


# --------------------- preprocessing test data (csv -> hourly grouped csv) ----------------------- #

# define paths
files_hour = ['extracted_data/Q1.5_sample1_period1.csv', 'extracted_data/Q1.5_sample2_period2.csv', 
              'extracted_data/Q1.5_sample3_period3.csv', 'extracted_data/Q1.5_sample4_period1.csv', 
              'extracted_data/Q1.5_sample5_period1.csv', 'extracted_data/Q1.5_sample6_period2.csv', 
              'extracted_data/Q1.5_sample7_period3.csv', 'extracted_data/Q1.5_sample8_period1.csv', 
              'extracted_data/Q1.5_sample9_period2.csv', 'extracted_data/Q1.5_sample10_period3.csv']

# load and process data from each test file
def load_and_process(file):
    # process and groupby data
    data = pd.read_csv(file)
    data.columns = ['tweet', 'date', 'time', 'followers', 'retweets', 'urls', 'authors', 'mentions', 'ranking score', 'hashtags']
    df = data.groupby(['date', 'time']).agg({'time' : np.max, 'tweet' : np.sum, 'retweets' : np.sum, 'followers' : np.sum, 'urls' : np.sum, 'authors' : np.sum, 'mentions' : np.sum, 'ranking score' : np.sum})
    df.to_csv('extracted_data/Q1.5_hourly_'+file[20:-4]+'.csv', index=False)
    display(df)
    return df

# linear regression model on each file
for file in files_hour:
    load_and_process(file)
print ('Each test data file has been grouped hourly!')


Raw test data has been done!


Unnamed: 0_level_0,Unnamed: 1_level_0,time,tweet,retweets,followers,urls,authors,mentions,ranking score
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20150129,10,10,137,283,1717045.0,72,131,75,620.221406
20150129,11,11,82,165,373976.0,43,80,43,367.463808
20150129,12,12,68,153,17131333.0,38,64,33,295.074807
20150129,13,13,94,140,357749.0,53,88,79,404.238342
20150129,14,14,171,224,363126.0,129,91,109,680.48406
20150129,15,15,178,259,419740.0,145,68,173,726.513034


Unnamed: 0_level_0,Unnamed: 1_level_0,time,tweet,retweets,followers,urls,authors,mentions,ranking score
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20150201,11,11,7591,15316,36948893.0,1300,6911,3275,33766.673976
20150201,12,12,9361,20889,63149167.0,1502,8353,4299,41664.703101
20150201,13,13,10374,23780,56331829.0,1536,9311,4418,46302.657268
20150201,14,14,20066,40401,71509532.0,2347,18040,7288,89186.285529
20150201,15,15,81958,140370,341986255.0,4720,62242,20642,370798.525836
20150201,16,16,82923,145849,339357959.0,3667,53360,22175,375573.325872


Unnamed: 0_level_0,Unnamed: 1_level_0,time,tweet,retweets,followers,urls,authors,mentions,ranking score
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20150202,3,3,441,1560,4672767.0,195,365,222,1977.669733
20150202,4,4,550,840,6861668.0,212,482,264,2513.104107
20150202,5,5,610,1765,16062893.0,222,550,330,2814.89046
20150202,6,6,888,1884,11941082.0,368,776,488,4014.72376
20150202,7,7,616,1755,11524125.0,296,543,312,2876.7587
20150202,8,8,523,1166,5436365.0,253,468,236,2442.247139


Unnamed: 0_level_0,Unnamed: 1_level_0,time,tweet,retweets,followers,urls,authors,mentions,ranking score
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20150125,14,14,419,1918,3955589.0,179,310,178,2092.904778
20150125,15,15,257,576,1813464.0,112,186,142,1215.627649
20150125,16,16,236,480,1911266.0,150,186,88,1080.763023
20150125,17,17,266,494,702588.0,139,197,87,1223.315477
20150125,18,18,267,512,1410837.0,146,218,103,1224.770661
20150125,19,19,201,412,1348812.0,119,166,96,882.842881


Unnamed: 0_level_0,Unnamed: 1_level_0,time,tweet,retweets,followers,urls,authors,mentions,ranking score
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20150127,17,17,342,1498,1954132.0,213,246,195,1527.509883
20150127,18,18,508,1068,4423248.0,383,284,336,2120.664556
20150127,19,19,353,568,1138637.0,235,265,182,1584.879378
20150127,20,20,362,614,8295227.0,238,269,164,1583.629605
20150127,21,21,281,540,866203.0,183,204,142,1253.004368
20150127,22,22,213,271,225292.0,131,142,58,934.163891


Unnamed: 0_level_0,Unnamed: 1_level_0,time,tweet,retweets,followers,urls,authors,mentions,ranking score
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20150201,9,9,979,3210,6449961.0,390,903,308,4429.966165
20150201,10,10,12931,17965,19308199.0,10093,12434,19275,54889.388546
20150201,11,11,60619,74766,51041619.0,57365,59791,114170,250622.50181
20150201,12,12,52699,61205,39313052.0,48895,52010,97164,217610.347658
20150201,13,13,41019,48759,33190970.0,36957,40317,72846,170267.571385
20150201,14,14,37307,47341,54713761.0,31530,36271,60732,156395.658467


Unnamed: 0_level_0,Unnamed: 1_level_0,time,tweet,retweets,followers,urls,authors,mentions,ranking score
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20150202,22,22,125,204,181846.0,103,98,55,538.67139
20150202,23,23,102,134,261888.0,86,76,30,436.287546
20150203,0,0,66,72,490172.0,55,45,21,287.632988
20150203,1,1,60,67,94070.0,49,38,5,279.019312
20150203,2,2,55,64,51985.0,39,34,14,240.645006
20150203,3,3,120,711,1288840.0,87,88,31,502.49741


Unnamed: 0_level_0,Unnamed: 1_level_0,time,tweet,retweets,followers,urls,authors,mentions,ranking score
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20150128,16,16,49,65,55217.0,44,19,6,201.436952
20150128,17,17,72,83,66514.0,66,28,6,291.508929
20150128,18,18,56,59,35577.0,52,21,2,220.851097
20150128,19,19,41,43,13027.0,40,15,7,164.630682
20150128,20,20,11,11,3862.0,11,7,0,43.826498


Unnamed: 0_level_0,Unnamed: 1_level_0,time,tweet,retweets,followers,urls,authors,mentions,ranking score
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20150201,10,10,1729,2451,5660117.0,1356,993,265,7372.003764
20150201,11,11,1734,2358,7640535.0,1393,920,221,7403.367862
20150201,12,12,1619,2563,7850956.0,1221,926,285,6938.297642
20150201,13,13,1582,2277,4955196.0,1177,967,271,6771.375261
20150201,14,14,1857,4734,10871571.0,1413,1239,288,7989.608872
20150201,15,15,2790,4167,9455386.0,1690,2113,410,12195.832633


Unnamed: 0_level_0,Unnamed: 1_level_0,time,tweet,retweets,followers,urls,authors,mentions,ranking score
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20150205,12,12,62,65,121923.0,56,34,4,253.148424
20150205,13,13,54,64,62833.0,51,33,9,220.584601
20150205,14,14,68,77,157875.0,67,33,6,278.696486
20150205,15,15,62,65,54620.0,53,29,4,248.195699
20150205,16,16,58,59,55630.0,51,28,4,229.641543
20150205,17,17,61,64,58171.0,55,28,5,245.690592


Each test data file has been grouped hourly!


# Step2: fit best model on train data for each period

In [91]:
import statsmodels.api as sm
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from statsmodels.regression.linear_model import RegressionResults
from IPython.display import display
from sklearn.metrics import mean_absolute_error


# seperate aggregated train data into three period
# 1. Before Feb. 1, 8:00 a.m.
# 2. Between Feb. 1, 8:00 a.m. and 8:00 p.m.
# 3. After Feb. 1, 8:00 p.m.
def seperate(df):
    periods = []
    periods.append(df.query('date < 20150201 or (date == 20150201 and time < 8)'))
    periods.append(df.query('date == 20150201 and time >= 8 and time <= 20'))
    periods.append(df.query('date > 20150201 or (date == 20150201 and time > 20)'))
    
    return periods

# using the best model in 1.4.2 (all are LR) to train each period (35 features & 28 features for period 1)
def regression_model_for_periods(periods):
    period_model = {}
    # 1 train model for each period (35 features)
    for i in range(3): # 3 periods
        period = periods[i]
        print (len(period.index))
        input_arr = []
        index_start = 0
        for j in range(index_start, index_start+len(period.index)-5): # n-5 points
            cur_input = []
            for k in range(5): # each point has 35 features
                for p in range(2,9): # append each column
                    cur_input.append(period.iloc[j+k, p])
            input_arr.append(cur_input)
        index_start = index_start + len(period.index)

        output_arr = period.loc[period.index[5]:, 'tweet'].values
        
        results = sm.OLS(output_arr, input_arr).fit()
#         if (i == 0):
#             results = svm.SVC(gamma=6)
#             results.fit(input_arr, output_arr)
#         else:
#             results = sm.OLS(output_arr, input_arr).fit()
        period_model[str(i+1)] = results
    
    # 2 train model for period 1 with 28 features
    period1 = periods[0]
    input_arr_ = []
    for j in range(0,len(period1.index)-4): # n-4 points
        cur_input = []
        for k in range(4): # each point has 28 features
            for p in range(2,9): # append each column
                cur_input.append(period1.iloc[j+k, p])
        input_arr_.append(cur_input)
        
    output_arr_ = period1.loc[period1.index[4]:, 'tweet'].values

    results_ = sm.OLS(output_arr_, input_arr_).fit()
#     results_ = svm.SVC(gamma=6)
#     results_.fit(input_arr_, output_arr_)
    period_model['4'] = results_
    
    return period_model

# load data from hourly grouped aggregated train data
df = pd.read_csv('extracted_data/Q1.4_#combine.csv')
df.columns = ['date', 'time', 'tweet', 'retweets', 'followers', 'urls', 'authors', 'mentions', 'ranking score', 'hashtags']
periods = seperate(df)
period_model = regression_model_for_periods(periods)


440
13
134


# Step3: predict on each test file using corresponding model

In [93]:
# define paths
files = ['extracted_data/Q1.5_hourly_sample1_period1.csv', 'extracted_data/Q1.5_hourly_sample2_period2.csv', 
         'extracted_data/Q1.5_hourly_sample3_period3.csv', 'extracted_data/Q1.5_hourly_sample4_period1.csv', 
         'extracted_data/Q1.5_hourly_sample5_period1.csv', 'extracted_data/Q1.5_hourly_sample6_period2.csv', 
         'extracted_data/Q1.5_hourly_sample7_period3.csv', 'extracted_data/Q1.5_hourly_sample8_period1.csv', 
         'extracted_data/Q1.5_hourly_sample9_period2.csv', 'extracted_data/Q1.5_hourly_sample10_period3.csv']

# predict on test data
def predict_on_test_data(file, period_model, df):
    period = file[-1]
    input_arr = []
    predicted_output = None
    results = period_model[period]
    cur_input = []
    if file[6] == '8': # 4-hour window
        for i in range(4):
            for p in range(1,8): # append each column
                cur_input.append(df.iloc[i, p])
        results = period_model['4']
    else:
        for i in range(5):
            for p in range(1,8): # append each column
                cur_input.append(df.iloc[i, p])

    input_arr.append(cur_input)
        
    predicted_output = results.predict(input_arr)
    return predicted_output

test_data = []
predict = []
true = []
error = []
# predict on each test file
for file in files:
    test_data.append(file[27:-4])
    
    # load data
    df = pd.read_csv(file)
    df.columns = ['time', 'tweet', 'retweets', 'followers', 'urls', 'authors', 'mentions', 'ranking score']
    
    # predict
    predicted_output = predict_on_test_data(file[27:-4], period_model, df)
    predict.append(predicted_output[0])
    
    # relative error
    true_value = df.loc[df.index[len(df.index)-1], 'tweet':'tweet'].values
    true.append(true_value[0])
    rel_error = abs(predicted_output-true_value)/true_value
    error.append(rel_error[0])
    
res = pd.DataFrame({
    'test file' : test_data,
    'predicted' : predict,
    'true value' : true,
    'relative error' : error
}, columns = ['test file', 'predicted', 'true value', 'relative error'])
display(res)
    

Unnamed: 0,test file,predicted,true value,relative error
0,sample1_period1,-576.333912,178.0,4.237831
1,sample2_period2,36342.457437,82923.0,0.561732
2,sample3_period3,854.793986,523.0,0.634405
3,sample4_period1,231.368785,201.0,0.151088
4,sample5_period1,1093.155754,213.0,4.132187
5,sample6_period2,60845.643856,37307.0,0.630944
6,sample7_period3,94.144711,120.0,0.215461
7,sample8_period1,-150.774339,11.0,14.706758
8,sample9_period2,1560.533447,2790.0,0.440669
9,sample10_period3,84.020009,61.0,0.377377
