# Part 4: Prediction on test dataset

## Importing libraries

In [2]:
import os
import json
import numpy as np
import pandas as pd
from pandas import DataFrame
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict

## Extracting data

In [3]:
files_dictionary_train = {'#GoHawks' : ['tweets_#gohawks.txt', 188136],
                          '#GoPatriots' : ['tweets_#gopatriots.txt', 26232],
                          '#NFL' : ['tweets_#nfl.txt', 259024],
                          '#Patriots' : ['tweets_#patriots.txt', 489713],
                          '#SB49' : ['tweets_#sb49.txt', 826951],
                          '#SuperBowl' : ['tweets_#superbowl.txt', 1348767]}
    
files_dictionary_test = {1 : ['sample1_period1.txt', 730],
                         2 : ['sample2_period2.txt', 212273],
                         3 : ['sample3_period3.txt', 3628],
                         4 : ['sample4_period1.txt', 1646],
                         5 : ['sample5_period1.txt', 2059],
                         6 : ['sample6_period2.txt', 205554],
                         7 : ['sample7_period3.txt', 528],
                         8 : ['sample8_period1.txt', 229],
                         9 : ['sample9_period2.txt', 11311],
                        10 : ['sample10_period3.txt', 365]}

#----------------------Function to extract the data from the file--------------
def extract_info(filename_key, file_name_dictionary, is_test_data):                         
                    
    #-----------------------To extract the data from the file------------------                
    time_stamps = [0]*file_name_dictionary[filename_key][1]
    is_retweet = [False]*file_name_dictionary[filename_key][1]
    followers_of_users = [0]*file_name_dictionary[filename_key][1]
    
    no_of_url_citations = [0]*file_name_dictionary[filename_key][1]
    usernames = ['']*file_name_dictionary[filename_key][1]
    no_of_mentions = [0]*file_name_dictionary[filename_key][1]
    ranking_scores = [0.0]*file_name_dictionary[filename_key][1]
    no_of_hashtags = [0]*file_name_dictionary[filename_key][1]
    
    location_path = ''
    feature_name = ''
    
    if is_test_data:
        location_path = './Test_data/'+file_name_dictionary[filename_key][0]
        feature_name = 'firstpost_date'
    else:
        location_path = './Training_data/'+file_name_dictionary[filename_key][0]
        feature_name = 'citation_date'
        
    file_in = open(location_path, encoding = "utf8")
    for (line, index) in zip(file_in, range(0, file_name_dictionary[filename_key][1])):
        data = json.loads(line)
        time_stamps[index] = data[feature_name]
        followers_of_users[index] = data['author']['followers']

        username = data['author']['nick']
        original_username = data['original_author']['nick']
        if username != original_username:
            is_retweet[index] = True

        no_of_url_citations[index] = len(data['tweet']['entities']['urls'])
        usernames[index] = username
        no_of_mentions[index] = len(data['tweet']['entities']['user_mentions'])
        ranking_scores[index] = data['metrics']['ranking_score']
        no_of_hashtags[index] = data['title'].count('#')
                
    file_in.close()
    
    #--------------------To calculate the related parameters-------------------
    start_time = min(time_stamps)
    
    if is_test_data:
        start_time = (min(time_stamps)/3600)*3600

    hrs_passed = int((max(time_stamps)-start_time)/3600)+1
    hr_no_of_tweets = [0] * hrs_passed
    hr_no_of_retweets = [0] * hrs_passed
    hr_sum_of_followers = [0] * hrs_passed
    hr_max_no_of_followers = [0] * hrs_passed
    hr_time_of_the_day = [0] * hrs_passed
    hr_no_of_url_citations = [0] * hrs_passed
    hr_no_of_users = [0] * hrs_passed
    hr_user_set = [0] * hrs_passed
    hr_no_of_mentions = [0] * hrs_passed
    hr_total_ranking_scores = [0.0] * hrs_passed
    hr_no_of_hashtags = [0] * hrs_passed
    for i in range(0, hrs_passed):
        hr_user_set[i] = set([])
        
    for i in range(0, file_name_dictionary[filename_key][1]):
        current_hr = int((time_stamps[i]-start_time)/3600)
        
        if is_retweet[i]:
            hr_no_of_retweets[current_hr] += 1
    
        if followers_of_users[i] > hr_max_no_of_followers[current_hr]:
            hr_max_no_of_followers[current_hr] = followers_of_users[i]

        hr_sum_of_followers[current_hr] += followers_of_users[i]
        hr_no_of_tweets[current_hr] += 1
        hr_no_of_url_citations[current_hr] += no_of_url_citations[i]
        hr_user_set[current_hr].add(usernames[i])
        hr_no_of_mentions[current_hr] += no_of_mentions[i]
        hr_total_ranking_scores[current_hr] += ranking_scores[i]
        hr_no_of_hashtags[current_hr] += no_of_hashtags[i]

    for i in range(0, len(hr_user_set)):
        hr_no_of_users[i] = len(hr_user_set[i])
    
    if is_test_data:
        for i in range(0, len(hr_time_of_the_day)):
            hr_time_of_the_day[i] = ((start_time-1421222400)/3600+i)%24
    else:
        for i in range(0, len(hr_time_of_the_day)):
            hr_time_of_the_day[i] = i%24
   
    #------------------To build the DataFrame and save it to file--------------
    target = hr_no_of_tweets[1:]
    target.append(0)
    data = np.array([hr_no_of_tweets,
                     hr_no_of_retweets,
                     hr_sum_of_followers,
                     hr_max_no_of_followers,
                     hr_time_of_the_day,
                     hr_no_of_url_citations,
                     hr_no_of_users,
                     hr_no_of_mentions,
                     hr_total_ranking_scores,
                     hr_no_of_hashtags,
                     target])
    data = np.transpose(data)
    
    data_frame = DataFrame(data)
    data_frame.columns = ['no_of_tweets', 
                          'no_of_retweets', 
                          'sum_of_followers',
                          'max_no_of_followers',
                          'time_of_day',
                          'no_of_URLs',
                          'no_of_users',
                          'no_of_mentions',
                          'ranking_score',
                          'no_of_hashtags',
                          'target']

    if os.path.isdir('./Extracted_data'):
        pass
    else:
        os.mkdir('./Extracted_data')
           
    if is_test_data:
        data_frame.to_csv('./Extracted_data/pred_test_'+file_name_dictionary[filename_key][0][:-4]+'.csv', index = False)  
    else:
        data_frame.to_csv('./Extracted_data/pred_test_'+filename_key+'.csv', index = False)  
#------------------------------------------------------------------------------  


## One Hot Encoding

In [4]:
#----------------------------One-hot Encoding----------------------------------
def one_hot_encode(data_frame):
    time_of_day_set = range(0,24)
    for time_of_day in time_of_day_set:
        time_of_day_column_to_add = []
        for time_of_day_item in data_frame['time_of_day']:
            if time_of_day_item == time_of_day:
                time_of_day_column_to_add.append(1)
            else:
                time_of_day_column_to_add.append(0)
        data_frame.insert(data_frame.shape[1]-1,
                  str(time_of_day)+'th_hour',
                  time_of_day_column_to_add)
    return data_frame
#------------------------------------------------------------------------------  

## Function to perform cross validation

In [5]:

#---------------------Function to perform Cross Validation---------------------
def regression(train_hashtag, test_data_index):
    train_x = pd.read_csv('./Extracted_data/pred_test_'+train_hashtag+'.csv')
    test_x = pd.read_csv('./Extracted_data/pred_test_'+files_dictionary_test[test_data_index][0][:-4]+'.csv')
    
    train_x = one_hot_encode(train_x)
    test_x = one_hot_encode(test_x)
         
    #----------------------------Splitting the data----------------------------   
    train_x.drop('time_of_day', 1, inplace = True)
    train_y = train_x.pop('target')
    
    test_x.drop('time_of_day', 1, inplace = True)
    test_y = test_x.pop('target')
    
    train_x_before = train_x[:440]
    train_x_during = train_x[440:452]
    train_x_after = train_x[452:]
        
    train_y_before = train_y[:440]
    train_y_during = train_y[440:452]
    train_y_after = train_y[452:]
    
    #---------------------------Regression Prediction--------------------------
    reg_before = RandomForestRegressor(n_estimators = 20, max_depth = 9)
    reg_during = RandomForestRegressor(n_estimators = 20, max_depth = 9)
    reg_after = RandomForestRegressor(n_estimators = 20, max_depth = 9)

    reg_before.fit(train_x_before,train_y_before)
    reg_during.fit(train_x_during,train_y_during)
    reg_after.fit(train_x_after,train_y_after)
    
    predicted_y = []
    if files_dictionary_test[test_data_index][0][-5] == '1':
        predicted_y = reg_before.predict(test_x)
    elif files_dictionary_test[test_data_index][0][-5] == '2':
        predicted_y = reg_during.predict(test_x)
    else:
        predicted_y = reg_after.predict(test_x)
    
    #-------------------------To print the predicted values--------------------   
    data = np.array([predicted_y, test_y])
    data = np.transpose(data)
    results = DataFrame(data)
    results.columns = ['Predicted', 'Actual']
     
    #--------------To calculate the average cross-validation error-------------
    total_error = 0.0
    for i in range(len(test_y)-1):
        total_error += abs(test_y[i] - predicted_y[i])
    
    return results, total_error/(len(test_y)-1)
#------------------------------------------------------------------------------

## Predict

In [6]:
def predict(test_data_index):
    result_list = []
    error_list = []
    hashtag_list = ['#GoHawks',
                    '#GoPatriots',
                    '#NFL',
                    '#Patriots',
                    '#SB49',
                    '#SuperBowl']

    for hashtag in hashtag_list:
        result, error = regression(hashtag, test_data_index)
        result_list.append(result)
        error_list.append(error)
    
    min_index = 0
    min_error = error_list[0]
    
    for i in range(0, len(error_list)):
        if error_list[i] < min_error:
            min_error = error_list[i]
            min_index = i
    
    return result_list[min_index],min_error,hashtag_list[min_index]


## Get Optimal result

In [7]:
def get_optimal_result(test_data_index):
    optimal_result, min_error, optimal_hashtag = predict(test_data_index)
    
    for i in range(20):
        result, error, hashtag = predict(test_data_index)
        if error  < min_error:
            min_error = error
            optimal_result = result
            optimal_hashtag = hashtag

    print('------------------------------------------------') 
    print('\n'+files_dictionary_test[test_data_index][0]+'\n')
    print('Best training dataset:',optimal_hashtag)
    print(optimal_result)
    print('Average cross-validation error:',min_error)
    print('------------------------------------------------')  


## Making csv files

In [8]:
extract_info('#GoHawks',files_dictionary_train,False)
extract_info('#GoPatriots',files_dictionary_train,False)
extract_info('#NFL',files_dictionary_train,False)
extract_info('#Patriots',files_dictionary_train,False)
extract_info('#SB49',files_dictionary_train,False)
extract_info('#SuperBowl',files_dictionary_train,False)

## Getting results

In [10]:
for i in range(1,11):
    extract_info(i,files_dictionary_test,True)

for i in range(1,11):
    get_optimal_result(i) 

------------------------------------------------

sample1_period1.txt

Best training dataset: #GoPatriots
    Predicted  Actual
0  141.110000    82.0
1   92.850909    68.0
2   65.500909    94.0
3  166.110000   171.0
4  209.660000   178.0
5  207.210000     0.0
Average cross-validation error: 29.802
------------------------------------------------
------------------------------------------------

sample2_period2.txt

Best training dataset: #SuperBowl
   Predicted   Actual
0   15047.65   9361.0
1   14977.85  10374.0
2   25317.15  20066.0
3   45292.40  81958.0
4   93018.70  82923.0
5   95139.30      0.0
Average cross-validation error: 12460.59
------------------------------------------------
------------------------------------------------

sample3_period3.txt

Best training dataset: #GoHawks
   Predicted  Actual
0     458.35   549.0
1     528.45   610.0
2     522.55   888.0
3     557.75   616.0
4     522.55   523.0
5     490.45     0.0
Average cross-validation error: 119.27000000000001
--