In [7]:
import datetime, time
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz
import statsmodels.api as sm

In [8]:
# Raw tweet dumps to convert, one file per tracked hashtag, all living in
# the same data directory and following the same naming scheme.
files = ['ECE219_tweet_data/tweets_#' + hashtag + '.txt'
         for hashtag in ('gohawks', 'gopatriots', 'nfl',
                         'patriots', 'sb49', 'superbowl')]

def getcsv(file):
    """Flatten a newline-delimited JSON tweet dump into a per-tweet CSV.

    Every non-blank line of ``file`` is parsed as one JSON object. From each
    object the function keeps ``citation_date`` (written as ``timestamp``),
    ``author.followers`` (``followers``) and ``metrics.citations.total``
    (``retweeted times``), and adds a constant ``tweet`` column of 1 so that
    later aggregation can count tweets by summing.

    The table is written, without the index, to ``part1_1<stem>.csv`` in the
    current working directory, where ``<stem>`` is the input's base name
    stripped of its directory and extension.

    Parameters
    ----------
    file : str
        Path to the raw tweet file.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a non-blank line is not valid JSON (``json.loads`` failure).
    KeyError
        If a tweet object lacks one of the fields read above.
    """
    columns = ['tweet', 'timestamp', 'followers', 'retweeted times']
    records = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, 'r', encoding='utf-8') as fl:
        for line in fl:
            # A blank (typically trailing) line is not a tweet; skip it
            # instead of letting json.loads fail on it.
            if not line.strip():
                continue
            data = json.loads(line)
            records.append((
                1,
                data['citation_date'],
                data['author']['followers'],
                data['metrics']['citations']['total'],
            ))

    # Build the frame once, after the input handle has been released.
    df = pd.DataFrame(records, columns=columns)

    # Derive the output name from the path itself rather than from fixed
    # character offsets, so inputs in any directory are named correctly.
    stem = os.path.splitext(os.path.basename(file))[0]
    df.to_csv('part1_1' + stem + '.csv', index=False)

# Convert every raw tweet dump listed in `files` into its per-tweet CSV.
# Note: the loop variable `file` stays bound in the notebook namespace afterwards.
for file in files:
    getcsv(file)

In [9]:
# Per-tweet CSVs (one per hashtag) that the preprocessing / regression
# cells read, in the same hashtag order as the raw inputs.
newfiles = ['part1_1tweets_#' + hashtag + '.csv'
            for hashtag in ('gohawks', 'gopatriots', 'nfl',
                            'patriots', 'sb49', 'superbowl')]

In [10]:
def preprocess(file):
    """Aggregate a per-tweet CSV into one row per US/Pacific clock hour.

    The CSV must have four columns, which are renamed to ``tweet``,
    ``timestamp`` (epoch seconds), ``followers`` and ``retweets``. Each
    timestamp is converted to US/Pacific wall-clock time and bucketed by
    ``date`` (integer ``YYYYMMDD``) and ``time`` (hour of day, 0-23).
    Per bucket the function computes the sum of ``tweet``, ``retweets`` and
    ``followers`` and the maximum of ``followers`` (``followers_max``).

    Every hour between the first and the last populated hour is present in
    the result; hours without tweets get a row of zeros. Unlike a
    neighbour-to-neighbour hour comparison, this also fills gaps that are a
    full day or longer.

    The result (without its index) is written to ``part1_2<tag>.csv`` in the
    current working directory, where ``<tag>`` is the part of the input's
    base name after the first ``#`` (or the whole stem if there is no ``#``).

    Parameters
    ----------
    file : str
        Path to the per-tweet CSV.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(date, time)`` in chronological order, with columns
        ``time``, ``tweet``, ``retweets``, ``followers``, ``followers_max``
        in exactly that order (callers slice ``'time':'followers_max'``).
    """
    data = pd.read_csv(file)
    data.columns = ['tweet', 'timestamp', 'followers', 'retweets']

    # Epoch seconds -> US/Pacific wall-clock time for the whole column at
    # once. The zone is dropped afterwards so that hours are counted on the
    # local clock, which is what the (date, hour) buckets represent.
    wall_clock = (
        pd.to_datetime(data['timestamp'].astype('int64'), unit='s', utc=True)
        .dt.tz_convert('US/Pacific')
        .dt.tz_localize(None)
    )
    tweets = data.drop(columns='timestamp').assign(
        date=(wall_clock.dt.year * 10000
              + wall_clock.dt.month * 100
              + wall_clock.dt.day).astype('int64'),
        time=wall_clock.dt.hour.astype('int64'),
        followers_max=data['followers'],
    )

    # Explicit column selection pins the order independently of how the
    # aggregation dict happens to be iterated.
    value_columns = ['tweet', 'retweets', 'followers', 'followers_max']
    hourly = tweets.groupby(['date', 'time']).agg(
        {'tweet': 'sum', 'retweets': 'sum', 'followers': 'sum', 'followers_max': 'max'}
    )[value_columns]

    if not hourly.empty:
        # Enumerate every clock hour in the observed span with real date
        # arithmetic, then reindex once; missing hours become all-zero rows.
        hours = pd.date_range(wall_clock.min().floor('h'),
                              wall_clock.max().floor('h'), freq='h')
        every_hour = pd.MultiIndex.from_arrays(
            [(hours.year * 10000 + hours.month * 100 + hours.day).astype('int64'),
             hours.hour.astype('int64')],
            names=['date', 'time'],
        )
        hourly = hourly.reindex(every_hour, fill_value=0)

    # Hour of day is also a model feature, so expose it as the first column.
    hourly.insert(0, 'time', hourly.index.get_level_values('time').values)

    tag = os.path.splitext(os.path.basename(file))[0].split('#', 1)[-1]
    hourly.to_csv('part1_2' + tag + '.csv', index=False)
    return hourly


In [11]:
def regression_analysis(file, df):
    """Fit an OLS model of the next row's tweet count and print a report.

    The columns of ``df`` from ``'time'`` through ``'followers_max'`` form
    the feature vector of each row; the target for a row is the ``tweet``
    value of the row that follows it. The model is fitted with an added
    constant term, evaluated on the same rows it was fitted on, and the
    resulting RMSE plus the statsmodels summary are printed.

    Parameters
    ----------
    file : str
        Source file name; characters ``[15:-4]`` are used as the label in
        the printed RMSE line.
    df : pandas.DataFrame
        Frame containing the feature columns and a ``tweet`` column.

    Returns
    -------
    None
    """
    feature_rows = [df.loc[key, 'time':'followers_max'].values for key in df.index]
    # The final row has no successor to act as its target, so it is dropped.
    feature_rows.pop()
    design = sm.add_constant(feature_rows)

    # Targets are the tweet counts shifted by one row (second row onwards).
    targets = df.loc[df.index[1]:, 'tweet'].values

    fitted = sm.OLS(targets, design).fit()
    in_sample = fitted.predict(design)
    rmse = calculate_rmse(in_sample, targets)

    label = file[15:-4]
    print('RMSE of ' + label + ' for the linear regression model is: ' + str(rmse))
    print(fitted.summary())


In [12]:
def calculate_rmse(prediction, real):
    """Return the root-mean-square error between ``prediction`` and ``real``.

    Both arguments must support element-wise subtraction and the result must
    provide ``.mean()`` (e.g. NumPy arrays of equal shape).
    """
    squared_error = (prediction - real) ** 2
    mean_squared_error = squared_error.mean()
    return np.sqrt(mean_squared_error)

In [13]:
def regression_model(file):
    """Preprocess ``file`` and pass the resulting frame to ``regression_analysis``."""
    regression_analysis(file, preprocess(file))
# Run preprocessing and the regression report for every CSV listed in `newfiles`.
# Note: the loop variable `file` stays bound in the notebook namespace afterwards.
for file in newfiles:
    regression_model(file)

RMSE of gohawks for the linear regression model is: 870.950198592
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.476
Model:                            OLS   Adj. R-squared:                  0.472
Method:                 Least Squares   F-statistic:                     104.1
Date:                Mon, 18 Mar 2019   Prob (F-statistic):           5.01e-78
Time:                        01:54:01   Log-Likelihood:                -4733.0
No. Observations:                 578   AIC:                             9478.
Df Residuals:                     572   BIC:                             9504.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------

RMSE of sb49 for the linear regression model is: 4023.48408495
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.805
Model:                            OLS   Adj. R-squared:                  0.803
Method:                 Least Squares   F-statistic:                     474.3
Date:                Mon, 18 Mar 2019   Prob (F-statistic):          1.66e-201
Time:                        01:54:20   Log-Likelihood:                -5656.4
No. Observations:                 582   AIC:                         1.132e+04
Df Residuals:                     576   BIC:                         1.135e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------