# Predicting vaccination using tweets and government data

In [197]:
%%time 

import pandas as pd
import geocoder
import numpy as np
import os
import json
import datetime 
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import metrics

# Directory holding the government data files; the same value is assigned
# again in a later configuration cell.
government_data_dir = './vaccine_data/hhs_data/'


CPU times: user 30 µs, sys: 2 µs, total: 32 µs
Wall time: 38.9 µs


In [198]:
def week_num_from_start_date(curr_date,start_date):
    '''Return the 1-based week index of ``curr_date`` counted from ``start_date``.

    ``curr_date`` is a string formatted as ``%Y_%m_%d`` and ``start_date`` is a
    ``datetime.datetime``. The first seven days starting at ``start_date`` are
    week 1; because floor division is used, dates before ``start_date`` yield
    values less than or equal to 0.
    '''
    parsed_date = datetime.datetime.strptime(curr_date, '%Y_%m_%d')
    elapsed_days = (parsed_date - start_date).days
    whole_weeks_elapsed = elapsed_days // 7
    return whole_weeks_elapsed + 1

In [199]:
# Directory holding the government data; it is read below as
# <government_data_dir>/<split>/<year>/national.json.
government_data_dir = './vaccine_data/hhs_data/'
# Directory where the per-year weekly tweet-trend CSVs are written and later read back.
tweet_trends_weekly_output_dir = './tweet_trends/weekly_data'
# Directory that is listed to find the mined tweet CSV inputs.
mined_trends_dir = 'mined_trends'

In [200]:
# Years belonging to each data split. They are kept as strings because the
# prediction cells use them directly as path components and CSV file names.
train_years = ['2013','2014']
dev_years = ['2015']
test_years = ['2016','2017']

# (split name, years) pairs; the split name is used as a sub-directory of
# government_data_dir when the national.json files are read.
train_download_dir = ('train',train_years)
dev_download_dir = ('dev',dev_years)
test_download_dir = ('test',test_years)
data_split_ls = [train_download_dir,dev_download_dir,test_download_dir]

# Generate the weekly tweet-trends DataFrame

In [201]:
def generate_tweet_trends_weekly_csv(tweet_file_names,tweet_output_directory='./tweet_trends/weekly_data'):
    '''Aggregate mined tweet counts into one weekly CSV per government reporting year.

    For every (split, year) pair listed below, the government ``national.json``
    file is read to obtain the first ``week_start`` value. Tweets from
    ``year - 1`` and ``year`` are bucketed into weeks counted from that date,
    and the per-week sums of the ``count`` column are written to
    ``<tweet_output_directory>/<year>.csv`` with one column per tweet file.

    Each output CSV has exactly 52 rows: row ``i`` (0-based) holds week
    ``i + 1``, and a week with no tweets is written as 0, so rows always line
    up with week numbers.

    Args:
        tweet_file_names: Paths of the tweet CSV files. Each file must provide
            ``year``, ``date`` (formatted ``%Y_%m_%d``) and ``count`` columns,
            plus the saved index in an ``Unnamed: 0`` column. The output column
            name is the file's base name without its extension.
        tweet_output_directory: Directory for the per-year CSVs; created if it
            does not exist.

    Returns:
        None. The results are written to disk.
    '''
    data_split_ls = [('train', [2013, 2014]), ('dev', [2015]), ('test', [2016, 2017])]

    # Load every tweet file once up front instead of re-reading it per year.
    tweets_by_trend = {}
    for tweet_file_name in tweet_file_names:
        # Base name without extension, independent of the directory prefix.
        trend_name = os.path.splitext(os.path.basename(tweet_file_name))[0]
        tweets_by_trend[trend_name] = pd.read_csv(tweet_file_name, index_col=['Unnamed: 0'])

    os.makedirs(tweet_output_directory, exist_ok=True)
    all_weeks = list(range(1, 53))

    for split, years in data_split_ls:
        for year in years:
            national_json_path = os.path.join(government_data_dir, split, str(year), 'national.json')
            # Context manager so the JSON file handle is closed after parsing.
            with open(national_json_path) as national_json_file:
                government_data_df = pd.read_json(national_json_file)

            # 'week_start' is parsed as day + abbreviated month (e.g. '%d%b');
            # it is combined with the previous calendar year to get a full date.
            first_week_start = government_data_df.loc[0, 'week_start']
            government_df_start_date = datetime.datetime.strptime(
                "{}_{}".format(year - 1, first_week_start), '%Y_%d%b')

            tweet_trends_dict = {}
            for trend_name, tweets_df in tweets_by_trend.items():
                # .copy() makes the filtered frame independent, so adding a
                # column below does not trigger SettingWithCopyWarning.
                in_season = tweets_df['year'].isin([year - 1, year])
                curr_tweet_df = tweets_df.loc[in_season].copy()
                curr_tweet_df['week_number'] = curr_tweet_df['date'].apply(
                    lambda x: week_num_from_start_date(x, government_df_start_date))
                in_range = (curr_tweet_df['week_number'] > 0) & (curr_tweet_df['week_number'] < 53)
                curr_tweet_df = curr_tweet_df[in_range]

                # Sum per week, then reindex on weeks 1..52 so that a week with
                # no tweets becomes an explicit 0 instead of shifting later
                # weeks up by one row.
                weekly_counts = curr_tweet_df.groupby('week_number')['count'].sum()
                weekly_counts = weekly_counts.reindex(all_weeks, fill_value=0)

                # 0-based positional index, matching how the CSV is read back.
                tweet_trends_dict[trend_name] = weekly_counts.reset_index(drop=True)

            tweet_trends_df = pd.DataFrame(tweet_trends_dict)
            tweet_trends_df.to_csv("{}/{}.csv".format(tweet_output_directory, year))

In [202]:
# Collect the mined tweet CSVs and build the per-year weekly tweet-trend files.
# - Paths are built from mined_trends_dir rather than a repeated literal.
# - endswith('.csv') skips names that merely contain '.csv' (e.g. 'a.csv.bak').
# - sorted() makes the processing order deterministic, since os.listdir
#   returns entries in arbitrary order.
curr_files_list = sorted(
    os.path.join(mined_trends_dir, file_name)
    for file_name in os.listdir(mined_trends_dir)
    if file_name.endswith('.csv')
)
generate_tweet_trends_weekly_csv(curr_files_list,tweet_trends_weekly_output_dir)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Prediction Part

In [203]:
# Compute the per-week mean of the government 'percentage' series across the
# training years; the prediction cells subtract it from each year's series.
split = 'train'
train_years = ['2013','2014']
train_download_dir = (split,train_years)

X = []
for year in train_years:
    # Context manager so the JSON file handle is closed after parsing.
    with open(os.path.join(government_data_dir,split,year,'national.json')) as national_json_file:
        data_df = pd.read_json(national_json_file)
    X.append(data_df['percentage'].values)
# Stacking into a 2-D array and averaging over axis 0 assumes every training
# year has the same number of weekly rows — TODO confirm.
X = np.asarray(X)
week_means = X.mean(axis = 0)

# First week index used as a prediction target in the train/dev/test cells.
threshold_start_pred  = 1


In [204]:
# Build the training set and fit the linear model.
# Features per sample: the first `government_threshold_start_pred` de-meaned
# government percentages followed by the first `tweet_threshold_start_pred`
# rows of the weekly tweet-trend table. Target: the de-meaned percentage of
# each week from `threshold_start_pred` onwards.
train_download_dir = ('train',train_years)
split,years = train_download_dir
government_threshold_start_pred  = 4
tweet_threshold_start_pred = 4
Y = []
X = []
for year in years:
    # Context managers so the data file handles are closed after parsing.
    with open(os.path.join(government_data_dir,split,year,'national.json')) as national_json_file:
        government_data_df = pd.read_json(national_json_file)
    percent_vaccinated_week = (government_data_df['percentage']).values
    percent_vaccinated_week = percent_vaccinated_week -  week_means

    with open(os.path.join(tweet_trends_weekly_output_dir,"{}.csv".format(year))) as tweet_csv_file:
        tweet_weekly_df = pd.read_csv(tweet_csv_file,index_col=['Unnamed: 0'])

    initial_week_government_val = [percent_vaccinated_week[i] for i in range(government_threshold_start_pred)]

    inital_week_tweet_vals  = []
    for i in range(tweet_threshold_start_pred):
        inital_week_tweet_vals += list(tweet_weekly_df.loc[i].values)

    # NOTE(review): the feature vector does not depend on the target week, so
    # every sample of a given year shares the same features. Targets also start
    # at week `threshold_start_pred` (1), which overlaps the government weeks
    # used as features — confirm this is intended.
    year_features = initial_week_government_val+inital_week_tweet_vals
    for i in range(threshold_start_pred,len(percent_vaccinated_week)):
        X.append(year_features)
        Y.append(percent_vaccinated_week[i])


X = np.asarray(X)
Y = np.asarray(Y)
model = linear_model.LinearRegression()
model.fit(X,Y)
y_pred = model.predict(X)
# Only the last sample is scored; arguments follow scikit-learn's
# (y_true, y_pred) order, and the value is the same either way for MSE.
# NOTE(review): this is a squared error scaled by 100, not a percentage.
error = metrics.mean_squared_error([Y[-1]],[y_pred[-1]])
print("Error in last train prediction {} %".format(error*100 ))

Error in last train prediction 0.00022510474748241332 %


# Dev prediction

In [205]:
# Evaluate the fitted model on the dev split, using the same feature layout
# as the training cell.
split,years = dev_download_dir
Y = []
X = []
for year in years:
    # Context managers so the data file handles are closed after parsing.
    with open(os.path.join(government_data_dir,split,year,'national.json')) as national_json_file:
        government_data_df = pd.read_json(national_json_file)
    percent_vaccinated_week = (government_data_df['percentage']).values
    # Slice week_means to this year's length, as the test cell does, so a
    # shorter series does not fail on a shape mismatch.
    percent_vaccinated_week = percent_vaccinated_week -  week_means[0:len(percent_vaccinated_week)]

    with open(os.path.join(tweet_trends_weekly_output_dir,"{}.csv".format(year))) as tweet_csv_file:
        tweet_weekly_df = pd.read_csv(tweet_csv_file,index_col=['Unnamed: 0'])

    initial_week_government_val = [percent_vaccinated_week[i] for i in range(government_threshold_start_pred)]

    inital_week_tweet_vals  = []
    for i in range(tweet_threshold_start_pred):
        inital_week_tweet_vals += list(tweet_weekly_df.loc[i].values)

    # The feature vector is the same for every target week of the year.
    year_features = initial_week_government_val+inital_week_tweet_vals
    for i in range(threshold_start_pred,len(percent_vaccinated_week)):
        X.append(year_features)
        Y.append(percent_vaccinated_week[i])


X = np.asarray(X)
Y = np.asarray(Y)
y_pred = model.predict(X)
# Only the last sample is scored; arguments follow scikit-learn's
# (y_true, y_pred) order, and the value is the same either way for MSE.
error = metrics.mean_squared_error([Y[-1]],[y_pred[-1]])
print("Error in dev train prediction {} %".format(error*100 ))

Error in dev train prediction 0.032246304565517005 %


In [206]:
# Evaluate the fitted model on each test year separately, using the same
# feature layout as the training cell.
split,years = test_download_dir
for year in years:
    Y = []
    X = []

    # Context managers so the data file handles are closed after parsing.
    with open(os.path.join(government_data_dir,split,year,'national.json')) as national_json_file:
        government_data_df = pd.read_json(national_json_file)
    percent_vaccinated_week = (government_data_df['percentage']).values
    # week_means is sliced to this year's length before subtracting.
    percent_vaccinated_week = percent_vaccinated_week -  week_means[0:len(percent_vaccinated_week)]

    with open(os.path.join(tweet_trends_weekly_output_dir,"{}.csv".format(year))) as tweet_csv_file:
        tweet_weekly_df = pd.read_csv(tweet_csv_file,index_col=['Unnamed: 0'])

    initial_week_government_val = [percent_vaccinated_week[i] for i in range(government_threshold_start_pred)]

    inital_week_tweet_vals  = []
    for i in range(tweet_threshold_start_pred):
        inital_week_tweet_vals += list(tweet_weekly_df.loc[i].values)

    # The feature vector is the same for every target week of the year.
    # The original loop also evaluated `tweet_weekly_df.loc[i]` and discarded
    # the result; that dead lookup is removed because it could only raise
    # KeyError when the tweet table has fewer rows than the government series.
    year_features = initial_week_government_val+inital_week_tweet_vals
    for i in range(threshold_start_pred,len(percent_vaccinated_week)):
        X.append(year_features)
        Y.append(percent_vaccinated_week[i])

    X = np.asarray(X)
    Y = np.asarray(Y)
    y_pred = model.predict(X)
    # Only the last sample is scored; arguments follow scikit-learn's
    # (y_true, y_pred) order, and the value is the same either way for MSE.
    error = metrics.mean_squared_error([Y[-1]],[y_pred[-1]])
    print("Error in test prediction for year {} = {} %".format(year, error*100))

Error in test prediction for year 2016 = 0.02859682623095311 %
Error in test prediction for year 2017 = 0.15564644158487295 %


In [209]:
# Show the fitted coefficients; their order follows the feature layout built in
# the training cell (government values first, then the tweet-trend values).
print("Model Coeff {},".format(model.coef_))

Model Coeff [-4.10263935e-13 -8.37865640e-13 -9.51696650e-13 -4.35041049e-13
  4.22266597e-08  6.38722584e-09  1.02905305e-08  1.20292753e-07
  1.34841434e-08 -6.67110254e-08  8.87114700e-09 -2.94522080e-08
 -5.46462655e-08  1.31292976e-08  7.06143301e-08  2.69682869e-08
  1.77422940e-08  1.41583506e-07  3.86782009e-08 -7.70015560e-08
  1.84519858e-08 -9.58083876e-08 -1.71745406e-07  2.73231328e-08],
