# Predicting state vaccination using tweets and government data

In [1]:
import pandas as pd
import geocoder
import numpy as np
import os
import json
import datetime 
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import metrics



In [2]:
def week_num_from_start_date(curr_date,start_date):
    '''Return the 1-based week index of ``curr_date`` relative to ``start_date``.

    ``curr_date`` is a string formatted as ``YYYY_MM_DD``; ``start_date`` is a
    ``datetime.datetime``. Dates within the first seven days after
    ``start_date`` map to week 1; dates before ``start_date`` map to 0 or a
    negative number (floor division).
    '''
    parsed_date = datetime.datetime.strptime(curr_date, '%Y_%m_%d')
    elapsed_days = (parsed_date - start_date).days
    # divmod floors toward negative infinity, exactly like ``//``.
    completed_weeks, _ = divmod(elapsed_days, 7)
    return completed_weeks + 1

In [3]:
# Directory whose *.csv files (mined tweet trends) are listed when the weekly
# trend CSVs are generated.
mined_trends_dir = "mined_trends"
# Root of the government vaccination JSON files; read as
# <root>/<split>/<year>/<state_code>.json.
government_data_dir = "./vaccine_data/hhs_data/"
# Directory the weekly tweet-trend CSVs are written to and later read back from
# as <state>_<year>.csv.
tweet_trends_weekly_output_dir = "./tweet_trends/weekly_data"

In [4]:
# Years belonging to each split. They are kept as strings because they are
# used directly as path components when the government JSON files are opened.
train_years, dev_years, test_years = ['2013', '2014'], ['2015'], ['2016', '2017']

# (split directory name, years in that split) pairs.
train_download_dir = ('train', train_years)
dev_download_dir = ('dev', dev_years)
test_download_dir = ('test', test_years)

# All splits, in train / dev / test order.
data_split_ls = [
    train_download_dir,
    dev_download_dir,
    test_download_dir,
]

# Generate weekly tweet trends for National, CA and NY

In [None]:
def generate_tweet_trends_weekly_csv(tweet_file_names,
                                     tweet_output_directory='./tweet_trends/weekly_data',state = 'National'):
    '''Aggregate mined tweet counts per week and write one CSV per year.

    For every year in the train/dev/test splits (2013-2017) the start date of
    the government reporting period is read from
    ``<government_data_dir>/<split>/<year>/national.json`` (row 0,
    ``week_start`` column, parsed as day + abbreviated month in calendar year
    ``year - 1``). Tweets are bucketed into weeks 1..52 counted from that date
    and their ``count`` column is summed per week.

    Args:
        tweet_file_names: paths of tweet CSV files. Each must provide the
            columns ``year``, ``date`` (``YYYY_MM_DD``), ``state`` and
            ``count``, plus the ``Unnamed: 0`` index column.
        tweet_output_directory: directory the output CSVs are written to;
            created if missing.
        state: state name used to filter the ``state`` column. Any
            capitalisation of ``national`` disables the filter.

    Output:
        ``<tweet_output_directory>/<state>_<year>.csv`` with one column per
        input file (named after the file without directory and extension) and
        exactly 52 rows; row ``i`` holds the count for week ``i + 1``, with 0
        for weeks that had no tweets.
    '''
    data_split_ls = [('train', [2013, 2014]), ('dev', [2015]), ('test', [2016, 2017])]
    week_numbers = range(1, 53)

    # Compare by value and ignore case: the default is 'National' while callers
    # pass 'national', and identity (`is`) comparison of strings is unreliable.
    filter_by_state = state.lower() != 'national'

    # Read every tweet file exactly once instead of once per year.
    tweets_by_trend = {}
    for tweet_file_name in tweet_file_names:
        trend_name = os.path.splitext(os.path.basename(tweet_file_name))[0]
        tweets_by_trend[trend_name] = pd.read_csv(tweet_file_name,index_col=['Unnamed: 0'])

    os.makedirs(tweet_output_directory,exist_ok=True)
    for split,years in data_split_ls:
        for year in years:
            government_path = os.path.join(government_data_dir,split,str(year),'national.json')
            with open(government_path) as government_file:
                government_data_df = pd.read_json(government_file)

            week_start = government_data_df.loc[0,'week_start']
            government_df_start_date = datetime.datetime.strptime(
                "{}_{}".format(year-1,week_start), '%Y_%d%b')

            tweet_trends_dict = {}
            for trend_name,tweets_df in tweets_by_trend.items():
                curr_tweet_df = tweets_df[tweets_df['year'].isin([year-1,year])]
                if filter_by_state:
                    curr_tweet_df = curr_tweet_df[curr_tweet_df['state']==state]

                # assign() returns a new frame, so no column is written onto a
                # filtered view of the cached tweets_df.
                curr_tweet_df = curr_tweet_df.assign(
                    week_number=curr_tweet_df['date'].apply(
                        lambda x: week_num_from_start_date(x,government_df_start_date)))
                curr_tweet_df = curr_tweet_df[(curr_tweet_df['week_number']>0) & (curr_tweet_df['week_number']<53)]

                weekly_counts = curr_tweet_df.groupby('week_number')['count'].sum()
                # groupby drops weeks without tweets; re-insert them as 0 so the
                # row position always corresponds to the week number.
                weekly_counts = weekly_counts.reindex(week_numbers,fill_value=0)
                tweet_trends_dict[trend_name] = weekly_counts.values

            tweet_trends_df = pd.DataFrame(tweet_trends_dict)
            tweet_trends_df.to_csv("{}/{}_{}.csv".format(tweet_output_directory,state,year))
                

In [None]:
# Build the weekly tweet-trend CSVs for the national level and the two states.
states = ['national','California','New York']
# Sort the listing so the column order of the generated CSVs does not depend on
# the arbitrary order returned by os.listdir; keep only real *.csv files.
curr_files_list = sorted(os.listdir(mined_trends_dir))
curr_files_list = ["{}/{}".format(mined_trends_dir,x) for x in curr_files_list if x.endswith('.csv')]
for state in states:
    generate_tweet_trends_weekly_csv(curr_files_list,tweet_trends_weekly_output_dir,state)

# Prediction

In [5]:
def return_data_split(week_means,state,state_code,split,years,tweet_threshold_start_pred,government_threshold_start_pred):
    '''Build regression features ``X`` and targets ``Y`` for one data split.

    For each year the government ``percentage`` column is read from
    ``<government_data_dir>/<split>/<year>/<state_code>.json`` and
    ``week_means`` is subtracted from it element-wise. The weekly tweet trends
    are read from ``<tweet_trends_weekly_output_dir>/<state>_<year>.csv``.

    The feature vector of a year is the first
    ``government_threshold_start_pred`` mean-centred percentages followed by
    all tweet-trend values of rows ``0 .. tweet_threshold_start_pred - 1``.
    One sample is emitted for every later week, with that week's mean-centred
    percentage as the target.

    NOTE(review): every sample of a given year carries the same feature
    vector; only the target differs.

    Args:
        week_means: per-week mean percentages, indexable and sliceable.
        state: state name used in the tweet-trend CSV file name.
        state_code: state code used in the government JSON file name.
        split: split directory name.
        years: iterable of year strings (used as path components).
        tweet_threshold_start_pred: number of leading tweet-trend rows used as
            features.
        government_threshold_start_pred: number of leading government weeks
            used as features; prediction targets start at this week index.

    Returns:
        ``(X, Y)``: a list of feature lists and the list of matching targets.
    '''
    X=[]
    Y=[]
    for year in years:
        government_path = os.path.join(government_data_dir,split,year,"{}.json".format(state_code))
        # Context managers close the file handles the bare open() calls leaked.
        with open(government_path) as government_file:
            government_data_df = pd.read_json(government_file)
        percent_vaccinated_week = government_data_df['percentage'].values
        percent_vaccinated_week = percent_vaccinated_week - week_means[0:len(percent_vaccinated_week)]

        tweet_path = os.path.join(tweet_trends_weekly_output_dir,"{}_{}.csv".format(state,year))
        with open(tweet_path) as tweet_file:
            tweet_weekly_df = pd.read_csv(tweet_file,index_col=['Unnamed: 0'])

        year_features = [percent_vaccinated_week[i] for i in range(government_threshold_start_pred)]
        for i in range(tweet_threshold_start_pred):
            year_features.extend(tweet_weekly_df.loc[i].values)

        for i in range(government_threshold_start_pred,len(percent_vaccinated_week)):
            # Append a copy so the rows of X do not alias one list object.
            X.append(list(year_features))
            Y.append(percent_vaccinated_week[i])

    return X,Y


In [11]:
# Number of leading weeks of tweet trends / government data used as features.
tweet_threshold_start_pred = 4 
govt_threshold_start_pred = 4

# Fit one linear model per region on the train split, then report the error on
# the final week of the train, dev and each test year.
# NOTE(review): each reported value is the squared error of the last sample
# only, multiplied by 100; it is printed with a '%' suffix.
for state_code,state  in [ ('national','national'),('CA', 'California'),('NY','New York')]:
    print("-"*20)
    print("Curr State {} {}".format(state,state_code))
    print("-"*20)

    split,years = train_download_dir

    # Per-week mean of the government percentages over the training years.
    yearly_percentages = []
    for year in years:
        government_path = os.path.join(government_data_dir,split,year,'{}.json'.format(state_code))
        # Close the file handle instead of leaking it.
        with open(government_path) as government_file:
            data_df = pd.read_json(government_file)
        yearly_percentages.append(data_df['percentage'].values)
    week_means = np.asarray(yearly_percentages).mean(axis = 0)

    X,Y = return_data_split(week_means,state,state_code,split,years,tweet_threshold_start_pred,govt_threshold_start_pred)
    model = linear_model.LinearRegression()
    model.fit(X,Y)

    y_pred = model.predict(X)
    error = metrics.mean_squared_error([y_pred[-1]],[Y[-1]])
    print("Error in train prediction {} %".format(error*100 ))

    # DEV Prediction

    split,years = dev_download_dir
    X,Y = return_data_split(week_means,state,state_code,split,years,tweet_threshold_start_pred,govt_threshold_start_pred)
    y_pred = model.predict(X)
    error = metrics.mean_squared_error([y_pred[-1]],[Y[-1]])
    print("Error in dev train prediction {} %".format(error*100 ))

    # TEST Prediction, reported separately for every test year

    split,years = test_download_dir
    for year in years:
        X,Y = return_data_split(week_means,state,state_code,split,[year],tweet_threshold_start_pred,govt_threshold_start_pred)
        y_pred = model.predict(X)
        error = metrics.mean_squared_error([y_pred[-1]],[Y[-1]])
        print("Error in test prediction for year {} = {} %".format(year, error*100))



--------------------
Curr State national national
--------------------
Error in train prediction 0.00020345575271005283 %
Error in dev train prediction 0.032536767143358895 %
Error in test prediction for year 2016 = 0.029141779701367027 %
Error in test prediction for year 2017 = 0.1550957022476945 %
--------------------
Curr State California CA
--------------------
Error in train prediction 0.0006584484300624913 %
Error in dev train prediction 0.12108231351020352 %
Error in test prediction for year 2016 = 0.1996042301816189 %
Error in test prediction for year 2017 = 0.14736626917396864 %
--------------------
Curr State New York NY
--------------------
Error in train prediction 3.521668300791518e-05 %
Error in dev train prediction 0.024871267475208365 %
Error in test prediction for year 2016 = 0.018327242709688187 %
Error in test prediction for year 2017 = 0.16696746958582148 %
