# 2020 Polling Data

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

### Date Conversion
Both types of data use this "date conversion" function.
It turns an m/d/y date into the Ymd format that our graphing function needs.

In [2]:
def new_date(s):
    '''
    Reformat a slash-separated "m/d/y" date string as "Ymd".

    The month and day get a leading zero when they are a single character
    long, and the year field is prefixed with "20", so "3/6/20" becomes
    "20200306".
    '''
    def pad(field):
        # only one-character fields get a leading zero; anything else is kept as is
        return field if len(field) != 1 else "0" + field

    parts = s.split('/')
    year = "20" + parts[2]
    month, day = pad(parts[0]), pad(parts[1])
    return "".join((year, month, day))


# Biden VS Sanders
First, we will export the Biden vs. Sanders primary polling data.

### Read Data

In [3]:
# load the raw primary-poll export and preview its first rows
primary_polls_csv = '2020_polling_data_import/president_primary_polls.csv'
df0 = pd.read_csv(primary_polls_csv)
df0.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,question_id,poll_id,cycle,state,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,...,nationwide_batch,created_at,notes,url,stage,party,answer,candidate_id,candidate_name,pct
0,119723,64740,2020,Washington,460,SurveyUSA,840.0,KING TV (Seattle),SurveyUSA,325.0,...,False,3/6/20 15:48,,https://www.king5.com/article/news/politics/el...,primary,DEM,Biden,13256,Joseph R. Biden Jr.,36.36
1,119723,64740,2020,Washington,460,SurveyUSA,840.0,KING TV (Seattle),SurveyUSA,325.0,...,False,3/6/20 15:48,,https://www.king5.com/article/news/politics/el...,primary,DEM,Sanders,13257,Bernard Sanders,35.35
2,119723,64740,2020,Washington,460,SurveyUSA,840.0,KING TV (Seattle),SurveyUSA,325.0,...,False,3/6/20 15:48,,https://www.king5.com/article/news/politics/el...,primary,DEM,Warren,13258,Elizabeth Warren,10.1
3,119720,64737,2020,,1189,Morning Consult,,,Morning Consult,218.0,...,False,3/6/20 13:00,,https://morningconsult.com/2020/03/06/democrat...,primary,DEM,Biden,13256,Joseph R. Biden Jr.,54.0
4,119720,64737,2020,,1189,Morning Consult,,,Morning Consult,218.0,...,False,3/6/20 13:00,,https://morningconsult.com/2020/03/06/democrat...,primary,DEM,Sanders,13257,Bernard Sanders,38.0


For each state, pair up the answers within each poll question, keep only the Biden vs. Sanders pairing, and average the results by poll end date.

In [5]:
# states to export, and the short code used in each state's export file name
states = ['Florida', 'Ohio', 'Iowa', 'Michigan', 'Pennsylvania', 'Wisconsin']
simpl = ['fl', 'oh', 'ia', 'mi', 'pa', 'wi']

# create the export folder up front so to_csv cannot fail on a missing directory
biden_sanders_dir = "2020_polling_data_export/biden_sanders_tsv/"
os.makedirs(biden_sanders_dir, exist_ok=True)

for state, s in zip(states, simpl):
    # filter by state and only keep the relevant columns
    df = df0.loc[df0['state'] == state, ['question_id', 'answer', 'pct', 'end_date']]

    # self join such that every answer of a question is paired with every
    # other answer of that same question (columns get _x / _y suffixes)
    df = df.merge(df, on=['question_id', 'end_date'])

    # only keep the Biden (x) vs Sanders (y) pairing; take a copy so that adding
    # a column below does not raise SettingWithCopyWarning on a filtered slice
    df = df[(df['answer_x'] == 'Biden') & (df['answer_y'] == 'Sanders')].copy()

    # add a datetime object so we can sort chronologically later
    df['last_date'] = pd.to_datetime(df['end_date'], format='%m/%d/%y')

    # groupby and take the mean if multiple polls ended on the same day
    df = df.groupby(['end_date', 'last_date'])[['pct_x', 'pct_y']].mean()

    # sort by ascending date, then turn the group keys back into columns
    df = df.sort_values(by=['last_date'], ascending=True)
    df = df.reset_index()

    # convert the m/d/y string into the Ymd format used in the export
    df['date'] = df['end_date'].apply(new_date)

    # keep only the export columns and give them candidate names
    df = df[['date', 'pct_x', 'pct_y']]
    df = df.rename(columns={
        "pct_x": "Biden",
        "pct_y": "Sanders",
    })

    # index by date so the date is the first column of the tsv
    df = df.set_index('date')

    # export as a tsv
    df.to_csv(biden_sanders_dir + s + "_biden_sanders.tsv", sep='\t')

# preview the frame of the last state processed in the loop
df.head()

Unnamed: 0_level_0,Biden,Sanders
date,Unnamed: 1_level_1,Unnamed: 2_level_1
20190317,23.5,38.9
20190418,24.0,20.0
20190529,28.3,13.1
20190704,18.0,19.0
20190811,20.0,24.0


# Trump VS Other Candidates

In [30]:
# folder with the general-election poll files, and the folder we export to
path = "2020_polling_data_import/dem_v_rep"
trump_vs_dir = "2020_polling_data_export/trump_vs/"

# create the export folder up front so to_csv cannot fail on a missing directory
os.makedirs(trump_vs_dir, exist_ok=True)

# for every file in the folder (sorted so the processing order is reproducible)
for f in sorted(os.listdir(path)):
    # skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and sub-folders,
    # which os.listdir also returns and which read_csv cannot parse
    if f.startswith('.') or not os.path.isfile(os.path.join(path, f)):
        continue

    df = pd.read_csv(os.path.join(path, f))

    # get the candidate name based on the file: a 'b' as the 4th character of
    # the file name means Biden, anything else is treated as Sanders
    # NOTE(review): presumably names look like "<st>_biden_v_trump.csv" - confirm
    if f[3] == 'b':
        cand = 'Biden'
        other = 'Sanders'
    else:
        cand = 'Sanders'
        other = 'Biden'

    # drop the state column
    df = df.drop(columns=['state'])

    # self join on pollster and date so each Trump row (x) is paired with the
    # opponent's row (y); copy so the column assignment below acts on a new frame
    df = df.merge(df, on=['pollster', 'date'])
    df = df[(df['answer_x'] == 'Trump') & (df['answer_y'] != 'Trump')].copy()

    # add a new column that's a date object so we can sort chronologically
    df['last_date'] = pd.to_datetime(df['date'], format='%m/%d/%y')

    # groupby and take the mean if multiple polls share the same date
    df = df.groupby(['date', 'last_date'])[['pct_x', 'pct_y']].mean()

    # sort by ascending date, then turn the group keys back into columns
    df = df.sort_values(by=['last_date'], ascending=True)
    df = df.reset_index()

    # convert the m/d/y string into the Ymd format used in the export
    df['date'] = df['date'].apply(new_date)

    # keep only the export columns and give them candidate names
    df = df[['date', 'pct_x', 'pct_y']]
    df = df.rename(columns={
        "pct_x": "Trump",
        "pct_y": cand,
    })

    # add an empty placeholder column for the candidate who is not in this file
    # (np.nan, because the np.NaN alias was removed in NumPy 2.0)
    df[other] = np.nan

    # rearrange so it goes: Sanders, Biden, Trump
    df = df[['date', 'Sanders', 'Biden', 'Trump']]

    # blank out the header of the placeholder column
    df = df.rename(columns={other: ""})

    # index by date so the date is the first column of the tsv
    df = df.set_index('date')

    # export under the lower-cased file name without its extension or "_v_"
    export_name = trump_vs_dir + os.path.splitext(f)[0].lower() + ".tsv"
    export_name = export_name.replace('_v_', '_')
    df.to_csv(export_name, sep='\t')

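I lost Iowa, so Iowa gets special treatment here: its files are read from a separate folder as tab-separated data that already has the `date`, `Trump`, and candidate columns, so only the placeholder column and the export step are needed.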

In [33]:
# folder with the Iowa files, and the folder we export to
path = "2020_polling_data_import/dem_v_rep_ia"
trump_vs_dir = "2020_polling_data_export/trump_vs/"

# create the export folder up front so to_csv cannot fail on a missing directory
os.makedirs(trump_vs_dir, exist_ok=True)

# for every file in the folder (sorted so the processing order is reproducible)
for f in sorted(os.listdir(path)):
    # skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and sub-folders,
    # which os.listdir also returns and which read_csv cannot parse
    if f.startswith('.') or not os.path.isfile(os.path.join(path, f)):
        continue

    # read in the data (these files are tab separated)
    df = pd.read_csv(os.path.join(path, f), sep='\t')

    # find out who the candidate is: a 'b' as the 4th character of the file
    # name means Biden, anything else is treated as Sanders
    if f[3] == 'b':
        cand = 'Biden'
        other = 'Sanders'
    else:
        cand = 'Sanders'
        other = 'Biden'

    # add an empty placeholder column for the candidate who is not in this file
    # (np.nan, because the np.NaN alias was removed in NumPy 2.0)
    df[other] = np.nan

    # rearrange so it goes: Sanders, Biden, Trump
    df = df[['date', 'Sanders', 'Biden', 'Trump']]

    # blank out the header of the placeholder column
    df = df.rename(columns={other: ""})

    # index by date so the date is the first column of the tsv
    df = df.set_index('date')

    # export under the lower-cased file name without its extension or "_v_"
    export_name = trump_vs_dir + os.path.splitext(f)[0].lower() + ".tsv"
    export_name = export_name.replace('_v_', '_')
    df.to_csv(export_name, sep='\t')