In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

# Widen pandas' display limits so the 38-column poll frames are shown in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

**We would like to see analysis of:**

 

- President's Approval
- General Election
- **US Senate**
- **US House**
- **Governor**
- Generic Ballot (optional)
 

Please provide the analysis in a format of your choosing; we are not looking for a specific format, only for the data analysis itself. If you have any questions about the data set, please include them with the analysis.

Files to be used:
<br>
> `president_approval_polls.csv` <br>
> `president_polls.csv` <br>
> `generic_ballot_polls.csv` <br>
> `president_primary_polls.csv` <br>
> `senate_polls.csv` <br>
> `house_polls.csv` <br>
> `governor_polls.csv` <br>
> `president_primary_polls.csv` <br>
> `presidential_poll_averages_2020.csv` <br>


In [2]:
#code
def floatify(x):
    """Convert a percentage-like string such as ``"41.5%"`` to a float.

    Every ``%`` character is removed from ``x`` and the remainder is parsed
    with ``float``, so both whole (``"41%"``) and fractional (``"41.5%"``)
    values are converted.

    Parameters
    ----------
    x : object
        Value to convert; normally a string.

    Returns
    -------
    float or object
        The parsed number, or ``x`` itself, unchanged, when it cannot be
        converted (no ``replace`` method, or the remaining text is not
        something ``float`` accepts).
    """
    try:
        # Parse with float() directly: going through int() first rejected
        # decimal strings, which were then silently returned as text.
        return float(x.replace("%", ''))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: x has no .replace (numbers, None, ...).
        # TypeError / ValueError: the result of .replace is not convertible.
        return x
    
def eda(dataframe):
    """Print a quick exploratory summary of ``dataframe`` to stdout.

    Sections are printed in this order: null count per column, number of
    duplicated rows, column dtypes, shape, and ``describe(include='all')``.
    Afterwards each item yielded by iterating ``dataframe`` (the column
    labels, for a DataFrame) is printed on its own line. Returns ``None``.
    """
    # Each section is computed right before it is printed, so earlier
    # sections still appear if a later computation raises.
    sections = (
        ("Missing Values \n \n", lambda: dataframe.isnull().sum()),
        ("Duplicate Rows \n", lambda: dataframe.duplicated().sum()),
        ("Dataframe Types \n \n", lambda: dataframe.dtypes),
        ("Dataframe Shape \n", lambda: dataframe.shape),
        ("Dataframe Describe \n \n", lambda: dataframe.describe(include='all')),
    )
    for heading, compute in sections:
        print(heading, compute(), "\n")

    for column_label in dataframe:
        print(column_label)

In [3]:
# Read the three race-level poll files in a fixed order: Senate, House, Governor.
df_list = [
    pd.read_csv(csv_path)
    for csv_path in (
        'polls/senate_polls.csv',
        'polls/house_polls.csv',
        'polls/governor_polls.csv',
    )
]

# Named handles refer to the same objects held in df_list.
df_senate, df_house, df_gov = df_list

In [4]:
# Preview the first five rows of each poll frame.
for polls in df_list:
    preview = polls.head(5)
    display(preview)

Unnamed: 0,question_id,poll_id,cycle,state,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,fte_grade,sample_size,population,population_full,methodology,office_type,seat_number,seat_name,start_date,end_date,election_date,sponsor_candidate,internal,partisan,tracking,nationwide_batch,ranked_choice_reallocated,created_at,notes,url,stage,race_id,answer,candidate_id,candidate_name,candidate_party,pct
0,129448,69487,2020,Kentucky,396,Quinnipiac University,,,Quinnipiac University,267,Quinnipiac University,B+,1164.0,lv,lv,Live Phone,U.S. Senate,0,Class II,9/10/20,9/14/20,11/3/20,,False,,,False,False,9/16/20 14:21,,https://poll.qu.edu/kentucky/release-detail?Re...,general,6276,McGrath,13470,Amy McGrath,DEM,41.0
1,129448,69487,2020,Kentucky,396,Quinnipiac University,,,Quinnipiac University,267,Quinnipiac University,B+,1164.0,lv,lv,Live Phone,U.S. Senate,0,Class II,9/10/20,9/14/20,11/3/20,,False,,,False,False,9/16/20 14:21,,https://poll.qu.edu/kentucky/release-detail?Re...,general,6276,McConnell,13471,Mitch McConnell,REP,53.0
2,129449,69488,2020,Maine,396,Quinnipiac University,,,Quinnipiac University,267,Quinnipiac University,B+,1183.0,lv,lv,Live Phone,U.S. Senate,2,Class II,9/10/20,9/14/20,11/3/20,,False,,,False,False,9/16/20 14:21,,https://poll.qu.edu/kentucky/release-detail?Re...,general,6279,Gideon,13455,Sara Gideon,DEM,54.0
3,129449,69488,2020,Maine,396,Quinnipiac University,,,Quinnipiac University,267,Quinnipiac University,B+,1183.0,lv,lv,Live Phone,U.S. Senate,2,Class II,9/10/20,9/14/20,11/3/20,,False,,,False,False,9/16/20 14:21,,https://poll.qu.edu/kentucky/release-detail?Re...,general,6279,Collins,13456,Susan M. Collins,REP,42.0
4,129450,69491,2020,South Carolina,396,Quinnipiac University,,,Quinnipiac University,267,Quinnipiac University,B+,969.0,lv,lv,Live Phone,U.S. Senate,0,Class II,9/10/20,9/14/20,11/3/20,,False,,,False,False,9/16/20 14:21,,https://poll.qu.edu/kentucky/release-detail?Re...,general,6292,Harrison,13457,Jaime Harrison,DEM,48.0


Unnamed: 0,question_id,poll_id,cycle,state,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,fte_grade,sample_size,population,population_full,methodology,office_type,seat_number,seat_name,start_date,end_date,election_date,sponsor_candidate,internal,partisan,tracking,nationwide_batch,ranked_choice_reallocated,created_at,notes,url,stage,race_id,answer,candidate_id,candidate_name,candidate_party,pct
0,129392,69470,2020,California,460,SurveyUSA,443804,KGTV-TV (California) | San Diego Union-Tribune,SurveyUSA,325.0,SurveyUSA,A,517.0,lv,lv,IVR/Online,U.S. House,49,District 49,9/11/20,9/14/20,11/3/20,,False,,,False,False,9/15/20 20:46,,http://www.surveyusa.com/client/PollReport.asp...,general,7898,Levin,14899,Mike Levin,DEM,49.0
1,129392,69470,2020,California,460,SurveyUSA,443804,KGTV-TV (California) | San Diego Union-Tribune,SurveyUSA,325.0,SurveyUSA,A,517.0,lv,lv,IVR/Online,U.S. House,49,District 49,9/11/20,9/14/20,11/3/20,,False,,,False,False,9/15/20 20:46,,http://www.surveyusa.com/client/PollReport.asp...,general,7898,Maryott,14900,Brian Maryott,REP,37.0
2,129330,69438,2020,Utah,1554,RMG Research,400,Deseret News,RMG Research,555.0,RMG Research,B/C,800.0,lv,lv,IVR/Online/Text,U.S. House,4,District 4,9/7/20,9/12/20,11/3/20,,False,,,False,False,9/14/20 13:01,,https://www.deseret.com/utah/2020/9/14/2143622...,general,8235,McAdams,13565,Ben McAdams,DEM,45.0
3,129330,69438,2020,Utah,1554,RMG Research,400,Deseret News,RMG Research,555.0,RMG Research,B/C,800.0,lv,lv,IVR/Online/Text,U.S. House,4,District 4,9/7/20,9/12/20,11/3/20,,False,,,False,False,9/14/20 13:01,,https://www.deseret.com/utah/2020/9/14/2143622...,general,8235,Owens,14704,Burgess Owens,REP,41.0
4,129330,69438,2020,Utah,1554,RMG Research,400,Deseret News,RMG Research,555.0,RMG Research,B/C,800.0,lv,lv,IVR/Online/Text,U.S. House,4,District 4,9/7/20,9/12/20,11/3/20,,False,,,False,False,9/14/20 13:01,,https://www.deseret.com/utah/2020/9/14/2143622...,general,8235,Molnar,14705,John Molnar,LIB,2.0


Unnamed: 0,question_id,poll_id,cycle,state,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,fte_grade,sample_size,population,population_full,methodology,office_type,seat_number,seat_name,start_date,end_date,election_date,sponsor_candidate,internal,partisan,tracking,nationwide_batch,ranked_choice_reallocated,created_at,notes,url,stage,race_id,answer,candidate_id,candidate_name,candidate_party,pct
0,129338,69444,2020,North Carolina,460,SurveyUSA,241,WRAL News,SurveyUSA,325.0,SurveyUSA,A,596,lv,lv,Online,Governor,0,,9/10/20,9/13/20,11/3/20,,False,,,False,False,9/14/20 18:06,,http://www.surveyusa.com/client/PollReport.asp...,general,7824,Cooper,13404,Roy A. Cooper,DEM,49.0
1,129338,69444,2020,North Carolina,460,SurveyUSA,241,WRAL News,SurveyUSA,325.0,SurveyUSA,A,596,lv,lv,Online,Governor,0,,9/10/20,9/13/20,11/3/20,,False,,,False,False,9/14/20 18:06,,http://www.surveyusa.com/client/PollReport.asp...,general,7824,Forest,13406,Dan Forest,REP,42.0
2,129371,69462,2020,North Carolina,1361,SSRS,143,CNN,CNN/SSRS,297.0,SSRS,B/C,893,rv,rv,Live Phone,Governor,0,,9/9/20,9/13/20,11/3/20,,False,,,False,False,9/15/20 12:12,,https://cdn.cnn.com/cnn/2020/images/09/15/rel1...,general,7824,Cooper,13404,Roy A. Cooper,DEM,53.0
3,129371,69462,2020,North Carolina,1361,SSRS,143,CNN,CNN/SSRS,297.0,SSRS,B/C,893,rv,rv,Live Phone,Governor,0,,9/9/20,9/13/20,11/3/20,,False,,,False,False,9/15/20 12:12,,https://cdn.cnn.com/cnn/2020/images/09/15/rel1...,general,7824,Forest,13406,Dan Forest,REP,42.0
4,129371,69462,2020,North Carolina,1361,SSRS,143,CNN,CNN/SSRS,297.0,SSRS,B/C,893,rv,rv,Live Phone,Governor,0,,9/9/20,9/13/20,11/3/20,,False,,,False,False,9/15/20 12:12,,https://cdn.cnn.com/cnn/2020/images/09/15/rel1...,general,7824,DiFiore,14555,Steven DiFiore,LIB,2.0


In [5]:
# (rows, columns) of each poll frame.
for polls in df_list:
    dimensions = polls.shape
    display(dimensions)

(2967, 38)

(2238, 38)

(1949, 38)

In [6]:
# Senate polls (the first entry of df_list)
eda(df_senate)

Missing Values 
 
 question_id                     0
poll_id                         0
cycle                           0
state                           0
pollster_id                     0
pollster                        0
sponsor_ids                  1294
sponsors                     1294
display_name                    0
pollster_rating_id              0
pollster_rating_name            0
fte_grade                     272
sample_size                     2
population                      4
population_full                 4
methodology                   367
office_type                     0
seat_number                     0
seat_name                       0
start_date                      0
end_date                        0
election_date                   0
sponsor_candidate            2787
internal                        0
partisan                     2406
tracking                     2873
nationwide_batch                0
ranked_choice_reallocated       0
created_at                   

In [7]:
# House polls (the second entry of df_list)
eda(df_house)

Missing Values 
 
 question_id                     0
poll_id                         0
cycle                           0
state                           0
pollster_id                     0
pollster                        0
sponsor_ids                  1099
sponsors                     1099
display_name                    0
pollster_rating_id              5
pollster_rating_name            5
fte_grade                     165
sample_size                     2
population                      0
population_full                 0
methodology                   591
office_type                     0
seat_number                     0
seat_name                       0
start_date                      0
end_date                        0
election_date                   0
sponsor_candidate            1726
internal                        0
partisan                     1341
tracking                     2238
nationwide_batch                0
ranked_choice_reallocated       0
created_at                   

population
population_full
methodology
office_type
seat_number
seat_name
start_date
end_date
election_date
sponsor_candidate
internal
partisan
tracking
nationwide_batch
ranked_choice_reallocated
created_at
notes
url
stage
race_id
answer
candidate_id
candidate_name
candidate_party
pct


In [8]:
# Governor polls (the third entry of df_list)
eda(df_gov)

Missing Values 
 
 question_id                     0
poll_id                         0
cycle                           0
state                           0
pollster_id                     0
pollster                        0
sponsor_ids                   965
sponsors                      965
display_name                    0
pollster_rating_id              4
pollster_rating_name            4
fte_grade                     158
sample_size                     0
population                      2
population_full                 2
methodology                   286
office_type                     0
seat_number                     0
seat_name                    1949
start_date                      0
end_date                        0
election_date                   0
sponsor_candidate            1827
internal                        0
partisan                     1684
tracking                     1907
nationwide_batch                0
ranked_choice_reallocated       0
created_at                   

pollster_rating_id
pollster_rating_name
fte_grade
sample_size
population
population_full
methodology
office_type
seat_number
seat_name
start_date
end_date
election_date
sponsor_candidate
internal
partisan
tracking
nationwide_batch
ranked_choice_reallocated
created_at
notes
url
stage
race_id
answer
candidate_id
candidate_name
candidate_party
pct


In [9]:
# Convert the date-like text columns to datetimes, in place, on every frame,
# then show the resulting dtypes.
for frame in df_list:
    for date_col in ('start_date', 'end_date', 'election_date', 'created_at'):
        frame[date_col] = pd.to_datetime(frame[date_col])
    display(frame.dtypes)

question_id                           int64
poll_id                               int64
cycle                                 int64
state                                object
pollster_id                           int64
pollster                             object
sponsor_ids                          object
sponsors                             object
display_name                         object
pollster_rating_id                    int64
pollster_rating_name                 object
fte_grade                            object
sample_size                         float64
population                           object
population_full                      object
methodology                          object
office_type                          object
seat_number                           int64
seat_name                            object
start_date                   datetime64[ns]
end_date                     datetime64[ns]
election_date                datetime64[ns]
sponsor_candidate               

question_id                           int64
poll_id                               int64
cycle                                 int64
state                                object
pollster_id                           int64
pollster                             object
sponsor_ids                          object
sponsors                             object
display_name                         object
pollster_rating_id                  float64
pollster_rating_name                 object
fte_grade                            object
sample_size                         float64
population                           object
population_full                      object
methodology                          object
office_type                          object
seat_number                           int64
seat_name                            object
start_date                   datetime64[ns]
end_date                     datetime64[ns]
election_date                datetime64[ns]
sponsor_candidate               

question_id                           int64
poll_id                               int64
cycle                                 int64
state                                object
pollster_id                           int64
pollster                             object
sponsor_ids                          object
sponsors                             object
display_name                         object
pollster_rating_id                  float64
pollster_rating_name                 object
fte_grade                            object
sample_size                           int64
population                           object
population_full                      object
methodology                          object
office_type                          object
seat_number                           int64
seat_name                           float64
start_date                   datetime64[ns]
end_date                     datetime64[ns]
election_date                datetime64[ns]
sponsor_candidate               

In [10]:
# Mean polled percentage for each (state, party) pair in the governor polls.
df_gov.groupby(by=['state', 'candidate_party'])['pct'].agg('mean')

state           candidate_party
Alabama         DEM                33.000000
                REP                54.000000
Alaska          DEM                35.962500
                IND                29.526667
                LIB                 3.300000
                REP                42.847619
Arizona         DEM                39.415000
                GRE                 3.625000
                LIB                 1.600000
                REP                50.941923
Arkansas        DEM                27.000000
                LIB                 5.500000
                REP                60.500000
California      DEM                51.026667
                REP                36.386667
Colorado        DEM                47.000000
                IND                 0.500000
                LIB                 3.000000
                REP                39.250000
Connecticut     DEM                44.791667
                IND                 8.328571
                LIB    

In [11]:
# Mean polled percentage per (state, party) pair, for each poll frame.
for polls in df_list:
    party_means = polls.groupby(['state', 'candidate_party'])['pct'].mean()
    display(party_means)

state           candidate_party
Alabama         DEM                40.511765
                IND                 2.300000
                REP                46.305882
Alaska          DEM                39.000000
                IND                40.000000
                REP                43.200000
Arizona         DEM                48.375033
                GRE                 3.604167
                LIB                 2.300000
                REP                42.083974
California      DEM                34.720455
Colorado        DEM                49.855000
                REP                40.235833
Connecticut     DEM                56.516667
                REP                36.850000
Delaware        DEM                56.000000
                GRE                 2.500000
                LIB                 4.000000
                REP                28.333333
Florida         DEM                45.599192
                REP                45.107778
Georgia         DEM    

state           candidate_party
Alaska          DEM                43.744444
                REP                46.411111
Arizona         DEM                42.250500
                GRE                 2.000000
                REP                45.637000
Arkansas        DEM                35.000000
                LIB                 3.400000
                REP                51.357143
California      DEM                44.725941
                REP                45.757723
Colorado        DEM                44.654375
                IND                 3.000000
                LIB                 5.666667
                REP                42.086250
                UNI                 1.000000
Delaware        DEM                51.000000
                REP                32.750000
Florida         DEM                42.590526
                NPA                 5.000000
                REP                45.725439
Georgia         DEM                44.076000
                REP    

state           candidate_party
Alabama         DEM                33.000000
                REP                54.000000
Alaska          DEM                35.962500
                IND                29.526667
                LIB                 3.300000
                REP                42.847619
Arizona         DEM                39.415000
                GRE                 3.625000
                LIB                 1.600000
                REP                50.941923
Arkansas        DEM                27.000000
                LIB                 5.500000
                REP                60.500000
California      DEM                51.026667
                REP                36.386667
Colorado        DEM                47.000000
                IND                 0.500000
                LIB                 3.000000
                REP                39.250000
Connecticut     DEM                44.791667
                IND                 8.328571
                LIB    

In [12]:
# Number of rows per state in each frame; dropna=False keeps a NaN bucket
# so rows with a missing state are counted too.
for frame in df_list:
    state_counts = frame['state'].value_counts(dropna=False)
    display(state_counts)

Arizona           315
Michigan          199
Florida           198
Georgia           189
Missouri          188
Texas             173
North Carolina    152
Tennessee         108
Mississippi       101
Nevada            100
Indiana            97
Minnesota          96
West Virginia      93
Montana            78
Ohio               72
Maine              67
Pennsylvania       66
New Jersey         63
Wisconsin          56
New Mexico         48
Massachusetts      44
California         44
New Hampshire      44
North Dakota       36
Iowa               36
Alabama            36
Kentucky           35
South Carolina     34
Virginia           30
Colorado           24
Kansas             24
Utah               20
New York           18
Connecticut        12
Rhode Island       10
Delaware           10
Alaska             10
Oklahoma            9
Vermont             8
Maryland            7
Nebraska            6
Washington          4
Wyoming             3
Idaho               2
Illinois            2
Name: stat

California        202
Pennsylvania      147
New York          137
Florida           117
Iowa              117
Texas             116
New Jersey        108
Virginia          101
Ohio               93
Utah               92
Michigan           86
North Carolina     83
Illinois           65
Minnesota          61
New Mexico         56
Kansas             56
New Hampshire      48
West Virginia      47
Arizona            41
Maine              38
Colorado           38
Montana            37
Washington         34
Nevada             33
Georgia            30
Kentucky           29
Oklahoma           27
North Dakota       22
Wisconsin          21
Nebraska           20
Arkansas           19
Alaska             18
Indiana            16
South Dakota       14
Missouri           10
Puerto Rico         9
Mississippi         8
Delaware            8
Louisiana           7
South Carolina      6
Idaho               6
Vermont             6
Oregon              5
Wyoming             4
Name: state, dtype: int64

Florida           143
North Carolina    124
Arizona           115
Michigan           98
Ohio               90
Nevada             82
Louisiana          70
Texas              67
Wisconsin          66
Illinois           62
Georgia            62
California         60
Tennessee          58
New Hampshire      53
Alaska             53
Pennsylvania       48
Missouri           46
Kansas             45
New York           45
Mississippi        41
Minnesota          40
Rhode Island       37
Puerto Rico        36
Maryland           34
Oregon             33
Connecticut        33
New Mexico         32
Massachusetts      26
Kentucky           26
Colorado           25
Oklahoma           24
Iowa               20
Maine              19
Vermont            19
Washington         18
South Dakota       18
Idaho              11
Arkansas           10
South Carolina     10
Montana            10
Indiana            10
Alabama             8
Utah                8
Hawaii              6
Wyoming             4
West Virgi

In [13]:
# Distinct states in the Senate polls (NaN is not counted).
df_senate['state'].nunique(dropna=True)

45

In [14]:
# Distinct states in the governor polls (NaN is not counted).
df_gov['state'].nunique(dropna=True)

47

In [15]:
# Distinct states in the House polls (NaN is not counted).
df_house['state'].nunique(dropna=True)

44

In [16]:
# Rows per candidate party in the Senate polls, most frequent first;
# rows with a missing party are excluded.
df_senate['candidate_party'].value_counts(dropna=True)

DEM    1353
REP    1302
LIB     147
IND      75
GRE      56
CON      16
IAP       4
UST       4
REF       4
NLP       4
W         2
Name: candidate_party, dtype: int64

**Idea Notes** <br>

- Create time series by state to see if there's a change by political party / candidate
- Create overall charts showing the `mean`, perhaps `max`, `min`, `mode`