# 2020 Dems Primary Endorsement

In [1]:
# Import Libraries

import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline

import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import plotly.figure_factory as ff
from plotly import subplots
from plotly.subplots import make_subplots
init_notebook_mode(connected=True)

from datetime import date, datetime, timedelta
import time, re, os

# Print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

## EDA & Data Cleaning

In [2]:
def resumetable(data):
    """Print the shape of ``data`` and return a per-column overview.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns ``Name`` (column
        label), ``dtypes`` (column dtype), ``Missing`` (count of null
        values) and ``Uniques`` (count of distinct non-null values).
    """
    print(f"Dataset Shape: {data.shape}")

    overview = (
        data.dtypes
        .to_frame(name='dtypes')
        .reset_index()                          # column labels become the 'index' column
        .rename(columns={'index': 'Name'})
        .loc[:, ['Name', 'dtypes']]
        .assign(
            Missing=data.isnull().sum().values,
            Uniques=data.nunique().values,
        )
    )
    return overview

# Load the raw endorsements table from the working directory.
data = pd.read_csv(filepath_or_buffer="endorsements-2020.csv")

In [3]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,date,position,city,state,endorser,endorsee,endorser party,source,order,category,body,district,points
0,2017-07-28,representative,,MD,David Trone,John Delaney,D,https://twitter.com/davidjtrone/status/8909859...,,Representatives,,6.0,3
1,2019-01-02,governor,,NY,Andrew Cuomo,Joe Biden,D,https://www.cnn.com/2019/01/02/politics/cuomo-...,,Governors,,,8
2,2019-01-03,senator,,CA,Dianne Feinstein,Joe Biden,D,https://www.politico.com/story/2019/01/03/dian...,,Senators,,,6
3,2019-01-08,senator,,DE,Thomas R. Carper,Joe Biden,D,https://fox61.com/2019/01/08/will-he-or-wont-h...,,Senators,,,6
4,2019-01-12,mayor,San Antonio,TX,Ron Nirenberg,Julian Castro,,https://www.sacurrent.com/the-daily/archives/2...,6.0,Mayors,,,3


In [4]:
# Count the null values in each column.
data.isna().sum()

date              757
position            0
city              959
state               0
endorser            0
endorsee          757
endorser party      7
source            757
order             850
category            0
body              901
district          775
points              0
dtype: int64

In [5]:
def missing_percent_table(frame):
    """Summarise how much of each column of ``frame`` is missing.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``frame`` with ``column_name`` and
        ``percent_missing`` (rounded to 2 decimals), sorted from the most
        to the least incomplete column.
    """
    percent_missing = np.round(frame.isnull().sum() * 100 / len(frame), 2)
    table = pd.DataFrame({'column_name': frame.columns,
                          'percent_missing': percent_missing})
    return table.sort_values('percent_missing', ascending=False)


def plot_missing_percent(missing_table, title='Percentage Missing by Column'):
    """Build a bar chart from a table produced by ``missing_percent_table``.

    Parameters
    ----------
    missing_table : pandas.DataFrame
        Must contain the columns ``column_name`` and ``percent_missing``.
    title : str, optional
        Chart title.

    Returns
    -------
    plotly.graph_objs.Figure
        The figure; the caller decides when to show it.
    """
    fig = go.Figure()
    fig.add_trace(
        go.Bar(x=missing_table['column_name'],
               y=missing_table['percent_missing'],
               opacity=0.9,
               # Print each percentage inside its bar.
               text=missing_table['percent_missing'],
               textposition='inside',
               marker={'color': 'indianred'}))
    fig.update_layout(
        title={'text': title,
               'y': 0.95, 'x': 0.5,
               'xanchor': 'center', 'yanchor': 'top'},
        showlegend=False,
        xaxis_title_text='Columns',
        yaxis_title_text='Percentage',
        bargap=0.1,
    )
    return fig


# Chart the share of missing values per column of the raw table.
missing_value_data = missing_percent_table(data)
fig = plot_missing_percent(missing_value_data)
fig.show()

In [6]:
# Remove five sparsely populated columns.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run no longer raises KeyError
# because the columns are already gone.
SPARSE_COLUMNS = ['city', 'order', 'body', 'district', 'date']
data = data.drop(columns=SPARSE_COLUMNS, errors='ignore')

In [7]:
# Recompute the share of missing values per column for the current `data`.
percent_missing = np.round(data.isnull().sum() * 100 / len(data), 2)
missing_value_data = pd.DataFrame(
    {'column_name': data.columns, 'percent_missing': percent_missing}
)
missing_value_data = missing_value_data.sort_values('percent_missing', ascending=False)

# One bar per column, labelled with its percentage.
missing_bar = go.Bar(
    x=missing_value_data['column_name'],
    y=missing_value_data['percent_missing'],
    text=missing_value_data['percent_missing'],
    textposition='inside',
    opacity=0.9,
    marker=dict(color='indianred'),
)

fig = go.Figure(data=[missing_bar])
fig.update_layout(
    title=dict(
        text='Percentage Missing by Column',
        x=0.5, y=0.95,
        xanchor='center', yanchor='top',
    ),
    xaxis_title_text='Columns',
    yaxis_title_text='Percentage',
    showlegend=False,
    bargap=0.1,
)

fig.show()

In [8]:
# Replace the free-form source URL with a coarse outlet label.
keys = ['twitter', 'politico', 'youtube', '4president', 'cnn', 'apnews']

# Take the URL column out of the frame; rows without a URL count as 'other'.
raw_source = data.pop('source').fillna('other')

# Start every row at 'other', then overwrite with each outlet whose name
# appears in the URL. Keys are applied in order, so a later key wins when
# several match.
source_label = pd.Series('other', index=data.index)
for k in keys:
    source_label = np.where(raw_source.str.contains(k), k, source_label)
data['source'] = source_label

# Give missing endorsee / party values an explicit placeholder string.
for column, placeholder in {'endorsee': 'no_endorsee', 'endorser party': 'None'}.items():
    data[column] = data[column].fillna(placeholder)

resumetable(data)

Dataset Shape: (1006, 8)


Unnamed: 0,Name,dtypes,Missing,Uniques
0,position,object,0,51
1,state,object,0,57
2,endorser,object,0,1006
3,endorsee,object,0,17
4,endorser party,object,0,5
5,category,object,0,12
6,points,int64,0,7
7,source,object,0,7


In [16]:
# Save cleaned file for data analysis.
# Written relative to the working directory so the notebook runs on any
# machine, not only one with a specific user's home folder.
data.to_csv('2020demsCleaned.csv', index=False)

## Data Analysis

In [13]:
# Keep only the rows that name an endorsee. The explicit .copy() makes
# `endorsee_data` an independent frame, so the column assignment below does
# not write into a slice of `data` (which triggers SettingWithCopyWarning).
endorsee_data = data[data['endorsee'] != 'no_endorsee'].copy()

# Reduce each endorsee name to its last space-separated token.
endorsee_data['endorsee'] = endorsee_data['endorsee'].str.split(' ').str[-1]
endorsee_data.head(15)

Unnamed: 0,position,state,endorser,endorsee,endorser party,category,points,source
0,representative,MD,David Trone,Delaney,D,Representatives,3,twitter
1,governor,NY,Andrew Cuomo,Biden,D,Governors,8,cnn
2,senator,CA,Dianne Feinstein,Biden,D,Senators,6,politico
3,senator,DE,Thomas R. Carper,Biden,D,Senators,6,other
4,mayor,TX,Ron Nirenberg,Castro,,Mayors,3,other
5,DNC member,CA,Laphonza Butler,Harris,D,DNC members,1,politico
6,DNC member,DC,James J. Zogby,Sanders,D,DNC members,1,twitter
7,lieutenant governor,CA,Eleni Kounalakis,Harris,D,Statewide officeholders,2,twitter
8,mayor,CA,Libby Schaaf,Harris,D,Mayors,3,other
9,representative,CA,Ted Lieu,Harris,D,Representatives,3,other


## Data Analysis

The data analysis was completed using Table (file name: `2020DemsPrimary`).