In [1]:
# Data management code, scroll to next cell for analysis
import pandas as pd
from pandas import Series,DataFrame

"""
IMPORTANT NOTE: If parsing Excel files fails, conda install xlrd AND openpyxl
"""

# BLS dataset for labor participation
# Column names applied to the parsed sheet (passed as `names=` below).
header = ['FIPS Code', 'State', 'Year', 'Civilian non-institutional population',
          'Civilian labor force Total', 'CLF Percent of population', 'CLF Employment Total',
          'CLF Employment Percent of population', 'CLF Unemployment Total',
          'CLF Unemployment Rate']
# Open the workbook in a `with` block so its file handle is released once the
# sheet is parsed; `bls_file` stays bound afterwards, as before.
with pd.ExcelFile('staadata.xlsx') as bls_file:
    # Parse 'Sheet1', skipping the first 7 rows and applying our own column names
    bls = bls_file.parse('Sheet1', skiprows=7, names=header)

# Keep the 2012 rows and the three columns used downstream. `.copy()` makes
# bls12 an independent frame, so the assignments below do not write into a
# slice of `bls` (which triggers SettingWithCopyWarning).
bls12 = bls.loc[
    bls.Year == 2012,
    ['State', 'CLF Employment Percent of population', 'CLF Unemployment Rate'],
].copy()
bls12['CLF Employment Percent of population'] = pd.to_numeric(bls12['CLF Employment Percent of population'])
bls12['CLF Unemployment Rate'] = pd.to_numeric(bls12['CLF Unemployment Rate'])
# NOTE: a bare `bls12.set_index('State')` used to follow here. set_index returns
# a new frame and the result was discarded, so it had no effect and was removed.
# 'State' has to stay a regular column: the merge later in this cell joins on it.

# 2012 Election data to determine if a state is "red" or "blue"
# The `with` block closes the workbook handle after the sheet is parsed;
# `pres_file` stays bound afterwards, as before.
with pd.ExcelFile('2012pres.xls') as pres_file:
    pres = pres_file.parse('2012 Pres General Results')
# Drop columns that are not used below. `1` is an integer column label,
# presumably present in the sheet as-is -- TODO confirm against the workbook.
pres = pres.drop([1, 'FEC ID', 'STATE ABBREVIATION', 'GENERAL ELECTION DATE'], axis=1)

# Rows flagged as the winner, reduced to state, candidate and vote share.
# Non-inplace rename returns a fresh frame instead of mutating a slice of `pres`.
winner = (
    pres.loc[pres['WINNER INDICATOR'] == 'W', ['STATE', 'LAST NAME', 'GENERAL %']]
    .rename(columns={'LAST NAME': 'Winner', 'GENERAL %': 'PercentOfVote'})
)

# Per-candidate frames indexed by state so the subtraction below aligns on 'STATE'
Dem = pres[pres['LAST NAME'] == 'Obama'].set_index('STATE')
Rep = pres[pres['LAST NAME'] == 'Romney'].set_index('STATE')
# Obama's vote share minus Romney's, with 'STATE' moved back into a column
Dem_Rep = (
    pd.DataFrame(Dem['GENERAL %'] - Rep['GENERAL %'])
    .rename(columns={'GENERAL %': 'Percent Obama Over Romney'})
    .reset_index(level=0)
)

# Load income per capita (used as a predictor in the models below)
income = pd.read_csv('per_capita_income.csv', low_memory=False)
# Keep the 2nd and 3rd columns by position. The previous `income[[1, 2]]`
# looked up the integer *labels* 1 and 2, which only worked through a positional
# fallback that current pandas no longer performs (it raises KeyError).
# Presumably these are 'State' and 'Per capita\nincome', the names the merge
# and rename later in this cell rely on -- TODO confirm against the CSV.
income = income.iloc[:, [1, 2]]
# NOTE: a bare `income.set_index('State')` used to follow here. Its result was
# discarded, so it had no effect; 'State' must stay a column for the merge.

# Merge data sets
# Vote margin + income (state names live in 'STATE' and 'State' respectively),
# then the winning candidate per state.
data = pd.merge(Dem_Rep, income, left_on='STATE', right_on='State')
data = pd.merge(data, winner, on='STATE')
# Add the 2012 BLS figures. The join key is spelled out rather than left to
# pandas' "all shared column names" default, so an accidental extra shared
# column cannot silently change the join.
data = (
    pd.merge(data, bls12, on='State')
    .drop(['STATE'], axis=1)
    .set_index('State')
    .rename(columns={'CLF Employment Percent of population': 'Labor Participation',
                     'CLF Unemployment Rate': 'Unemployment2012',
                     'Per capita\nincome': 'IncomePerCapita'})
)

data['Percent Obama Over Romney'] = pd.to_numeric(data['Percent Obama Over Romney'])
# Income arrives as text; strip '$' and ',' before converting to a number
data['IncomePerCapita'] = pd.to_numeric(
    data['IncomePerCapita'].apply(lambda x: x.replace('$', '').replace(',', ''))
)
data['Labor Participation'] = pd.to_numeric(data['Labor Participation'])
data['Unemployment2012'] = pd.to_numeric(data['Unemployment2012'])

# Move 'State' from the index back to the first column, then keep only the
# first row for each state. `.copy()` gives an independent frame so the column
# assignments below do not warn about writing to a slice.
data = data.reset_index(level=['State']).drop_duplicates('State', keep='first').copy()
# Mean-centered predictors (suffix _c), computed after de-duplication
data['IncomePerCapita_c'] = data['IncomePerCapita'] - data['IncomePerCapita'].mean()
data['Unemployment2012_c'] = data['Unemployment2012'] - data['Unemployment2012'].mean()
data

Unnamed: 0,State,Percent Obama Over Romney,IncomePerCapita,Winner,PercentOfVote,Labor Participation,Unemployment2012,IncomePerCapita_c,Unemployment2012_c
0,Alabama,-0.221868,23606,Romney,0.605458,53.4,8.0,-5180.54902,0.627451
1,Alaska,-0.139889,33062,Romney,0.548016,63.4,7.1,4275.45098,-0.272549
2,Arizona,-0.090648,25715,Romney,0.536545,55.5,8.3,-3071.54902,0.927451
3,Arkansas,-0.236879,22883,Romney,0.605669,54.8,7.6,-5903.54902,0.227451
4,California,0.231186,30441,Obama,0.60239,56.6,10.4,1654.45098,3.027451
5,Colorado,0.053652,32357,Obama,0.514921,63.6,7.9,3570.45098,0.527451
6,Connecticut,0.173315,39373,Obama,0.580568,60.9,8.3,10586.45098,0.927451
7,Delaware,0.186267,30488,Obama,0.586064,57.4,7.2,1701.45098,-0.172549
8,District of Columbia,0.836348,45877,Obama,0.909131,62.9,9.0,17090.45098,1.627451
9,Florida,0.008769,26582,Obama,0.500079,55.5,8.5,-2204.54902,1.127451


In [2]:
# Logistic regression predicting each state's "Winner" based on per-capita income
import numpy
import statsmodels.formula.api as smf
# Encode the response as integers: Obama -> 0, Romney -> 1.
# The explicit astype(int) does not rely on replace() inferring a numeric dtype
# (newer pandas deprecates that downcast), so the formula interface gets a
# numeric 0/1 response rather than an object column. Re-running the cell is
# safe: values that are already 0/1 pass through unchanged.
data['Winner'] = data['Winner'].replace({'Obama': 0, 'Romney': 1}).astype(int)
lreg1 = smf.logit(formula='Winner ~ IncomePerCapita_c', data=data).fit()
print(lreg1.summary())

Optimization terminated successfully.
         Current function value: 0.470450
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 Winner   No. Observations:                   51
Model:                          Logit   Df Residuals:                       49
Method:                           MLE   Df Model:                            1
Date:                Sun, 21 Aug 2016   Pseudo R-squ.:                  0.3196
Time:                        00:01:37   Log-Likelihood:                -23.993
converged:                       True   LL-Null:                       -35.262
                                        LLR p-value:                 2.060e-06
                        coef    std err          z      P>|z|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------
Intercept            -0.4519      0.394     -1.148      0.251        -1.224     0.320
IncomePe

In [3]:
# The income coefficient is about -0.0005: a higher IncomePerCapita_c pushes
# the prediction toward the lower Winner category (0 = Obama).
# Exponentiating the coefficients turns them into odds ratios.
print("Odds Ratios")
print(numpy.exp(lreg1.params))

# Odds ratios with 95% confidence intervals: label the two interval bounds,
# append the point estimates as 'OR', then exponentiate the whole table.
params = lreg1.params
conf = lreg1.conf_int()
conf.columns = ['Lower CI', 'Upper CI']
conf['OR'] = params
print(numpy.exp(conf))

Odds Ratios
Intercept            0.636401
IncomePerCapita_c    0.999551
dtype: float64
                   Lower CI  Upper CI        OR
Intercept          0.294133  1.376951  0.636401
IncomePerCapita_c  0.999293  0.999810  0.999551


In [4]:
# Two-predictor logistic regression: centered income plus centered 2012 unemployment
lreg4 = smf.logit(
    formula='Winner ~ IncomePerCapita_c + Unemployment2012_c',
    data=data,
).fit()
print(lreg4.summary())

Optimization terminated successfully.
         Current function value: 0.391694
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 Winner   No. Observations:                   51
Model:                          Logit   Df Residuals:                       48
Method:                           MLE   Df Model:                            2
Date:                Sun, 21 Aug 2016   Pseudo R-squ.:                  0.4335
Time:                        00:01:37   Log-Likelihood:                -19.976
converged:                       True   LL-Null:                       -35.262
                                        LLR p-value:                 2.299e-07
                         coef    std err          z      P>|z|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------
Intercept             -0.7317      0.477     -1.534      0.125        -1.667     0.203
Incom

In [5]:
# Odds ratios with 95% confidence intervals for the income + unemployment model
print("Odds Ratios")
params = lreg4.params
conf = lreg4.conf_int()
# Name the interval bounds first, then add the point estimates as a third column
conf.columns = ['Lower CI', 'Upper CI']
conf['OR'] = params
print(numpy.exp(conf))

Odds Ratios
                    Lower CI  Upper CI        OR
Intercept           0.188887  1.225330  0.481091
IncomePerCapita_c   0.999051  0.999745  0.999398
Unemployment2012_c  0.300392  0.872093  0.511830


In [6]:
# Let's see whether labor participation is a significant predictor.
# Center it on its mean first, like the other predictors.
data['LaborParticipation_c'] = data['Labor Participation'].sub(data['Labor Participation'].mean())
lreg5 = smf.logit(formula='Winner ~ IncomePerCapita_c + LaborParticipation_c', data=data).fit()
print(lreg5.summary())

Optimization terminated successfully.
         Current function value: 0.390226
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                 Winner   No. Observations:                   51
Model:                          Logit   Df Residuals:                       48
Method:                           MLE   Df Model:                            2
Date:                Sun, 21 Aug 2016   Pseudo R-squ.:                  0.4356
Time:                        00:01:37   Log-Likelihood:                -19.902
converged:                       True   LL-Null:                       -35.262
                                        LLR p-value:                 2.133e-07
                           coef    std err          z      P>|z|      [95.0% Conf. Int.]
----------------------------------------------------------------------------------------
Intercept               -0.9154      0.540     -1.694      0.090        -1.975     0.144

In [7]:
# LaborParticipation_c looks significant: red states have higher labor participation.
# Print the odds ratios with their 95% confidence intervals.
print("Odds Ratios")
params = lreg5.params
conf = lreg5.conf_int()
# Name the interval bounds first, then add the point estimates as a third column
conf.columns = ['Lower CI', 'Upper CI']
conf['OR'] = params
print(numpy.exp(conf))

Odds Ratios
                      Lower CI  Upper CI        OR
Intercept             0.138807  1.154690  0.400349
IncomePerCapita_c     0.998732  0.999669  0.999201
LaborParticipation_c  1.062305  1.808230  1.385963


In [8]:
# With all three predictors together the fit looks bad: classic multicollinearity.
# Use either labor participation or Unemployment2012_c, not both.
# (The centered column is recomputed here so this cell also runs on its own.)
data['LaborParticipation_c'] = data['Labor Participation'].sub(data['Labor Participation'].mean())
lreg6 = smf.logit(
    formula='Winner ~ IncomePerCapita_c + LaborParticipation_c + Unemployment2012_c',
    data=data,
).fit()
print(lreg6.summary())

Optimization terminated successfully.
         Current function value: 0.378977
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                 Winner   No. Observations:                   51
Model:                          Logit   Df Residuals:                       47
Method:                           MLE   Df Model:                            3
Date:                Sun, 21 Aug 2016   Pseudo R-squ.:                  0.4519
Time:                        00:01:38   Log-Likelihood:                -19.328
converged:                       True   LL-Null:                       -35.262
                                        LLR p-value:                 5.578e-07
                           coef    std err          z      P>|z|      [95.0% Conf. Int.]
----------------------------------------------------------------------------------------
Intercept               -0.8871      0.536     -1.654      0.098        -1.938     0.164