In [1]:
import pandas as pd
import scipy.stats


"""
IMPORTANT NOTE: If parsing Excel files fails, conda install xlrd AND openpyxl
"""

# --- BLS dataset for labor participation -------------------------------------
# Column names applied to the sheet in place of whatever header row it carries.
header = ['FIPS Code', 'State', 'Year', 'Civilian non-institutional population',
          'Civilian labor force Total', 'CLF Percent of population', 'CLF Employment Total',
          'CLF Employment Percent of population', 'CLF Unemployment Total',
          'CLF Unemployment Rate'
          ]
# The context manager closes the workbook handle as soon as the sheet is parsed.
with pd.ExcelFile('staadata.xlsx') as bls_file:
    # Skip the 7 leading rows of the sheet and rename the columns via `header`.
    bls = bls_file.parse('Sheet1', skiprows=7, names=header)

# Keep the 2012 rows and the three columns used downstream. `.copy()` gives an
# independent frame so the numeric conversions below do not write into a view of `bls`.
bls12 = bls.loc[
    bls.Year == 2012,
    ['State', 'CLF Employment Percent of population', 'CLF Unemployment Rate'],
].copy()
bls12['CLF Employment Percent of population'] = pd.to_numeric(bls12['CLF Employment Percent of population'])
bls12['CLF Unemployment Rate'] = pd.to_numeric(bls12['CLF Unemployment Rate'])
# NOTE: 'State' deliberately stays a regular column -- it is the join key for the merge below.

# --- 2012 Election data to determine if a state is "red" or "blue" -----------
with pd.ExcelFile('2012pres.xls') as pres_file:
    pres = pres_file.parse('2012 Pres General Results')
# NOTE(review): `1` is an integer column label, presumably a stray column in the
# sheet -- confirm against the workbook.
pres = pres.drop([1, 'FEC ID', 'STATE ABBREVIATION', 'GENERAL ELECTION DATE'], axis=1)
# One row per state: the candidate flagged as winner, with friendlier column names.
# Non-inplace rename avoids mutating a slice of `pres`.
winner = (
    pres.loc[pres['WINNER INDICATOR'] == 'W', ['STATE', 'LAST NAME', 'GENERAL %']]
    .rename(columns={'LAST NAME': 'Winner', 'GENERAL %': '%vote won'})
)

# --- Income per capita as response variable ----------------------------------
income = pd.read_csv('per_capita_income.csv', low_memory=False)
# Take the 2nd and 3rd columns BY POSITION. `income[[1, 2]]` is a label lookup and
# raises KeyError on current pandas because the CSV columns are string-named.
income = income.iloc[:, [1, 2]]
# NOTE: 'State' stays a regular column here as well; it is the right-hand join key.

# --- Merge data sources -------------------------------------------------------
data = pd.merge(winner, income, left_on='STATE', right_on='State')
# Join key spelled out explicitly; 'STATE' is redundant once 'State' is present.
data = pd.merge(data, bls12, on='State').drop(['STATE'], axis=1).set_index('State')
data.columns = ['Winner', '%VoteWon', 'IncomePerCapita', 'LaborParticipation%', 'Unemployment%']
data['%VoteWon'] = pd.to_numeric(data['%VoteWon'])
# Income arrives as text such as "$23,606": strip the currency formatting, then convert.
data['IncomePerCapita'] = pd.to_numeric(
    data['IncomePerCapita']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
# 'LaborParticipation%' and 'Unemployment%' were already converted on `bls12` above.

# Get quartile info to convert IncomePerCapita into a categorical variable
data['IncomePerCapita'].describe()

count       51.000000
mean     28786.549020
std       4825.452386
min      21036.000000
25%      25422.000000
50%      27646.000000
75%      31335.500000
max      45877.000000
Name: IncomePerCapita, dtype: float64

In [2]:
def convert_IncomePerCapita(income, cutoffs=(25422, 27646, 31335.5)):
    """Label a per-capita income with the quartile bucket it falls into.

    Parameters
    ----------
    income : number
        Per-capita income value to classify.
    cutoffs : sequence of three numbers, optional
        The (25%, 50%, 75%) cut points in ascending order. The defaults are
        the quartiles reported by ``data['IncomePerCapita'].describe()`` for
        this dataset; pass different values to reuse the function elsewhere.

    Returns
    -------
    str or float
        One of '1st quartile lowest', '2nd quartile', '3rd quartile',
        '4th quartile best'. A value exactly equal to a cut point belongs to
        the lower bucket (comparisons are strict). A NaN input is returned
        unchanged so missing incomes are not silently counted as lowest.
    """
    # NaN is the only value that is not equal to itself; every `>` test below
    # would be False for it, which would otherwise land it in the lowest bucket.
    if income != income:
        return income

    q1, q2, q3 = cutoffs
    if income > q3:
        return '4th quartile best'
    elif income > q2:
        return '3rd quartile'
    elif income > q1:
        return '2nd quartile'
    else:
        return '1st quartile lowest'

# Attach the quartile label of each state's income as a new categorical column,
# then show the resulting frame as the cell output.
data = data.assign(
    IncomePerCapitaQ=data['IncomePerCapita'].map(convert_IncomePerCapita)
)
data

Unnamed: 0_level_0,Winner,%VoteWon,IncomePerCapita,LaborParticipation%,Unemployment%,IncomePerCapitaQ
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,Romney,0.605458,23606,53.4,8.0,1st quartile lowest
Alaska,Romney,0.548016,33062,63.4,7.1,4th quartile best
Arizona,Romney,0.536545,25715,55.5,8.3,2nd quartile
Arkansas,Romney,0.605669,22883,54.8,7.6,1st quartile lowest
California,Obama,0.60239,30441,56.6,10.4,3rd quartile
Colorado,Obama,0.514921,32357,63.6,7.9,4th quartile best
Connecticut,Obama,0.580568,39373,60.9,8.3,4th quartile best
Delaware,Obama,0.586064,30488,57.4,7.2,3rd quartile
District of Columbia,Obama,0.909131,45877,62.9,9.0,4th quartile best
Florida,Obama,0.500079,26582,55.5,8.5,2nd quartile


In [3]:
# Observed counts: rows are the response variable (winner),
# columns are the explanatory variable (income quartile).
ct1 = pd.crosstab(
    index=data['Winner'],
    columns=data['IncomePerCapitaQ'],
)
print(ct1)

IncomePerCapitaQ  1st quartile lowest  2nd quartile  3rd quartile  \
Winner                                                              
Obama                               1             5            10   
Romney                             12             8             2   

IncomePerCapitaQ  4th quartile best  
Winner                               
Obama                            11  
Romney                            2  


In [4]:
# Share of each winner within every income quartile:
# divide each column of observed counts by that column's total.
colsum = ct1.sum()
colpct = ct1.div(colsum, axis='columns')
print(colpct)

IncomePerCapitaQ  1st quartile lowest  2nd quartile  3rd quartile  \
Winner                                                              
Obama                        0.076923      0.384615      0.833333   
Romney                       0.923077      0.615385      0.166667   

IncomePerCapitaQ  4th quartile best  
Winner                               
Obama                      0.846154  
Romney                     0.153846  


In [5]:
# Chi-square test of independence on the observed counts in ct1.
# chi2_contingency returns four values -- statistic, p value, degrees of freedom,
# expected counts -- so the printed label names all four, in that order.
print('chi-square value, p value, degrees of freedom, expected counts')
cs1 = scipy.stats.chi2_contingency(ct1)
print(cs1)

chi-square value, p value, expected counts
(21.461894586894587, 8.4404005581705684e-05, 3, array([[ 6.88235294,  6.88235294,  6.35294118,  6.88235294],
       [ 6.11764706,  6.11764706,  5.64705882,  6.11764706]]))


In [6]:
# Post hoc pairwise comparison: 1st vs 4th income quartile.
# Bonferroni adjustment: there are 6 pairwise comparisons among the 4 quartiles,
# so each p value is judged against alpha = .05/6 = .0083 rather than .05.
recode1 = {'1st quartile lowest': '1st quartile lowest', '4th quartile best': '4th quartile best'}
# Quartiles that are not keys of recode1 map to NaN and therefore drop out of the crosstab.
sub2 = data['IncomePerCapitaQ'].map(recode1).to_frame('COMP1v4')

# contingency table of observed counts
ct2 = pd.crosstab(data['Winner'], sub2['COMP1v4'])
print(ct2)

# column percentages
colsum = ct2.sum(axis=0)
colpct = ct2 / colsum
print(colpct)

# chi2_contingency returns four values (statistic, p value, degrees of freedom,
# expected counts); the label names all of them. For a 2x2 table like this one,
# SciPy applies Yates' continuity correction by default.
print('chi-square value, p value, degrees of freedom, expected counts')
cs2 = scipy.stats.chi2_contingency(ct2)
print(cs2)

COMP1v4  1st quartile lowest  4th quartile best
Winner                                         
Obama                      1                 11
Romney                    12                  2
COMP1v4  1st quartile lowest  4th quartile best
Winner                                         
Obama               0.076923           0.846154
Romney              0.923077           0.153846
chi-square value, p value, expected counts
(12.535714285714285, 0.00039924696320198179, 1, array([[ 6.,  6.],
       [ 7.,  7.]]))


In [7]:
# Post hoc pairwise comparison: 1st vs 3rd income quartile.
# Judge the p value against the Bonferroni-adjusted alpha (.05/6 = .0083), not .05.
recode2 = {'1st quartile lowest': '1st quartile lowest', '3rd quartile': '3rd quartile'}
# Quartiles that are not keys of recode2 map to NaN and therefore drop out of the crosstab.
sub2 = data['IncomePerCapitaQ'].map(recode2).to_frame('COMP1v3')

# contingency table of observed counts
ct2 = pd.crosstab(data['Winner'], sub2['COMP1v3'])
print(ct2)

# column percentages
colsum = ct2.sum(axis=0)
colpct = ct2 / colsum
print(colpct)

# chi2_contingency returns four values (statistic, p value, degrees of freedom,
# expected counts); the label names all of them. For a 2x2 table like this one,
# SciPy applies Yates' continuity correction by default.
print('chi-square value, p value, degrees of freedom, expected counts')
cs2 = scipy.stats.chi2_contingency(ct2)
print(cs2)

COMP1v3  1st quartile lowest  3rd quartile
Winner                                    
Obama                      1            10
Romney                    12             2
COMP1v3  1st quartile lowest  3rd quartile
Winner                                    
Obama               0.076923      0.833333
Romney              0.923077      0.166667
chi-square value, p value, expected counts
(11.582427988677988, 0.00066577955275126048, 1, array([[ 5.72,  5.28],
       [ 7.28,  6.72]]))
