In [1]:
import pandas as pd
from pandas import Series,DataFrame

"""
IMPORTANT NOTE: If parsing Excel files fails, pip install xlrd AND openpyxl
"""

# Load the CDC workbook and read its '2014' sheet into a DataFrame.
cdc_file = pd.ExcelFile('cdc.xlsx')
fluo14 = cdc_file.parse('2014')

# 'State' is intentionally left as a regular column: the merge further down
# joins on it via left_on="State". (The previous bare `set_index('State')`
# call discarded its result, so it never changed the frame.)

# Strip the '%' sign from text cells only. Non-string cells (e.g. a blank cell
# read as NaN, or a value Excel already stored as a number) are passed through
# untouched instead of raising AttributeError on `.replace`.
fluo14['%'] = pd.to_numeric(
    fluo14['%'].map(
        lambda cell: cell.replace('%', '') if isinstance(cell, str) else cell
    )
)  # pd.to_numeric replaces the deprecated convert_objects

# Give the rate column a descriptive name for the merged frame and the plots.
fluo14 = fluo14.rename(columns={'%': 'Rate receiving fluoridated water'})


# BLS dataset for labor participation.
bls_file = pd.ExcelFile('staadata.xlsx')

# Column labels applied to 'Sheet1' in place of whatever the sheet provides;
# the first 7 rows of the sheet are skipped when parsing.
header = [
    'FIPS Code',
    'State and area',
    'Year',
    'Civilian non-institutional population',
    'Civilian labor force Total',
    'CLF Percent of population',
    'CLF Employment Total',
    'CLF Employment Percent of population',
    'CLF Unemployment Total',
    'CLF Unemployment Rate',
]
bls = bls_file.parse('Sheet1', skiprows=7, names=header)

# Select the 2014 rows and the three columns of interest in one .loc call,
# then take an explicit copy so the column assignments below write to an
# independent frame (avoids pandas' SettingWithCopyWarning on a filtered view).
bls14 = bls.loc[
    bls.Year == 2014,
    ['State and area', 'CLF Employment Percent of population', 'CLF Unemployment Rate'],
].copy()

# Make sure both rate columns are numeric before merging and plotting.
bls14['CLF Employment Percent of population'] = pd.to_numeric(
    bls14['CLF Employment Percent of population']
)
bls14['CLF Unemployment Rate'] = pd.to_numeric(bls14['CLF Unemployment Rate'])

# 'State and area' is intentionally left as a regular column: the merge below
# joins on it via right_on. (The previous bare `set_index('State and area')`
# call discarded its result, so it never changed the frame.)

# Join fluoridation rates with labor statistics on the state name.
# how="inner" is the pandas default, spelled out here: only rows whose
# 'State' value has a matching 'State and area' value are kept.
data = fluo14.merge(
    bls14,
    how="inner",
    left_on="State",
    right_on="State and area",
)

In [2]:
# Preview the first five rows of the merged frame as a sanity check.
data.head(n=5)

Unnamed: 0,State,Persons receiving fluoridated water,Persons served by CWS,Rate receiving fluoridated water,Rank,State and area,CLF Employment Percent of population,CLF Unemployment Rate
0,Alabama,3812180,4849377,78.6,23.0,Alabama,53.2,6.8
1,Alaska,339415,685165,49.5,43.0,Alaska,63.1,6.9
2,Arizona,3199068,5536324,57.8,38.0,Arizona,55.7,6.8
3,Arkansas,1986099,2809741,70.7,31.0,Arkansas,53.6,6.1
4,California,24699693,38802500,63.7,35.0,California,57.7,7.5


In [3]:
import matplotlib.pyplot as plt

# Histogram of the share of each state's population receiving fluoridated
# water, drawn on the current figure and saved to disk.
fig = plt.gcf()
ax = fig.gca()
ax.hist(data['Rate receiving fluoridated water'])
ax.set_xlabel('Percent of population receiving fluoridated water')
ax.set_ylabel('Number of States')
# Title spelling corrected ("Fluoridated"); the output file name below keeps
# its original spelling so anything that references the PNG still finds it.
ax.set_title('Count of States and Fluoridated Water')
fig.savefig('flouridated_rate_histogram.png', dpi=300, bbox_inches='tight')

In [4]:
# Histogram of the employed share of each state's population.
# The current figure is wiped first so nothing drawn earlier carries over.
fig = plt.gcf()
fig.clear()
ax = fig.gca()
ax.hist(data['CLF Employment Percent of population'])
ax.set_xlabel('Percent of population employed')
ax.set_ylabel('Number of States')
ax.set_title('Civilian Labor Force Participation Percentage')
fig.savefig('participation_rate_histogram.png', dpi=300, bbox_inches='tight')

In [5]:
# Third variable (the assignment asks for three, not two): histogram of the
# unemployment rate across states.
# The current figure is wiped first so nothing drawn earlier carries over.
fig = plt.gcf()
fig.clear()
ax = fig.gca()
ax.hist(data['CLF Unemployment Rate'])
ax.set_xlabel('Unemployment Rate')
ax.set_ylabel('Number of States')
ax.set_title('Civilian Labor Force Unemployment Histogram')
fig.savefig('unemployment_rate_histogram.png', dpi=300, bbox_inches='tight')

In [6]:
# Bonus: a seaborn joint plot with a regression fit of fluoridation rate
# against labor participation (a plain scatterplot would be easier to read).
import seaborn as sns
fluoridation = data['Rate receiving fluoridated water']
labor_participation = data['CLF Employment Percent of population']
unemployment = data['CLF Unemployment Rate']  # not used by the plot below
# Pass the vectors by keyword: newer seaborn releases accept x and y as
# keyword arguments only, and a positional call raises TypeError there.
sns_plot = sns.jointplot(x=fluoridation, y=labor_participation, kind="reg")
sns_plot.savefig("seaborn_jointplot.png")