# Project Final Code

**Title:** Time series analysis of state- and county-level unemployment rates in the USA

**Index:**  
[Obtaining the Data](#Obtaining-the-Data)  
[Setup](#Setup)  
[Data Preparation](#Data-Preparation)   
[Data Exploration](#Data-Exploration)  
[Modeling](#Modeling)  
[Presentation Graphic(s)](#Presentation-Graphic(s))



## Obtaining the Data
The data required for this project may be obtained as follows.

The source of datasets
- 'unemployment-by-county-us.csv': https://www.kaggle.com/jayrav13/unemployment-by-county
 (download output.csv)
- 'Unemployment.csv': https://www.ers.usda.gov/data-products/county-level-data-sets/download-data/ (download the file for 'Unemployment and median household income for the U.S., States, and counties, 2007-18')
- 'states.csv' and 'Counties.csv': https://www.kaggle.com/stansilas/us-state-county-name-codes#US%20Countiess.csv 

## Setup

In [1]:

%matplotlib notebook
# mainline tools
import os
import re
# data tools
import numpy  as np
import pandas as pd
import scipy  as sp
import scipy.stats as stats
import datetime
# plotting and graphics
import matplotlib        as mpl
import matplotlib.pyplot as plt
import matplotlib.dates  as mdates
import mplleaflet 
# Use seaborn theme, scaling, and color palette.
import seaborn as sns
from matplotlib import rcParams
%matplotlib inline
# %matplotlib inline
# sns.set_context('notebook')
# from IPython.core.pylabtools import figsize
# #@markdown This sets the resolution of the plot outputs (`retina` is the highest resolution)
# notebook_screen_res = 'retina' #@param ['retina', 'png', 'jpeg', 'svg', 'pdf']
# %config InlineBackend.figure_format = notebook_screen_res
sns.set()
sns.set_style("ticks", {"xtick.major.size": 9, "ytick.major.size": 9})
rcParams['figure.figsize'] = 15.7,8.27
plt.style.use('seaborn-ticks')
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['legend.fontsize'] = 10
plt.rcParams['figure.titlesize'] =20


# Styling seems more consistent if set using MPL instead of sns
plt.style.use('seaborn-whitegrid')

# Set paths to directories for the data
dataroot = os.environ['HOMESHARE']


## Data Preparation

The data has been loaded from different sources and merged into a single dataframe. The utility functions below are used at different stages of data preparation.

### Utility functions

In [2]:
#utility functions


def calc_rate(df):
    '''Add the unemployment rate (in percent) to an aggregated frame.

    The rate is ``Unemployed / Civilian_labor_force * 100`` and is written to
    the column ``'Unemployment Rate (%)'``.

    Parameters
    ----------
    df : DataFrame with 'Unemployed' and 'Civilian_labor_force' columns.

    Returns
    -------
    The same DataFrame object that was passed in. It is modified in place, and
    the 'Unemployed' and 'Civilian_labor_force' columns are kept, because
    callers in this notebook select 'Civilian_labor_force' from the result.
    '''
    df['Unemployment Rate (%)'] = ((df['Unemployed'] / df['Civilian_labor_force']) * 100)
    return df

def return_date(year, month):
    '''Return the datetime for the first day of the given month and year.

    month is an English month name, matched case-insensitively; a name that
    is not one of the twelve months raises KeyError.
    '''
    month_names = ("january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november", "december")
    # Calendar month numbers are 1-based.
    month_number = {name: number for number, name in enumerate(month_names, start=1)}
    return datetime.datetime(year, month_number[month.lower()], 1)


In [3]:
#functions to load and merge datasets 
def load_df(DataDir):
    '''Load the monthly county unemployment table and attach state and county codes.

    'unemployment-by-county-us.csv' is read from DataDir; the lookup tables
    'states.csv' and 'Counties.csv' are read from the current working directory.

    Returns a DataFrame with the columns
    Year, Month, State_code, State, County, FIPS, Unemployment_Rate,
    where State holds the abbreviation taken from 'states.csv'.
    '''
    county_rates = pd.read_csv(os.path.join(DataDir, 'unemployment-by-county-us.csv'))

    # Lookup tables are read as strings so that codes are concatenated as text below.
    state_lookup = pd.read_csv("states.csv", dtype=str)
    county_lookup = pd.read_csv("Counties.csv", dtype=str)

    # First attach the state abbreviation, then match counties on (abbreviation, county name).
    with_abbreviation = county_rates.merge(state_lookup, on='State')
    with_codes = with_abbreviation.merge(
        county_lookup,
        left_on=['Abbreviation', 'County'],
        right_on=['X1', 'X4'],
    )
    with_codes = with_codes.sort_values(by=['Year', 'Month', 'State', 'County'])

    # FIPS is the concatenation of the X2 and X3 columns of Counties.csv.
    with_codes['FIPS'] = with_codes["X2"].map(str) + with_codes["X3"].map(str)

    selected = with_codes[['Year', 'Month', 'X2', 'Abbreviation', 'County', 'FIPS', 'Rate']]
    return selected.rename(columns={
        'X2': 'State_code',
        'Abbreviation': 'State',
        'Rate': 'Unemployment_Rate',
    })

def load_pop(DataDir):
    '''Load the yearly civilian labour force per county in long format.

    Reads 'Unemployment.csv' from the current working directory, keeps the
    county rows, and reshapes the twelve 'Civilian_labor_force*' columns
    (labelled 2007-2018) into one row per county and year.

    Parameters
    ----------
    DataDir : accepted for symmetry with load_df; it is currently not used.

    Returns
    -------
    DataFrame with the columns FIPS (5-character string), State, County, Year
    and Civilian_labor_force. Rows with missing values are dropped.
    '''
    years = list(range(2007, 2019))
    # Column positions of the CSV that are not loaded.
    skipped_positions = {3, 4, 5, 54, 55}
    raw = pd.read_csv(
        "Unemployment.csv",
        usecols=[pos for pos in range(56) if pos not in skipped_positions],
    )

    # Remove unwanted records: codes ending in "000" (presumably state-level
    # totals -- TODO confirm against the data set) and the national 'US' row.
    ends_in_000 = raw['FIPS'].map(lambda code: str(code).endswith("000"))
    counties = raw.loc[~ends_in_000]
    # Copy so the column assignments below do not write into a slice of `raw`.
    counties = counties[counties['State'] != 'US'].copy()

    # Make the FIPS code 5 characters long by left-padding with zeros.
    counties['FIPS'] = counties['FIPS'].astype(str).str.rjust(5, '0')

    id_columns = ['FIPS', 'State', 'Area_name']
    labour_columns = [name for name in counties.columns
                      if name.strip().startswith("Civilian_labor_force")]
    wide = counties.loc[:, id_columns + labour_columns].copy()

    # Keep only the part of the area name before the first comma.
    wide['Area_name'] = wide['Area_name'].str.split(',').str[0]
    wide.columns = ['FIPS', 'State', 'County'] + years

    # Values may contain thousands separators; anything unparsable becomes NaN.
    for year in years:
        wide[year] = pd.to_numeric(wide[year].astype(str).str.replace(',', ''),
                                   errors='coerce')

    stacked = wide.set_index(['FIPS', 'State', 'County']).stack()
    stacked.name = 'Civilian_labor_force'
    stacked.index.names = ['FIPS', 'State', 'County', 'Year']
    return stacked.reset_index().dropna()

def prepare_data(df):
    '''Keep the analysis columns and derive the monthly number of unemployed.

    Parameters
    ----------
    df : merged frame containing Year, Month, State_code, State, County, FIPS,
         Civilian_labor_force and Unemployment_Rate (extra columns are dropped).

    Returns
    -------
    A new DataFrame with those columns plus 'Unemployed', computed as
    Civilian_labor_force * Unemployment_Rate / 100 and rounded to a whole number.
    The input frame is not modified.
    '''
    wanted = ['Year', 'Month', 'State_code', 'State', 'County', 'FIPS',
              'Civilian_labor_force', 'Unemployment_Rate']
    # Copy so that adding a column does not write into a slice of the input.
    df = df[wanted].copy()
    # Monthly unemployed count from the civilian labour force and the rate (%).
    df['Unemployed'] = (df['Civilian_labor_force'] * df['Unemployment_Rate'] / 100).round()
    # NOTE: duplicates are not removed here; inspect them with
    # df[df.duplicated(subset=['Year', 'Month', 'State', 'FIPS'])] if needed.
    return df

In [4]:
DataDir = os.path.join(dataroot, 'project')
#reading main data file
df=load_df(DataDir)

#loading civilian labour force data 
df2=load_pop(DataDir)

#merge main data with civilian labour force
data_all=df.merge(df2,on=['Year','State','FIPS'],suffixes=("","_"))


#cleaning
data_all= prepare_data(data_all)
data_all.head(5)


## Data Exploration

The dataset has following attributes and datatypes. 

In [5]:
data_all.info()

The summary statistics of the dataset is as below:

In [6]:
data_all.describe()

### Histograms
Histograms are plotted both on county level and state level

In [7]:
def hist_county(data_all):
    '''Draw the distribution of the county-level 'Unemployment_Rate' column.'''
    axis_label_size = 15
    plt.figure(figsize=(10, 5))
    # distplot draws on the current axes and returns them.
    axes = sns.distplot(data_all['Unemployment_Rate'])
    axes.set_xlabel("Unemployment Rate (%)", fontsize=axis_label_size)
    axes.set_ylabel("Frequency", fontsize=axis_label_size)
    axes.set_title("Histogram on county level unemployment rate", fontsize=20)
    plt.show()
def plot_hist_state(dfs):
    '''Plot a histogram of the state-level unemployment rate.

    dfs must contain 'Unemployed' and 'Civilian_labor_force'. calc_rate adds
    the 'Unemployment Rate (%)' column to dfs in place; later cells of this
    notebook read that column from the same frame, so the side effect is kept.
    '''
    calc_rate(dfs)
    plt.figure(figsize=(10,5))
    sns.distplot(dfs['Unemployment Rate (%)'])
    plt.xlabel("Unemployment Rate (%)", fontsize=15)
    plt.ylabel("Frequency",fontsize=15)
    plt.title("Histogram on State average unemployment rate",fontsize=20)
    plt.show()

In [8]:
hist_county(data_all)  

The county-level histogram shows some extreme values that cause a long tail. The data is therefore aggregated by state to inspect the distribution at that level.

In [9]:
#histogram on state unemployment(monthly for each year) 
#satewise average

# Monthly state-level averages. numeric_only=True keeps the reduction to the
# numeric columns; without it, pandas >= 2.0 raises a TypeError on the string
# columns (County, FIPS, ...) instead of silently dropping them.
dfs=data_all.groupby(['Year','Month','State_code','State'],as_index=False).mean(numeric_only=True)
# Overall average per state across all months.
df_state=dfs.groupby(['State'],as_index=False).mean(numeric_only=True)
df_state=calc_rate(df_state)[['State','Civilian_labor_force','Unemployment Rate (%)']]
df_state1=df_state[['State','Unemployment Rate (%)']]

# plot_hist_state also adds 'Unemployment Rate (%)' to dfs, which later cells use.
plot_hist_state(dfs)


The data is pretty well distributed with mean value around 5

### Checking for outliers
To check for outliers, a box plot was drawn; it shows that the outliers are negligible.

In [10]:
sns.boxplot(x = dfs['Year'], y = dfs['Unemployment Rate (%)'])
plt.title("Box Plot of Yearly Unemployment Rate", fontsize=20)
plt.show()

In [11]:
#Month and Year can be combined to form Date.
data_all_ts=data_all.copy()
data_all_ts['Date']=data_all_ts.apply(lambda x: return_date(x['Year'], x['Month']), axis=1)
data_all_ts=data_all_ts.sort_values(by='Date')


Checking for missing values: 

In [12]:
#check for missing datetimeindex values based on reference index (with all values)
data_all_ts.set_index('Date')['State'].groupby('Date').count().plot(figsize = (16,6))

# Find the counties with missing data (10 years of monthly data means 120 records per county)
FIPS=(data_all_ts.groupby(['FIPS','Date']).count()['Unemployed'].groupby(['FIPS']).count())!=120
FIPS[FIPS==True]#662 counties doesnt have data for all the month and year

When analyzed at county level, there are 662 counties with missing values (they do not have a record for every month of every year). I then aggregated the data at state level and checked whether values are present for every month and year.

In [13]:
# Find the states with missing data (10 years of monthly data means 120 records per state)
St=(data_all_ts.groupby(['State','Date']).count()['Unemployed'].groupby(['State']).count())!=120
St[St==True]#6 states doesnt have data for all the month and year
# Recorded output for the six incomplete states: CA 101, KY 99, LA 76, MI 88, TX 83, VA 98

There are 6 states with missing data. These gaps need to be filled before modelling.

For further analysis, the data has been plotted at different levels.

### County level analysis

The overall geoplot has been plotted to get an idea about rate at county level

In [14]:
#Need to install geopy plotly
# !pip install geopy plotly --user
# !pip install plotly-geo --user
import plotly.figure_factory as ff
import plotly.graph_objects as go

In [15]:
def plot_geo(df,title):
    '''Show a county-level choropleth of the 'Unemployment Rate (%)' column.

    df needs a 'FIPS' column and an 'Unemployment Rate (%)' column; title is
    used as the figure title.
    '''
    # 17 colours, paired with 16 evenly spaced bin edges between 1 and 12.
    palette = [
        "#f7fbff", "#ebf3fb", "#deebf7", "#d2e3f3", "#c6dbef", "#b3d2e9",
        "#9ecae1", "#85bcdb", "#6baed6", "#57a0ce", "#4292c6", "#3082be",
        "#2171b5", "#1361a9", "#08519c", "#0b4083", "#08306b",
    ]
    bin_edges = list(np.linspace(1, 12, len(palette) - 1))

    ordered = df.sort_values(by='FIPS')
    county_codes = ordered['FIPS'].tolist()
    county_rates = ordered['Unemployment Rate (%)'].tolist()

    fig = ff.create_choropleth(
        fips=county_codes,
        values=county_rates,
        binning_endpoints=bin_edges,
        colorscale=palette,
        show_state_data=True,
        show_hover=True,
        centroid_marker={'opacity': 0},
        asp=2.9,
        title=title,
        legend_title='% unemployed',
    )

    fig.layout.template = None
    fig.show()


In [16]:
df_county=data_all.copy()
#overall unemployment rate for each county
df_county1=df_county.groupby(['FIPS'],as_index=False).mean()[['FIPS','Civilian_labor_force','Unemployed']]
df_county1=calc_rate(df_county1)[['FIPS','Unemployment Rate (%)']]


plot_geo(df_county1,'Overall Unemployment rate in US by county')

From the plots it is clearly visible that the unemployment rate is higher towards the east-coast and west-coast regions.

### Statewise analysis

Based on geographical location and Unemployment_Rate, I have selected 5 states (ND, MI, WA, NY and CA) for further analysis. This selection was made based on the civilian labour force and geographical location.

In [17]:
def plot_fig3(dfs):
    '''Plot a stacked area chart of the yearly unemployment rate for five states.

    dfs is the state-level frame with Year, State_code, State,
    Civilian_labor_force and Unemployed columns. The states shown are
    ND, WA, MI, NY and CA.
    '''
    states=['ND','WA','MI','NY','CA']
    rate_col = 'Unemployment Rate (%)'
    # Average only the columns needed for the rate, so that string columns
    # (e.g. Month) never reach the mean.
    yearly = dfs.groupby(['Year','State_code','State'],as_index=False)[
        ['Civilian_labor_force','Unemployed']].mean()
    yearly = calc_rate(yearly)[['Year','State',rate_col]]
    df_states5 = yearly[yearly['State'].isin(states)].sort_values(by=['State','Year'])
    # values= gives one plain column per state, so the legend shows state
    # abbreviations instead of (column, state) tuples.
    df_states5 = df_states5.pivot(index='Year', columns='State', values=rate_col)
    plt.style.use('seaborn')
    df_states5.plot.area(alpha=0.5)
    fig= plt.gcf()
    fig.set_size_inches(15,10)
    plt.style.use('seaborn-whitegrid')
    plt.legend(loc=1)
    plt.title("Unemployment rate analysis for selected states(2007-16)",fontsize=20)
    # Adjust the layout before rendering; after plt.show() it has no visible effect.
    fig.tight_layout()
    plt.show()

In [18]:
plot_fig3(dfs)

The area chart shows how "Unemployment_Rate" changes over the same period for the states chosen above. Since the series are stacked on top of one another, it is easier to compare how the data evolved over the period. Among these 5 states, CA has one of the highest values of "Unemployment_Rate", ND the lowest, and NY an average one.


### Data distribution analysis for the selected states
Here I have used violin plots to understand the distribution of the underlying data within a state, taking 'ND', 'WA', 'MI', 'NY' and 'CA' as examples.

In [19]:
def plot_violin(df,state,ax):
    '''Draw a violin plot of the yearly county unemployment rate for one state.

    Parameters
    ----------
    df    : county-level frame with State, Year, FIPS, Civilian_labor_force
            and Unemployed columns.
    state : state abbreviation used to filter df (e.g. 'ND').
    ax    : matplotlib axes to draw on.

    The axes title is the full state name for the five states used in this
    notebook and falls back to the abbreviation for any other state.
    '''
    state_names = {'ND': 'North Dakota', 'WA': 'Washington', 'MI': 'Michigan',
                   'NY': 'New York', 'CA': 'California'}
    out = df[df.State == state]
    # Average only the two columns needed for the rate, so that string columns
    # never reach the mean; the yearly means are rounded as before.
    out = out.groupby(['Year','FIPS'],as_index=False)[
        ['Civilian_labor_force','Unemployed']].mean().round()
    out = calc_rate(out)[['Year','FIPS','Unemployment Rate (%)']]
    sns.violinplot( ax=ax,x = out['Year'], y = out['Unemployment Rate (%)'],inner="box")
    ax.set_title(state_names.get(state, state))
    

In [20]:
states=['ND','WA','MI','NY','CA']
# One row of axes per state, sharing the x axis.
fig3, axs3 = plt.subplots(num=3, nrows=len(states),sharex=True, ncols=1,figsize=[10,20], dpi=100, clear=True)

for state, ax in zip(states, axs3):
    plot_violin(df_county, state, ax)

# Adjust the layout before rendering; after plt.show() it has no visible effect.
fig3.tight_layout()

plt.show()

By analyzing violin plot for ND, we can see that the mean is almost constant across the years, and distribution is almost constant.

## Modeling

Based on the analysis above, I have selected two states, ND and CA, for modelling; they represent states with a low and a high civilian labour force. ND also has the least deviation in its distribution and no missing data. CA has a few missing records, which can be filled by interpolation.

In [21]:
#Month and Year can be combined to form DateTimeIndex.
data_all_ts=data_all.copy()
data_all_ts['Date']=data_all_ts.apply(lambda x: return_date(x['Year'], x['Month']), axis=1)
data_all_ts=data_all_ts.sort_values(by='Date')


In [22]:
def prepare_ts(df,St_code):
    '''Build the monthly 'Unemployed' time series for one state.

    Parameters
    ----------
    df      : frame with Year, Month, State_code and Unemployed columns.
    St_code : value of State_code selecting the state.

    Returns
    -------
    DataFrame indexed by 'Date' (first day of each month, ascending) with a
    single rounded 'Unemployed' column. The input frame is not modified.
    '''
    # Filter first so that dates are only built for the requested state, and
    # copy so the caller's frame is left untouched.
    rows = df.loc[df['State_code'] == St_code, ['Year', 'Month', 'Unemployed']].copy()
    rows['Date'] = pd.to_datetime(
        [return_date(year, month) for year, month in zip(rows['Year'], rows['Month'])]
    )
    rows['Unemployed'] = rows['Unemployed'].round()
    ts = rows[['Date', 'Unemployed']].sort_values(by='Date').set_index('Date')
    return ts
def check_correlation(df):
    '''Show how strongly the series is related to its own previous value.

    Draws a lag plot and an autocorrelation plot, then prints the correlation
    matrix between the series and the series shifted by one step.
    '''
    # Lag plot
    lag_plot(df)
    plt.show()

    # Autocorrelation plot
    autocorrelation_plot(df)
    plt.title('Autocorrelation plot',fontsize=20)
    plt.show()

    # Correlation matrix between the shifted and the unshifted values
    current = pd.DataFrame(df.values)
    previous = current.shift(1)
    lagged_pair = concat([previous, current], axis=1)
    lagged_pair.columns = ['t-1', 't+1']
    print(lagged_pair.corr())

def seasonal_decomp(df):
    '''Decompose the series with a multiplicative model and plot the result.'''
    decomposition = seasonal_decompose(df, model='multiplicative')
    decomposition.plot()
    plt.show()
def modeling(df,lag):
    '''Walk-forward ARIMA(lag, 1, 0) forecast with a 66/34 train/test split.

    The first 66% of the values form the training set and the rest the test
    set. Both are standardised with a scaler fitted on the training set only,
    so the printed values and the MSE are in scaled units. For every test
    point the model is refitted on the history so far, one step is forecast,
    and the true observation is then appended to the history.

    Prints each prediction next to the expected value, prints the test MSE,
    and plots the test values against the predictions (in red).
    '''
    values = df.values
    split_at = int(len(values) * 0.66)
    train = values[:split_at]
    test = values[split_at:]

    scaler = StandardScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)

    history = list(train)
    predictions = []
    for obs in test:
        fitted = ARIMA(history, order=(lag, 1, 0)).fit(disp=0)
        # The first element of the forecast result is used as the prediction.
        yhat = fitted.forecast()[0]
        predictions.append(yhat)
        history.append(obs)
        print('predicted=%f, expected=%f' % (yhat, obs))

    error = mean_squared_error(test, predictions)
    print('Test MSE: %.3f' % error)

    # Test values against predictions
    plt.plot(test)
    plt.plot(predictions, color='red')
    plt.show()


Modelling for North Dakota (ND)

In [23]:
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import StandardScaler
from pandas import concat
from pandas.plotting import lag_plot
# North Dakota: look the state code up from the data rather than hard-coding it.
# The previously hard-coded '46' is the FIPS code of South Dakota; North Dakota is '38'.
#preparing data for time series analysis-
df_ts=dfs.copy()

nd_code = df_ts.loc[df_ts['State'] == 'ND', 'State_code'].iloc[0]
ts1=prepare_ts(df_ts, nd_code)
check_correlation(ts1)

In [24]:
seasonal_decomp(ts1)    

In [25]:
# Running this cell may take a few minutes
lag=9
modeling(ts1,lag)

Modelling for California

In [26]:
def missing_data_interploate(df, start='2007-01-01', end='2016-12-01'):
    '''Return a gap-free monthly 'Unemployed' series, interpolating missing months.

    Parameters
    ----------
    df    : frame indexed by month-start dates with an 'Unemployed' column.
            The index must not contain duplicate dates.
    start : first month of the target range (default '2007-01-01').
    end   : last month of the target range (default '2016-12-01').

    Returns
    -------
    A new DataFrame indexed by every month start between start and end, with
    one 'Unemployed' column. Months absent from df are filled by time-based
    interpolation. The input frame is not modified.
    '''
    date_range = pd.date_range(start, end=end, freq='MS')
    # reindex inserts NaN rows for months that are absent; selecting them with
    # .loc raises KeyError for missing labels on pandas >= 1.0.
    filled = df[['Unemployed']].reindex(date_range)
    # Assign the result back instead of using a chained inplace call, which
    # may operate on a temporary copy.
    filled['Unemployed'] = filled['Unemployed'].interpolate(method='time')
    return filled


In [27]:
# California (state code '06') has missing monthly records, so it is handled
# separately: the gaps are filled by interpolation before the analysis.

CA = prepare_ts(df_ts, '06')
# Keep only the 'Unemployed' column, as a one-column frame.
CA_1 = missing_data_interploate(CA)[['Unemployed']]
ts2 = CA_1
check_correlation(ts2)

In [28]:
seasonal_decomp(ts2) 

In [29]:
# Running this cell may take a few minutes
lag=15
modeling(ts2,lag)

## Presentation Graphic(s)

### 1. Average State Unemployement Rate in the USA(2007-2016) 
The below map represents the average unemployment rate in each state in the USA. The bubble size represents the average unemployment rate for a state. 

In [30]:
#Actually it is an interactive plot with hover values as state name,longitude and latitude. But it needs large dataset(>300k)
def geo_bubble(df_state1):
    '''Plot one bubble per state on a US map, sized by its unemployment rate.

    df_state1 needs a 'State' column and an 'Unemployment Rate (%)' column.
    Each state is geocoded with Nominatim (one network request per state);
    a state for which no location is returned is reported and skipped.
    '''
    rate_col = 'Unemployment Rate (%)'
    # Pair each state with the rate on its own row, so the mapping does not
    # depend on the frame already being sorted by state.
    avg_unemp_rate_dict = dict(zip(df_state1['State'], df_state1[rate_col]))

    # Collect the coordinates and hover text for each state.
    geolocator = Nominatim(user_agent="my-application")
    lati = []
    longi = []
    marker_size = []
    text_list = []
    for state, rate in avg_unemp_rate_dict.items():
        loc = geolocator.geocode(state + ',' + 'USA')
        if loc is None:
            # geocode returns None when nothing matches the query.
            print('No geocoding result for %s; state skipped' % state)
            continue
        lati.append(loc.latitude)
        longi.append(loc.longitude)
        marker_size.append(rate * 5)
        text_list.append(state + ':' + str(rate))

    fig = go.Figure(data=go.Scattergeo(
            lon = longi,
            lat = lati,
            text = text_list,
            mode = 'markers',
            marker_size = marker_size
            ))

    fig.update_layout(
            title = 'Average State Unemployement Rate in the USA(2007-2016)',
            geo_scope='usa',
        )

    fig.show()


In [31]:
from  geopy.geocoders import Nominatim
import plotly.graph_objects as go
geo_bubble(df_state1)

### 2. Change in unemployment rate during different duration (2007-2016)

In [32]:
#aggregate at county level within a duration
def county_year(df,start,end):
    '''Return the average unemployment rate per county for start <= Year <= end.

    Parameters
    ----------
    df         : county-level frame with Year, FIPS, Civilian_labor_force and
                 Unemployed columns.
    start, end : first and last year of the window (both inclusive).

    Returns
    -------
    DataFrame with the columns FIPS and 'Unemployment Rate (%)'.
    '''
    in_window = df[df['Year'].between(start, end)]
    # Average only the two columns needed for the rate, so that string columns
    # never reach the mean.
    out = in_window.groupby(['FIPS'],as_index=False)[
        ['Civilian_labor_force','Unemployed']].mean()
    out = calc_rate(out)[['FIPS','Unemployment Rate (%)']]
    return out

In [33]:
df_county_2007_09=county_year(df_county,2007,2009)
df_county_2010_12=county_year(df_county,2010,2012)
df_county_2013_15=county_year(df_county,2013,2015)
plot_geo(df_county_2007_09,'Unemployment rate in US by county during 2007-09')
plot_geo(df_county_2010_12,'Unemployment rate in US by county during 2010-12')
plot_geo(df_county_2013_15,'Unemployment rate in US by county during 2013-15')

### 3.Better performed states Vs Worst performed states during the period 2006-2017
This graph gives an idea about the average unemployment rate in three best states and three worst states along with the national average rate.

In [34]:
def _rate_by_state_for_year(dfs, year):
    '''Return the unemployment rate (%) per state for one year, indexed by state.'''
    yearly = dfs[dfs.Year == year].groupby(['Year','State'],as_index=False)[
        ['Civilian_labor_force','Unemployed']].mean()
    return calc_rate(yearly).set_index('State')['Unemployment Rate (%)']


def _monthly_state_rates(dfs, states, state_label):
    '''Return Date / state / rate rows for the given states; the state column is named state_label.'''
    # Copy so that adding columns does not write into a slice of dfs.
    subset = dfs[dfs['State'].isin(states)].copy()
    subset['Date'] = subset.apply(lambda x: return_date(x['Year'], x['Month']), axis=1)
    subset = calc_rate(subset)[['Date','State','Unemployment Rate (%)']]
    subset = subset.reset_index(drop=True)
    subset.columns = ["Date", state_label, 'Unemployment Rate (%)']
    return subset


def plot_bestworst(dfs):
    '''Plot the three best and three worst performing states against the national mean.

    Performance is the change in the state unemployment rate between 2007 and
    2016: the three largest decreases are the "better" states and the three
    largest increases the "worst" states. Their monthly rates are drawn as
    scatter points; the national monthly mean is drawn as a line.

    dfs is the monthly state-level frame. The national mean is computed from
    the module-level county frame `data_all`, which must already exist.
    '''
    rate_col = 'Unemployment Rate (%)'

    change = (_rate_by_state_for_year(dfs, 2016)
              - _rate_by_state_for_year(dfs, 2007)).sort_values().dropna()
    top_st = change.head(3).index
    bottom_st = change.tail(3).index

    worst = _monthly_state_rates(dfs, bottom_st, "Worst Perf States")
    best = _monthly_state_rates(dfs, top_st, "Better Perf States")

    #US average
    data_us = data_all.groupby(['Year','Month'],as_index=False)[
        ['Civilian_labor_force','Unemployed']].mean()
    data_us = calc_rate(data_us)[['Year','Month',rate_col]].copy()
    data_us['Date'] = data_us.apply(lambda x: return_date(x['Year'], x['Month']), axis=1)
    data_us = data_us.sort_values(by='Date').set_index('Date')[[rate_col]]
    data_us.columns = ["National mean Unemployment Rate (%)"]

    # figure size in inches
    rcParams['figure.figsize'] = 15.7,8.27
    plt.style.use('seaborn-whitegrid')

    sns.scatterplot(data=best,palette='BuGn_r',y="Unemployment Rate (%)", x="Date", hue="Better Perf States",  alpha=.9)
    sns.scatterplot(data=worst,palette='OrRd',y="Unemployment Rate (%)", x="Date", hue="Worst Perf States", alpha=.9)
    sns.lineplot(data=data_us,palette='PuBuGn_d',ci=0, linewidth=2.5)
    # Timestamps rather than strings, so the limits do not depend on how the
    # date axis converts text.
    plt.xlim(pd.Timestamp("2006-11-11"), pd.Timestamp("2017-01-01"))
    plt.title("Better performed states Vs Worst performed states during the period 2006-2017 ",fontsize=20)
    plt.show()


In [35]:
plot_bestworst(dfs)

### 4.Average Unemployment rate and Civilian Labour Force by states in US(2007-2016)

In [36]:
#Average Unemployment rate and Civilian Labour Force by states in US(2007-2016)
def plot_fig4(df_state):
    '''Compare each state's average unemployment rate with its civilian labour force.

    df_state needs the columns State, 'Unemployment Rate (%)' and
    Civilian_labor_force. The rate is drawn as bars on the left axis and the
    labour force as a dashed line on a second y axis.
    '''
    fig, ax = plt.subplots(figsize=(20,7))
    ax1 = ax.twinx()
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    sns.barplot(x=df_state.State, y=df_state['Unemployment Rate (%)'], alpha=0.2,color='blue',ax=ax,label='Average Unemployment rate')
    sns.lineplot(x=df_state.State,y=df_state['Civilian_labor_force'],ax=ax1,color='black',label='Average Civilian Labour Force')
    ax.set_xticklabels(df_state.State,rotation=70)
    ax.grid(False)
    ax1.grid(False)
    ax.set_title("Average Unemployment rate and Civilian Labour Force by states in US(2007-2016)",fontsize=20)
    # The x axis of the twin axes is hidden, so the label goes on the primary axes.
    ax.set_xlabel('State Names')
    ax.set_ylabel("Average Unemployment Rate")
    ax1.set_ylabel("Average Civilian Labour Force")
    ax1.lines[0].set_linestyle("--")

    # Build one legend that lists both series; drop any per-axes legend that
    # seaborn may have added so entries are not shown twice.
    bar_handles, bar_labels = ax.get_legend_handles_labels()
    line_handles, line_labels = ax1.get_legend_handles_labels()
    for axes in (ax, ax1):
        auto_legend = axes.get_legend()
        if auto_legend is not None:
            auto_legend.remove()
    ax1.legend(bar_handles + line_handles, bar_labels + line_labels)
    plt.show()

In [37]:
# plot_fig4(df_state)

### 5.Yearly Unemployment Average and Min-Max Band

In [38]:
# plotted year wise average unemployment rate along with the min-max band
def plot_minmax(dfs):
    '''Plot the yearly average unemployment rate with its min-max band.

    dfs needs a Year column and an 'Unemployment Rate (%)' column. That column
    is not part of the raw aggregation; in this notebook it is added to dfs by
    calc_rate, so a cell that does so must have run first.
    '''
    rate_col = 'Unemployment Rate (%)'
    # Reduce only the rate column, so that string columns never reach the mean.
    by_year = dfs.groupby(['Year'])[rate_col]
    yearly_mean = by_year.mean()
    yearly_max = by_year.max()
    yearly_min = by_year.min()

    sns.lineplot(x=yearly_mean.index, y=yearly_mean)
    plt.fill_between(yearly_mean.index, yearly_min, yearly_max, color='g', alpha=0.3)
    # Tick labels come from the data, so any number of years is supported.
    plt.xticks(yearly_mean.index, list(yearly_mean.index))
    plt.ylim(0,20)
    plt.grid(True)
    plt.title('Yearly Unemployment Average and Min-Max Band', fontsize=20)
    plt.show()

In [39]:
plot_minmax(dfs)

It is clearly visible that the unemployment rate is highest in 2009-10, which is the recession period.

## Project approach and overall execution
Do not put anything below this cell

## Code Structure and Organization

## Code Commenting