In [1]:
# Importing dependencies
import pandas as pd
import numpy as np
from collections import defaultdict

# for visualizations
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

# For Hypothesis testing
import statsmodels.formula.api as smf

# For principal components analysis
from sklearn.decomposition import PCA

### Loading the Dataset

In [2]:
# Location of the gzipped census-income data file
path = "../data/census-income.data.gz"

# Column names, in file order; they are passed to read_csv through `names`.
# NOTE(review): judging by the values shown further down in this notebook,
# 'FEDERALTAX' seems to hold the tax-filer status and 'TaxFilerStat' a region,
# so some names look shifted - confirm against the dataset documentation.
censusColnames = [
    'Age', 'ClassOfWorker', 'Industry',
    'Occupation', 'Education', 'WagePerHr',
    'EducationalInst', 'MaritalStatus', 'IndustryCode',
    'OccupationCode', 'Race', 'HispanicOrigin',
    'Sex', 'MemLabourUnion', 'UnemploymentReason',
    'EmploymentStatus', 'CapitalGain', 'CapitalLoss',
    'Dividends', 'FEDERALTAX', 'TaxFilerStat',
    'PrevState', 'HouseholdStatus', 'HouseholdSummary',
    'INSTANCEWEIGHT', 'MigrationCode_MSA', 'MigrationCode_REG',
    'MigrationCode_WITHIN_REG', 'HouseOneYearAgo', 'MigrationPrevResInSunbelt',
    'NumOfPersonForEmployer', 'Parent', 'BirthCountryFather',
    'BirthCountryMother', 'BirthCountrySelf', 'Citizenship',
    'OwnBusiness', 'VeteranQA', 'VeteranBenefits',
    'WeeksWorked', 'Year', 'targetIncome',
]

# Because `names` is supplied, every line of the file is read as data
# (header=None); blanks following each comma are stripped.
censusDf = pd.read_csv(
    path,
    sep=',',
    skipinitialspace=True,
    names=censusColnames,
    header=None,
)

# Dimensions of the dataset
n_rows, n_cols = censusDf.shape
print(n_rows, "rows,", n_cols, "columns")

# First five rows, with no limit on the number of displayed columns
with pd.option_context('display.max_columns', None):
    display(censusDf.head())

199523 rows, 42 columns


Unnamed: 0,Age,ClassOfWorker,Industry,Occupation,Education,WagePerHr,EducationalInst,MaritalStatus,IndustryCode,OccupationCode,Race,HispanicOrigin,Sex,MemLabourUnion,UnemploymentReason,EmploymentStatus,CapitalGain,CapitalLoss,Dividends,FEDERALTAX,TaxFilerStat,PrevState,HouseholdStatus,HouseholdSummary,INSTANCEWEIGHT,MigrationCode_MSA,MigrationCode_REG,MigrationCode_WITHIN_REG,HouseOneYearAgo,MigrationPrevResInSunbelt,NumOfPersonForEmployer,Parent,BirthCountryFather,BirthCountryMother,BirthCountrySelf,Citizenship,OwnBusiness,VeteranQA,VeteranBenefits,WeeksWorked,Year,targetIncome
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Not in labor force,0,0,0,Nonfiler,Not in universe,Not in universe,Other Rel 18+ ever marr not in subfamily,Other relative of householder,1700.09,?,?,?,Not in universe under 1 year old,?,0,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,White,All other,Male,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Head of household,South,Arkansas,Householder,Householder,1053.55,MSA to MSA,Same county,Same county,No,Yes,1,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,Asian or Pacific Islander,All other,Female,Not in universe,Not in universe,Not in labor force,0,0,0,Nonfiler,Not in universe,Not in universe,Child 18+ never marr Not in a subfamily,Child 18 or older,991.95,?,?,?,Not in universe under 1 year old,?,0,Not in universe,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Nonfiler,Not in universe,Not in universe,Child <18 never marr not in subfamily,Child under 18 never married,1758.14,Nonmover,Nonmover,Nonmover,Yes,Not in universe,0,Both parents present,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Nonfiler,Not in universe,Not in universe,Child <18 never marr not in subfamily,Child under 18 never married,1069.16,Nonmover,Nonmover,Nonmover,Yes,Not in universe,0,Both parents present,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.


---

## Problem Statement

>From the various features in the census data set our aim is to build a predictive model to determine whether the income level for the people in United States exceeds the bracket of $50,000.

## Hypothesis Generation

From our problem statement it is clear that this is a binary classification problem.

Let us generate some hypotheses which will help us in building the models more efficiently. We need to figure out some hypotheses which might influence our final outcome, hence we need to answer a simple question.

**Is There a Relationship Between the Response and Predictors?**

To test this we use the test between the Null Hypothesis $H_0$ versus the Alternate Hypothesis $H_a$.
* $H_0$ : There is no relationship between the response Income and the predictors.
    * To test the Null Hypothesis we test whether all the regression coefficients are zero.
* $H_a$ : There is some relationship between the response and the predictors.
    * To test the Alternate Hypothesis we find at least one coefficient that is non-zero.
    
*To perform the Hypothesis tests we will be performing multivariate linear regression on ordinal values of the dataset using **statsmodels** library.*


In [3]:
# Frame used for the hypothesis test.
# NOTE: this is a second name for censusDf, not a copy, so the 0/1 encoding
# of 'targetIncome' written below is also visible through censusDf.
censusDf_htest = censusDf

# Encode the response: keep everything but the first dummy column
income_dummies = pd.get_dummies(censusDf_htest.targetIncome)
censusDf_htest['targetIncome'] = income_dummies.iloc[:,1:]

# Linear model of the response on the numerically coded columns,
# used for our initial hypothesis test
ols_formula = ("targetIncome ~ Age + Industry + Occupation + "
               "WagePerHr + CapitalGain + CapitalLoss + Dividends + "
               "INSTANCEWEIGHT + NumOfPersonForEmployer + OwnBusiness +"
               "VeteranBenefits + WeeksWorked + Year")
ols_spec = smf.ols(formula=ols_formula, data=censusDf_htest)
hypothesis_test_model = ols_spec.fit()

# Summary table of the fitted model
hypothesis_test_model.summary()

0,1,2,3
Dep. Variable:,targetIncome,R-squared:,0.195
Model:,OLS,Adj. R-squared:,0.195
Method:,Least Squares,F-statistic:,3710.0
Date:,"Fri, 24 Nov 2017",Prob (F-statistic):,0.0
Time:,21:03:57,Log-Likelihood:,22185.0
No. Observations:,199523,AIC:,-44340.0
Df Residuals:,199509,BIC:,-44200.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.4296,0.092,-4.688,0.000,-0.609,-0.250
Age,0.0010,3.15e-05,32.642,0.000,0.001,0.001
Industry,0.0004,4.23e-05,9.753,0.000,0.000,0.000
Occupation,-0.0040,4.61e-05,-86.916,0.000,-0.004,-0.004
WagePerHr,-8.225e-06,1.81e-06,-4.545,0.000,-1.18e-05,-4.68e-06
CapitalGain,9.776e-06,1.05e-07,93.318,0.000,9.57e-06,9.98e-06
CapitalLoss,9.952e-05,1.8e-06,55.394,0.000,9.6e-05,0.000
Dividends,1.552e-05,2.48e-07,62.566,0.000,1.5e-05,1.6e-05
INSTANCEWEIGHT,2.085e-06,4.89e-07,4.266,0.000,1.13e-06,3.04e-06

0,1,2,3
Omnibus:,119596.189,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,995579.191
Skew:,2.882,Prob(JB):,0.0
Kurtosis:,12.302,Cond. No.,895000.0


We can see from the above result that none of the estimated coefficients are zero, and all of the coefficients shown have significant p-values (P>|t| = 0.000); the overall F-statistic (3710, with Prob (F-statistic) = 0.0) likewise indicates that there is a significant relationship between the predictors and the response. 

* Hence we reject our Null Hypothesis $H_0$.

We should keep in mind that we have not considered all the features for our hypothesis generation, we will explore more about the nominal features as we proceed in the coming sections.

## Baseline

In order to evaluate our model we should define some baseline. Let us generate some statistics about our response variable so that we can set our baseline.

In [4]:
# Class frequencies of the response
incomeCount = censusDf['targetIncome'].value_counts()
print(incomeCount)

# Proportion of rows in class 0, i.e. the -50000 response, as a percentage
below_count = incomeCount[0]
total_rows = len(censusDf['targetIncome'])
below_pct = float(below_count/total_rows)*100
print(below_pct, "% people have income below $50000.")

0    187141
1     12382
Name: targetIncome, dtype: int64
93.79419916500854 % people have income below $50000.


Most of the values of the response variable, Income, are 0, which means that the dataset is heavily skewed towards incomes below \$50,000. Consequently, even if we always predict below \$50,000, our model accuracy would still be **93.79%**; this is the baseline our models have to beat.

---

## Data Wrangling

### 1. Missing Value Imputation

In [5]:
# Number of missing (NaN) values per column; show the five largest counts
null_counts = censusDf.isna().sum()
null_counts.sort_values(ascending=False).head()

HispanicOrigin    874
targetIncome        0
Race                0
Dividends           0
CapitalLoss         0
dtype: int64

* We can observe from the above statistics that, there are no missing values in numerical columns of the dataset. 
* There is only one column in which there are 874 missing values, which is 'HispanicOrigin'.
* From the first five rows of the dataframe displayed above we saw that there are some garbage/missing values in the dataframe labelled as '?'; let's try to track them.

In [6]:
# Track down the '?' placeholders: for every column that contains at least
# one of them, print how many there are
for col in censusDf.columns:
    is_placeholder = censusDf[col].isin(['?'])
    if is_placeholder.any():
        print(censusDf.loc[is_placeholder, col].value_counts())

?    708
Name: PrevState, dtype: int64
?    99696
Name: MigrationCode_MSA, dtype: int64
?    99696
Name: MigrationCode_REG, dtype: int64
?    99696
Name: MigrationCode_WITHIN_REG, dtype: int64
?    99696
Name: MigrationPrevResInSunbelt, dtype: int64
?    6713
Name: BirthCountryFather, dtype: int64
?    6119
Name: BirthCountryMother, dtype: int64
?    3393
Name: BirthCountrySelf, dtype: int64


It does not make much sense to substitute the above missing values, as they are nominal values. Let us label all the above missing values as 'Unavailable'. Also, there are four columns in which almost 50% of the values are '?'; it is better to drop those columns, as a high proportion of missing values can be misleading.

In [7]:
# Columns in which roughly half of the values are the '?' placeholder
mostly_unknown_cols = [
    'MigrationCode_MSA',
    'MigrationCode_REG',
    'MigrationCode_WITHIN_REG',
    'MigrationPrevResInSunbelt',
]

# New dataframe without those columns, and with every remaining '?'
# relabelled as 'Unavailable'
censusDf_cleaned = (
    censusDf
    .drop(mostly_unknown_cols, axis=1)
    .replace('?', 'Unavailable')
)

In [8]:
# Verify the replacement: for every column that now contains 'Unavailable',
# print how many such entries it has
for col in censusDf_cleaned.columns:
    is_unavailable = censusDf_cleaned[col].isin(['Unavailable'])
    if is_unavailable.any():
        print(censusDf_cleaned.loc[is_unavailable, col].value_counts())

Unavailable    708
Name: PrevState, dtype: int64
Unavailable    6713
Name: BirthCountryFather, dtype: int64
Unavailable    6119
Name: BirthCountryMother, dtype: int64
Unavailable    3393
Name: BirthCountrySelf, dtype: int64


* As we saw earlier, the column 'HispanicOrigin' has a few (874) missing values; let's see how the values are distributed in the column, so that we can impute the missing values.

In [9]:
# Frequency of each 'HispanicOrigin' category, most common first
hispanic_counts = censusDf_cleaned['HispanicOrigin'].value_counts()
hispanic_counts.sort_values(ascending=False)

All other                    171907
Mexican-American               8079
Mexican (Mexicano)             7234
Central or South American      3895
Puerto Rican                   3313
Other Spanish                  2485
Cuban                          1126
Do not know                     306
Chicano                         304
Name: HispanicOrigin, dtype: int64

Creating a new column for the missing values for HispanicOrigin.

In [10]:
# The missing 'HispanicOrigin' entries are NaN. The sentinel is kept in a
# variable, as before, in case later cells refer to it.
missing_val = np.nan
# Impute the missing values with an explicit 'None' category. fillna targets
# exactly the NaN entries, without masking the whole dataframe and without
# depending on the frame having at least two rows.
censusDf_cleaned['HispanicOrigin'] = censusDf_cleaned['HispanicOrigin'].fillna('None')

In [11]:
# Confirm the imputation: the formerly missing entries should now be counted
# under their own label
censusDf_cleaned.HispanicOrigin.value_counts()

All other                    171907
Mexican-American               8079
Mexican (Mexicano)             7234
Central or South American      3895
Puerto Rican                   3313
Other Spanish                  2485
Cuban                          1126
None                            874
Do not know                     306
Chicano                         304
Name: HispanicOrigin, dtype: int64

##### **Check for missing values one last time.**

In [12]:
# Re-count the missing (NaN) values per column on the cleaned dataframe
null_counts_cleaned = censusDf_cleaned.isna().sum()
null_counts_cleaned.sort_values(ascending=False).head()

targetIncome          0
OccupationCode        0
CapitalGain           0
EmploymentStatus      0
UnemploymentReason    0
dtype: int64

> Now there are no missing values in the dataset.

### 2. Feature Engineering

In [13]:
# Categorizing the columns

# Replacing the 'targetIncome' values with dummy variables
# - 50000. as the baseline. 0 for - 50000. and 1 for 50000+.
censusDf_cleaned['targetIncome'] = pd.get_dummies(
    censusDf_cleaned.targetIncome).iloc[:,1:]

# Features and Outcome
# `axis` is passed by keyword: DataFrame.drop no longer accepts it
# positionally in pandas 2.0 and later.
X = censusDf_cleaned.drop('targetIncome', axis=1)
Y = censusDf_cleaned.targetIncome
print("X (predictors) is ",X.shape[0],"rows,", X.shape[1],"columns, and..."\
      "\nY (response) is ",Y.shape[0],"rows.")


X (predictors) is  199523 rows, 37 columns, and...
Y (response) is  199523 rows.


##### **Let us check the number of categories in each categorical feature, and decide which ones to use in our model.**

In [14]:
# Print out the number of unique categorical values in each object-typed column
print("NUMBER OF UNIQUE VALUES IN EACH FEATURE:\n")
for col_name in X.columns:
    if X[col_name].dtype == 'object':
        # dropna=False counts a missing value as its own category,
        # like len(Series.unique()) does
        unique_val = X[col_name].nunique(dropna=False)
        # The message is a single literal: continuing the string with a
        # backslash used to embed the next line's indentation in the output.
        print("'{col_name}' has --> {unique_val}".format(
            col_name=col_name, unique_val=unique_val))

NUMBER OF UNIQUE VALUES IN EACH FEATURE:

'ClassOfWorker' has --> 9        
'Education' has --> 17        
'EducationalInst' has --> 3        
'MaritalStatus' has --> 7        
'IndustryCode' has --> 24        
'OccupationCode' has --> 15        
'Race' has --> 5        
'HispanicOrigin' has --> 10        
'Sex' has --> 2        
'MemLabourUnion' has --> 3        
'UnemploymentReason' has --> 6        
'EmploymentStatus' has --> 8        
'FEDERALTAX' has --> 6        
'TaxFilerStat' has --> 6        
'PrevState' has --> 51        
'HouseholdStatus' has --> 38        
'HouseholdSummary' has --> 8        
'HouseOneYearAgo' has --> 3        
'Parent' has --> 5        
'BirthCountryFather' has --> 43        
'BirthCountryMother' has --> 43        
'BirthCountrySelf' has --> 43        
'Citizenship' has --> 5        
'VeteranQA' has --> 3        


##### It looks like the columns 'BirthCountryFather', 'BirthCountryMother' and 'BirthCountrySelf' have same number of unique values. Let us keep only one column, and drop the other two.

In [15]:
# Drop the parents' birth-country columns and keep the person's own one
# under the shorter name 'BirthCountry'
X = (
    X
    .drop(['BirthCountryFather', 'BirthCountryMother'], axis=1)
    .rename(columns={'BirthCountrySelf': 'BirthCountry'})
)

In [16]:
# 'BirthCountry' has a lot of unique categories, but most of them only have
# a few observations compared with the largest one (United-States)
birth_country_counts = X['BirthCountry'].value_counts()
birth_country_counts.sort_values(ascending=False).head(10)

United-States         176989
Mexico                  5767
Unavailable             3393
Puerto-Rico             1400
Germany                  851
Philippines              845
Cuba                     837
Canada                   700
Dominican-Republic       690
El-Salvador              689
Name: BirthCountry, dtype: int64

In [17]:
# Bucket every category other than 'United-States' (the low frequency
# countries and the 'Unavailable' label alike) as 'Other-Countries'
born_in_us = X['BirthCountry'] == 'United-States'
X['BirthCountry'] = X['BirthCountry'].where(born_in_us, 'Other-Countries')
# check the values
X['BirthCountry'].value_counts().sort_values(ascending=False)

United-States      176989
Other-Countries     22534
Name: BirthCountry, dtype: int64

##### The column 'HouseholdStatus' has 38 unique values; only few of the categories have significant number of observations.

In [18]:
# Ten most frequent 'HouseholdStatus' categories
household_status_counts = X['HouseholdStatus'].value_counts()
household_status_counts.sort_values(ascending=False).head(10)

Householder                                        53248
Child <18 never marr not in subfamily              50326
Spouse of householder                              41695
Nonfamily householder                              22213
Child 18+ never marr Not in a subfamily            12030
Secondary individual                                6122
Other Rel 18+ ever marr not in subfamily            1956
Grandchild <18 never marr child of subfamily RP     1868
Other Rel 18+ never marr not in subfamily           1728
Grandchild <18 never marr not in subfamily          1066
Name: HouseholdStatus, dtype: int64

It is better to categorize the values as other, which does not have significant count.

In [19]:
# Short labels for the six categories that are kept; every other value is
# bucketed as 'Other_Householders'
household_buckets = {
    'Householder': 'Householder',
    'Child <18 never marr not in subfamily': 'Children',
    'Spouse of householder': 'Spouse',
    'Nonfamily householder': 'Nonfamily',
    'Child 18+ never marr Not in a subfamily': 'Child_18_plus',
    'Secondary individual': 'Secondary_indv',
}
# Values that are not keys of the mapping come back as NaN from map(),
# and are then filled with the catch-all label
X['HouseholdStatus'] = (
    X['HouseholdStatus']
    .map(household_buckets)
    .fillna('Other_Householders')
)
# check the values
X['HouseholdStatus'].value_counts().sort_values(ascending=False)

Householder           53248
Children              50326
Spouse                41695
Nonfamily             22213
Other_Householders    13889
Child_18_plus         12030
Secondary_indv         6122
Name: HouseholdStatus, dtype: int64

##### Let's check the 'PrevState' column; there are 51 unique values for the feature, let's see what they are.

In [20]:
# Ten most frequent 'PrevState' values
prev_state_counts = X['PrevState'].value_counts()
prev_state_counts.sort_values(ascending=False).head(10)

Not in universe    183750
California           1714
Utah                 1063
Florida               849
North Carolina        812
Unavailable           708
Abroad                671
Oklahoma              626
Minnesota             576
Indiana               533
Name: PrevState, dtype: int64

With approximately 200,000 rows in our dataset, almost 184,000 values of the 'PrevState' column say 'Not in universe', which is about 92% of all the rows. Since the survey has been conducted in the United States of America, all of them must belong to a state, hence the values stating "Not in universe" are the missing values. Having so little information about the state doesn't seem to be helpful, so it is better that we drop this feature from our list of predictor variables.

In [21]:
# Remove the 'PrevState' feature from the predictors
X = X.drop(labels=['PrevState'], axis='columns')

#### Creating Dummies

**Converting categorical variables into _Dummy Variables_.** If we want to include a categorical feature in our machine learning model, one common solution is to create dummy variables. We drop the original feature from the dataset and add a dummied version of the feature to the dataset, which is easier for the model to interpret.

In [22]:
# Categorical features to be replaced by dummy variables.
# The column names are listed in ascending order of their number of
# distinct values; the dummy columns are generated in this same order.
features_to_dummy = [
    'Sex',
    'BirthCountry',
    'Year',
    'EducationalInst',
    'MemLabourUnion',
    'HouseOneYearAgo',
    'OwnBusiness',
    'VeteranQA',
    'VeteranBenefits',
    'Race',
    'Parent',
    'Citizenship',
    'UnemploymentReason',
    'FEDERALTAX',
    'TaxFilerStat',
    'MaritalStatus',
    'HouseholdStatus',
    'NumOfPersonForEmployer',
    'EmploymentStatus',
    'HouseholdSummary',
    'ClassOfWorker',
    'HispanicOrigin',
    'OccupationCode',
    'Education',
    'IndustryCode',
    'Occupation',
    'Industry',
    'WeeksWorked',
]

Define a function to create dummy variables of the dataframe from the list of columns.

In [23]:
# Function to create the dummy categorical variables used for modeling
def create_dummies(df, col_name_list):
    """
    Replace the listed columns of a dataframe with their dummy variables.

    The input dataframe is left untouched. In the returned dataframe the
    columns that are not listed keep their original order and are followed
    by the dummy columns of each listed feature, in the order of
    col_name_list. Dummy columns are named '<feature>_<value>'; missing
    values do not get an indicator column of their own (dummy_na=False).

    :param df: source dataframe
    :param col_name_list: list of the column names from the dataset
    :return: a new dataframe in which every feature of col_name_list is
             replaced by its dummy variables
    :raises KeyError: if a name in col_name_list is not a column of df
    """
    col_name_list = list(col_name_list)
    # Encode every requested feature first ...
    dummies = [pd.get_dummies(df[x], prefix=x, dummy_na=False)
               for x in col_name_list]
    # ... then drop the originals in one go. `axis` is given by keyword
    # because DataFrame.drop rejects a positional axis in pandas >= 2.0.
    remaining = df.drop(col_name_list, axis=1)
    # A single concat instead of one per feature avoids re-copying the
    # growing dataframe on every iteration.
    return pd.concat([remaining] + dummies, axis=1)

Calling the function create_dummies to convert our features in to dummy variables.

In [24]:
# Before dummies
print("Dataframe X has", X.shape[1], "columns and", X.shape[0], "rows.")

# Call the function create_dummies on X and replace the features with dummies.
# X is rebound to the result, so this cell is meant to be run once per kernel:
# a second run would look for feature columns that are already replaced.
print("Creating dummies ...")
X = create_dummies(X, features_to_dummy)

# Printing the dimensions of the modified feature set
print("*** Now our dataframe has", X.shape[1], "columns and", X.shape[0], "rows. ***")

# display first five rows of all the features
with pd.option_context('display.max_columns', None):
    display(X.head())

Dataframe X has 34 columns 199523 and rows.
Creating dummies ...
*** Now our dataframe has 327 columns 199523 and rows. ***


Unnamed: 0,Age,WagePerHr,CapitalGain,CapitalLoss,Dividends,INSTANCEWEIGHT,Sex_Female,Sex_Male,BirthCountry_Other-Countries,BirthCountry_United-States,Year_94,Year_95,EducationalInst_College or university,EducationalInst_High school,EducationalInst_Not in universe,MemLabourUnion_No,MemLabourUnion_Not in universe,MemLabourUnion_Yes,HouseOneYearAgo_No,HouseOneYearAgo_Not in universe under 1 year old,HouseOneYearAgo_Yes,OwnBusiness_0,OwnBusiness_1,OwnBusiness_2,VeteranQA_No,VeteranQA_Not in universe,VeteranQA_Yes,VeteranBenefits_0,VeteranBenefits_1,VeteranBenefits_2,Race_Amer Indian Aleut or Eskimo,Race_Asian or Pacific Islander,Race_Black,Race_Other,Race_White,Parent_Both parents present,Parent_Father only present,Parent_Mother only present,Parent_Neither parent present,Parent_Not in universe,Citizenship_Foreign born- Not a citizen of U S,Citizenship_Foreign born- U S citizen by naturalization,Citizenship_Native- Born abroad of American Parent(s),Citizenship_Native- Born in Puerto Rico or U S Outlying,Citizenship_Native- Born in the United States,UnemploymentReason_Job leaver,UnemploymentReason_Job loser - on layoff,UnemploymentReason_New entrant,UnemploymentReason_Not in universe,UnemploymentReason_Other job loser,UnemploymentReason_Re-entrant,FEDERALTAX_Head of household,FEDERALTAX_Joint both 65+,FEDERALTAX_Joint both under 65,FEDERALTAX_Joint one under 65 & one 65+,FEDERALTAX_Nonfiler,FEDERALTAX_Single,TaxFilerStat_Abroad,TaxFilerStat_Midwest,TaxFilerStat_Northeast,TaxFilerStat_Not in universe,TaxFilerStat_South,TaxFilerStat_West,MaritalStatus_Divorced,MaritalStatus_Married-A F spouse present,MaritalStatus_Married-civilian spouse present,MaritalStatus_Married-spouse absent,MaritalStatus_Never 
married,MaritalStatus_Separated,MaritalStatus_Widowed,HouseholdStatus_Child_18_plus,HouseholdStatus_Children,HouseholdStatus_Householder,HouseholdStatus_Nonfamily,HouseholdStatus_Other_Householders,HouseholdStatus_Secondary_indv,HouseholdStatus_Spouse,NumOfPersonForEmployer_0,NumOfPersonForEmployer_1,NumOfPersonForEmployer_2,NumOfPersonForEmployer_3,NumOfPersonForEmployer_4,NumOfPersonForEmployer_5,NumOfPersonForEmployer_6,EmploymentStatus_Children or Armed Forces,EmploymentStatus_Full-time schedules,EmploymentStatus_Not in labor force,EmploymentStatus_PT for econ reasons usually FT,EmploymentStatus_PT for econ reasons usually PT,EmploymentStatus_PT for non-econ reasons usually FT,EmploymentStatus_Unemployed full-time,EmploymentStatus_Unemployed part- time,HouseholdSummary_Child 18 or older,HouseholdSummary_Child under 18 ever married,HouseholdSummary_Child under 18 never married,HouseholdSummary_Group Quarters- Secondary individual,HouseholdSummary_Householder,HouseholdSummary_Nonrelative of householder,HouseholdSummary_Other relative of householder,HouseholdSummary_Spouse of householder,ClassOfWorker_Federal government,ClassOfWorker_Local government,ClassOfWorker_Never worked,ClassOfWorker_Not in universe,ClassOfWorker_Private,ClassOfWorker_Self-employed-incorporated,ClassOfWorker_Self-employed-not incorporated,ClassOfWorker_State government,ClassOfWorker_Without pay,HispanicOrigin_All other,HispanicOrigin_Central or South American,HispanicOrigin_Chicano,HispanicOrigin_Cuban,HispanicOrigin_Do not know,HispanicOrigin_Mexican (Mexicano),HispanicOrigin_Mexican-American,HispanicOrigin_None,HispanicOrigin_Other Spanish,HispanicOrigin_Puerto Rican,OccupationCode_Adm support including clerical,OccupationCode_Armed Forces,OccupationCode_Executive admin and managerial,OccupationCode_Farming forestry and fishing,OccupationCode_Handlers equip cleaners etc,OccupationCode_Machine operators assmblrs & inspctrs,OccupationCode_Not in universe,OccupationCode_Other 
service,OccupationCode_Precision production craft & repair,OccupationCode_Private household services,OccupationCode_Professional specialty,OccupationCode_Protective services,OccupationCode_Sales,OccupationCode_Technicians and related support,OccupationCode_Transportation and material moving,Education_10th grade,Education_11th grade,Education_12th grade no diploma,Education_1st 2nd 3rd or 4th grade,Education_5th or 6th grade,Education_7th and 8th grade,Education_9th grade,Education_Associates degree-academic program,Education_Associates degree-occup /vocational,Education_Bachelors degree(BA AB BS),Education_Children,Education_Doctorate degree(PhD EdD),Education_High school graduate,Education_Less than 1st grade,Education_Masters degree(MA MS MEng MEd MSW MBA),Education_Prof school degree (MD DDS DVM LLB JD),Education_Some college but no degree,IndustryCode_Agriculture,IndustryCode_Armed Forces,IndustryCode_Business and repair services,IndustryCode_Communications,IndustryCode_Construction,IndustryCode_Education,IndustryCode_Entertainment,IndustryCode_Finance insurance and real estate,IndustryCode_Forestry and fisheries,IndustryCode_Hospital services,IndustryCode_Manufacturing-durable goods,IndustryCode_Manufacturing-nondurable goods,IndustryCode_Medical except hospital,IndustryCode_Mining,IndustryCode_Not in universe or children,IndustryCode_Other professional services,IndustryCode_Personal services except private HH,IndustryCode_Private household services,IndustryCode_Public administration,IndustryCode_Retail trade,IndustryCode_Social services,IndustryCode_Transportation,IndustryCode_Utilities and sanitary services,IndustryCode_Wholesale 
trade,Occupation_0,Occupation_1,Occupation_2,Occupation_3,Occupation_4,Occupation_5,Occupation_6,Occupation_7,Occupation_8,Occupation_9,Occupation_10,Occupation_11,Occupation_12,Occupation_13,Occupation_14,Occupation_15,Occupation_16,Occupation_17,Occupation_18,Occupation_19,Occupation_20,Occupation_21,Occupation_22,Occupation_23,Occupation_24,Occupation_25,Occupation_26,Occupation_27,Occupation_28,Occupation_29,Occupation_30,Occupation_31,Occupation_32,Occupation_33,Occupation_34,Occupation_35,Occupation_36,Occupation_37,Occupation_38,Occupation_39,Occupation_40,Occupation_41,Occupation_42,Occupation_43,Occupation_44,Occupation_45,Occupation_46,Industry_0,Industry_1,Industry_2,Industry_3,Industry_4,Industry_5,Industry_6,Industry_7,Industry_8,Industry_9,Industry_10,Industry_11,Industry_12,Industry_13,Industry_14,Industry_15,Industry_16,Industry_17,Industry_18,Industry_19,Industry_20,Industry_21,Industry_22,Industry_23,Industry_24,Industry_25,Industry_26,Industry_27,Industry_28,Industry_29,Industry_30,Industry_31,Industry_32,Industry_33,Industry_34,Industry_35,Industry_36,Industry_37,Industry_38,Industry_39,Industry_40,Industry_41,Industry_42,Industry_43,Industry_44,Industry_45,Industry_46,Industry_47,Industry_48,Industry_49,Industry_50,Industry_51,WeeksWorked_0,WeeksWorked_1,WeeksWorked_2,WeeksWorked_3,WeeksWorked_4,WeeksWorked_5,WeeksWorked_6,WeeksWorked_7,WeeksWorked_8,WeeksWorked_9,WeeksWorked_10,WeeksWorked_11,WeeksWorked_12,WeeksWorked_13,WeeksWorked_14,WeeksWorked_15,WeeksWorked_16,WeeksWorked_17,WeeksWorked_18,WeeksWorked_19,WeeksWorked_20,WeeksWorked_21,WeeksWorked_22,WeeksWorked_23,WeeksWorked_24,WeeksWorked_25,WeeksWorked_26,WeeksWorked_27,WeeksWorked_28,WeeksWorked_29,WeeksWorked_30,WeeksWorked_31,WeeksWorked_32,WeeksWorked_33,WeeksWorked_34,WeeksWorked_35,WeeksWorked_36,WeeksWorked_37,WeeksWorked_38,WeeksWorked_39,WeeksWorked_40,WeeksWorked_41,WeeksWorked_42,WeeksWorked_43,WeeksWorked_44,WeeksWorked_45,WeeksWorked_46,WeeksWorked_47,WeeksWorked_48,WeeksWo
rked_49,WeeksWorked_50,WeeksWorked_51,WeeksWorked_52
0,73,0,0,0,0,1700.09,1,0,0,1,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,58,0,0,0,0,1053.55,0,1,0,1,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,18,0,0,0,0,991.95,1,0,1,0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,0,0,0,0,1758.14,1,0,0,1,1,0,0,0,1,0,1,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,10,0,0,0,0,1069.16,1,0,0,1,1,0,0,0,1,0,1,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Principal Components Analysis

Principal component analysis (PCA) transforms the dataset of many features into few Principal Components that "summarize" the variance underlying in the data. It is the most common way of dimensionality reduction, and it works well where the features are highly correlated. The drawback of using PCA is that it makes it difficult to interpret the data.

In [25]:
# We will use PCA from sklearn.decomposition to find the principal components.
# random_state is fixed so that a re-run reproduces the same components when
# scikit-learn picks a randomized SVD solver for a matrix of this size.
pca = PCA(n_components=10, random_state=1) # 10 principal components
X_pca = pd.DataFrame(pca.fit_transform(X))

In [26]:
# First five rows of the 10 principal components
X_pca.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-446.982542,-168.035772,-40.381395,-64.759638,-13.50166,-39.122762,2.385361,1.256711,0.119348,0.841754
1,-447.349032,-167.870671,-686.914257,-62.029238,-12.985453,-24.142538,-0.927172,-1.016997,-1.472035,-0.230068
2,-447.394376,-167.897641,-748.51238,-61.951289,-13.070459,15.885651,0.728191,1.405955,0.400897,1.034297
3,-446.96712,-168.12295,17.670191,-65.306601,-13.773969,24.926329,1.616421,-0.938406,0.308593,0.181694
4,-447.353242,-167.928242,-671.302999,-62.323231,-13.166087,23.967267,1.601772,-0.929753,0.280612,0.209125


_In this case we will not proceed with the principal components. Because it is not recommended to perform PCA on categorical data._

### Model Building

In [27]:
# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

# metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

In [28]:
class ModelMetrics(object):
    """Cross-validate one classifier and record its mean scores in a shared registry.

    Every instance writes into the class-level ``model_scores`` dictionary, so
    the results of all evaluated models can be read afterwards from
    ``ModelMetrics.model_scores``.
    """

    # Random permutation cross-validator with 80-20 train test split
    cv = ShuffleSplit(n_splits = 3, test_size = 0.2, random_state=1)
    # dictionary to store scores, shared by all instances:
    # {model_name: [{metric: mean_score}, ...]}
    model_scores = dict()
    # default scoring metrics
    default_metric = ['accuracy','precision', 'recall']

    def __init__(self, model_name, model_obj, features, response):
        """Register a model for scoring.

        Parameters
        ----------
        model_name : key under which the scores are stored in ``model_scores``.
        model_obj : estimator passed to ``cross_val_score``.
        features : feature data passed to ``cross_val_score``.
        response : target data passed to ``cross_val_score``.
        """
        self.model_name = model_name
        self.model_obj = model_obj
        # Creating an instance (re)starts this model's entry in the shared registry
        ModelMetrics.model_scores[model_name] = []
        self.cv = ModelMetrics.cv
        self.features = features
        self.response = response
        self.model_scores = ModelMetrics.model_scores

    def model_scoring(self, scoring_metric=default_metric):
        """Cross-validate the model once per metric and store the mean scores.

        Parameters
        ----------
        scoring_metric : iterable of scoring names (or a single name as a
            string) forwarded one at a time to ``cross_val_score`` as
            ``scoring``. Defaults to ``default_metric``.

        Returns
        -------
        dict
            The shared ``model_scores`` registry, in which this model's entry is
            a list of ``{metric: mean_score}`` dictionaries.
        """
        # A bare string would otherwise be iterated character by character
        if isinstance(scoring_metric, str):
            scoring_metric = [scoring_metric]

        recorded = self.model_scores[self.model_name]
        for metric in scoring_metric:
            n_fold_score = cross_val_score(self.model_obj, self.features,
                                           self.response,
                                           cv=self.cv,
                                           scoring=metric)
            mean_score = np.mean(n_fold_score)
            # Replace (in place) any earlier result for the same metric so that
            # calling this method again does not pile up duplicate entries
            recorded[:] = [entry for entry in recorded if metric not in entry]
            recorded.append({metric: mean_score})
        return self.model_scores

### Using a smaller dataset to test the model scoring functions

In [29]:
# Keep only the first 500 rows of the features and of the response
X_small, Y_small = X.head(500), Y.head(500)

#### Logistic Regression Model

In [30]:
%%timeit
# Classifier implementing Logistic Regression
clf_log_reg = LogisticRegression()
# Creating object for metrics
log_reg_metrics_obj = ModelMetrics("Logistic Regression", clf_log_reg, X_small, Y_small)
# get the metrics
log_reg_metrics = log_reg_metrics_obj.model_scoring()

103 ms ± 2.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### KNN

In [31]:
%%timeit
# Classifier implementing the k-nearest neighbors
clf_knn = KNeighborsClassifier()
# Creating object for metrics
clf_knn_metrics_obj = ModelMetrics("k-nearest neighbors", clf_knn, X_small, Y_small)
# get the metrics
clf_knn_metrics = clf_knn_metrics_obj.model_scoring()

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


164 ms ± 3.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


  'precision', 'predicted', average, warn_for)


#### Decision Tree

In [32]:
%%timeit
# Classifier implementing the Decision Tree
clf_d_tree = DecisionTreeClassifier()
# Creating object for metrics
clf_d_tree_metrics_obj = ModelMetrics("Decision Tree", clf_d_tree, X_small, Y_small)
# get the metrics
clf_d_tree_metrics = clf_d_tree_metrics_obj.model_scoring()

75.8 ms ± 996 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### Linear Discriminant Analysis

In [33]:
%%timeit
# Classifier implementing  Linear Discriminant Analysis
clf_LDA = LinearDiscriminantAnalysis()
# Creating object for metrics
clf_LDA_metrics_obj = ModelMetrics("Linear Discriminant Analysis", 
                                      clf_LDA, X_small, Y_small)
# get the metrics
clf_LDA_metrics = clf_LDA_metrics_obj.model_scoring()





329 ms ± 2.34 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)




#### Random Forest Classifier

In [34]:
%%timeit
# Classifier implementing  Random Forest Classifier
clf_RF = RandomForestClassifier()
# Creating object for metrics
clf_RF_metrics_obj = ModelMetrics("Random Forest Classifier", 
                                      clf_RF, X_small, Y_small)
# get the metrics
clf_RF_metrics = clf_RF_metrics_obj.model_scoring()

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

147 ms ± 540 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Printing the scores

In [35]:
# Report the recorded scores of every model held in the shared registry
for model_name, scores in ModelMetrics.model_scores.items():
    print("Model=", model_name, "\nScores=", scores)

Model= Logistic Regression 
Scores= [{'accuracy': 0.95666666666666667}, {'precision': 0.44444444444444442}, {'recall': 0.12222222222222223}]
Model= k-nearest neighbors 
Scores= [{'accuracy': 0.95666666666666667}, {'precision': 0.0}, {'recall': 0.0}]
Model= Decision Tree 
Scores= [{'accuracy': 0.92000000000000004}, {'precision': 0.20000000000000004}, {'recall': 0.12222222222222223}]
Model= Linear Discriminant Analysis 
Scores= [{'accuracy': 0.89666666666666661}, {'precision': 0.09696969696969697}, {'recall': 0.17777777777777778}]
Model= Random Forest Classifier 
Scores= [{'accuracy': 0.95999999999999996}, {'precision': 0.0}, {'recall': 0.0}]


---

## Future work

- Work on feature engineering: explore features that can help predict the target more effectively, and come up with new columns.
- Data wrangling: clean the data wherever possible, and drop columns if required.
- Work on the hypothesis and the baseline.
- Work on model building and fitting the data to the models, using the concepts taught in class. A few of the models and techniques we are planning to use:
    - Logistic Regression
    - Gaussian Naive Bayes
    - Concepts of resampling, for example Cross Validation
    - Decision Tree
    - Random Forest classifier
    - Linear Regression
    - Ridge/Lasso Regression
    - Support Vector Machines
- Create diagnostic tools for the models.
- Define a metric to compare the models.