Import all packages, define the helper functions used for data preprocessing, and read the dataset.

In [1]:
#### Import all packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import tree, neighbors, svm
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

import warnings
# Suppress all warnings (the filter below is not limited to future warnings)
warnings.filterwarnings("ignore")

# Repeated functions
def getMode(df, attribute, condAtt, cond):
    """Return the most frequent value of `attribute` within one group of rows.

    The group is made of the rows of `df` whose `condAtt` column equals
    `cond`. When several values tie for most frequent, the first entry of
    the Series returned by `mode()` is used.
    """
    in_group = df[condAtt] == cond
    group_modes = df.loc[in_group, attribute].mode()
    return group_modes[0]

def replaceWithMode(df, attribute, condAtt, cond):
    """Fill the '?' placeholders of `attribute` within one group, in place.

    The group is made of the rows of `df` whose `condAtt` column equals
    `cond`. Inside that group every '?' in `attribute` is replaced with the
    most frequent *known* value of `attribute` for the same group.

    The '?' placeholder is excluded when the mode is computed; otherwise a
    group dominated by unknowns would "replace" '?' with '?'. When the group
    is empty or holds no known value, `df` is left unchanged.

    Returns None; `df[attribute]` is reassigned on the frame passed in.
    """
    in_group = df[condAtt] == cond
    is_unknown = df[attribute] == '?'

    # Mode over known values only (mode() also ignores NaN by default).
    known_modes = df.loc[in_group & ~is_unknown, attribute].mode()
    if known_modes.empty:
        # Nothing to impute from: keep the placeholders as they are.
        return

    df[attribute] = df[attribute].mask(in_group & is_unknown, known_modes.iloc[0])
    

# Load the raw dataset. skipinitialspace=True makes the parser skip the blanks
# that follow each delimiter, so text values carry no leading space and can be
# matched exactly by the later .replace / == comparisons.
DATA_PATH = './HouseholderAtRisk(1).csv'
df = pd.read_csv(DATA_PATH, skipinitialspace=True)



# Task 1 
**1) What proportion of households are at high risk?**

The proportion of high-risk households is calculated with `value_counts`.

In [None]:
# Task 1 question 1
# Show the proportion of households in every AtRisk class.
# value_counts() is indexed by the class labels themselves, so an integer key
# such as risks[0] is ambiguous (label lookup vs. positional lookup) and does
# not name the high-risk class. Printing the label-keyed proportions lets the
# high-risk share be read off by its label.
risks = df['AtRisk'].value_counts()
risk_proportions = risks / len(df)
print("Proportion of households per AtRisk class:")
print(risk_proportions)

# Task 1
**2) Did you have to fix any data quality problems? Detail them.**


In [2]:
# Drop the rows in which Relationship, Sex and NumYearsEducation are ALL
# missing (how='all'); rows missing only some of the three are kept.
df = df.dropna(subset=["Relationship", "Sex", "NumYearsEducation"], how='all')

# Replace inconsistency in CountryOfOrigin: fold "US" and "United-States" into "USA"
df['CountryOfOrigin'] = df['CountryOfOrigin'].replace({"US": "USA", "United-States": "USA"})

# Removing Gender to use numerical binary for Sex where 0 = Male, 1 = Female
df = df.drop('Gender', axis=1)

# Replace the -1 placeholder in Age with the mode (most frequent value).
# The mode is taken over the valid ages only, so the placeholder can never be
# picked as its own replacement.
validAges = df.loc[df['Age'] != -1, 'Age']
modeAge = validAges.mode()[0]
df.loc[df['Age'] == -1, 'Age'] = modeAge

# Cast Age to int (astype truncates any fractional part, it does not round)
df['Age'] = df['Age'].astype(int)

# Drop race as there are 39954 NaN vs 45 labelled classes
df = df.drop('Race', axis=1)


# Set an upper boundary of 90 hours per week in NumWorkingHoursPerWeek.
# Values above 90 -- and missing values, which fail the <= 90 test -- are
# replaced with the mode of the in-range values.
hoursWithinLimit = df['NumWorkingHoursPerWeek'] <= 90
modeWorkHours = df.loc[hoursWithinLimit, 'NumWorkingHoursPerWeek'].mode()[0]
df['NumWorkingHoursPerWeek'] = df['NumWorkingHoursPerWeek'].where(hoursWithinLimit, modeWorkHours)
# Cast hours to int (truncates any fractional part)
df['NumWorkingHoursPerWeek'] = df['NumWorkingHoursPerWeek'].astype(int)

# Replaces missing data with mean of column
df['Weighting'] = df['Weighting'].fillna(df['Weighting'].mean())


# Replaces missing data with unknown Occupation ('?' is the unknown marker)
df['Occupation'] = df['Occupation'].fillna("?")



# Replaces unknowns with the mode of attribute
# By WorkClass
for workClass in ('Federal-gov', 'Self-emp-inc', 'Private', 'Never-worked'):
    replaceWithMode(df, 'Occupation', 'WorkClass', workClass)

# By education (missing Education values are skipped: NaN never compares equal,
# so such a group would always be empty)
for values in df['Education'].dropna().unique():
    replaceWithMode(df, 'Occupation', 'Education', values)
    replaceWithMode(df, 'WorkClass', 'Education', values)


# Never-worked adults will have occupation set as other services as a generic unknown
df['Occupation'] = df['Occupation'].mask(((df['WorkClass']=='Never-worked') & (df['Occupation']=='?')), "Other-service")


# Turning CapitalAvg into binary options: every value > 0 becomes 1, other
# values are left as they are.
# .loc writes to the frame itself; the chained form df[col][mask] = 1 may write
# to a temporary copy instead.
df.loc[df['CapitalAvg'] > 0, 'CapitalAvg'] = 1
# Assign the cast back -- astype returns a new Series and does not convert in place.
df['CapitalAvg'] = df['CapitalAvg'].astype(int)

# Display the converted column as the cell output
df['CapitalAvg']

0        0
1        0
2        0
3        1
4        0
5        0
6        0
7        1
8        0
9        0
10       1
11       0
12       0
13       0
14       1
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
23       0
24       0
25       1
26       0
27       0
28       0
29       0
        ..
39969    0
39970    0
39971    0
39972    0
39973    0
39974    0
39975    0
39976    0
39977    0
39978    0
39979    0
39980    0
39981    1
39982    1
39983    0
39984    0
39985    0
39986    0
39987    0
39988    0
39989    0
39990    1
39991    0
39992    1
39993    0
39994    0
39995    0
39996    0
39997    0
39998    0
Name: CapitalAvg, Length: 39027, dtype: int32

In [3]:

# Encode WorkClass as integer codes; the position of a label in this list is
# its code (Private -> 0 ... Never-worked -> 7).
workclassOrder = ['Private', 'Local-gov', 'Self-emp-not-inc', 'Federal-gov',
                  'State-gov', 'Self-emp-inc', 'Without-pay', 'Never-worked']
workclassMapping = {label: code for code, label in enumerate(workclassOrder)}

# Series.map turns any value that is not a key of the mapping into NaN.
df['WorkClass'] = df['WorkClass'].map(workclassMapping)

# Show the distinct codes present after encoding
df['WorkClass'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int64)

# Task 2 Predictive Modelling using Decision Trees

**Dataset is split into target and input types**

By dropping ID, target variable and other object types, a decision tree can be formed.

10-fold cross-validation (k=10) is run on the training set, which is 70% of the data, so each fold holds 7% of the full dataset. The tree is then fitted on the whole training set and is ready for analysis.


In [4]:
# target/input split
y = df['AtRisk']
# Drop the identifier, the target and the columns not used as inputs here
# (original note: all object-type columns are dropped temporarily)
X = df.drop(['ID', 'AtRisk', 'CapitalLoss', 'CapitalGain', 'Education', 'Marital-Status','Occupation', 'Relationship', 'CountryOfOrigin'], axis=1)

# DataFrame.as_matrix() was deprecated and then removed in pandas 1.0;
# .values yields the same NumPy array on both old and current pandas.
X_mat = X.values
# 70/30 split, stratified on the target so both parts keep the class balance
X_train, X_test, y_train, y_test = train_test_split(X_mat, y, test_size=0.3, stratify=y, random_state=10)

# simple decision tree training
clf = DecisionTreeClassifier(random_state=10)
# 10-fold cross-validation scores on the training split (stored in K_fold, not printed here)
K_fold = cross_val_score(clf, X_train, y_train, cv=10)
# Final fit on the whole training split; as the last expression it also
# displays the fitted estimator.
clf.fit(X_train, y_train)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=10,
            splitter='best')

# Task 2 

**1a) What is classification accuracy on training and test datasets?**

**b) Which variable is used for the first split? What are the variables that are used for the second split?**

In [5]:
# Task 2 1a
# Accuracy of the fitted tree on the training split and on the held-out split
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)

# Task 2 1b

# Feature importances come from the model; the matching names come from the
# columns of the original X (same order as the columns of the training matrix)
feature_names = X.columns
importances = clf.feature_importances_

# argsort is ascending, so reverse it to list the most important feature first
indices = np.argsort(importances)[::-1]

for position in indices:
    print(feature_names[position], ':', importances[position])

Train accuracy: 0.9993044878834468
Test accuracy: 0.7517294388931591
Weighting : 0.3749799532153776
Age : 0.2268747593960812
NumYearsEducation : 0.16307633901215637
NumWorkingHoursPerWeek : 0.09401493604386964
CapitalAvg : 0.05550375978750433
WorkClass : 0.05205092769757479
Sex : 0.033499324847436104


# Task 2

**2) Build another decision tree tuned with GridSearchCV**

Here the tuned hyperparameter is the maximum depth of the tree: GridSearchCV tries depths 1 to 99 (`np.arange(1, 100)` excludes 100) with 3-fold cross-validation (k=3). The mean cross-validated accuracy and the per-fold scores are then printed.

In [6]:
# Tune max_depth with a grid search.
# random_state=10 matches the earlier baseline tree so that reruns of this
# cell give the same result.
clf = DecisionTreeClassifier(random_state=10)
# Candidate depths 1..99 (np.arange excludes the stop value 100)
tree_depth = np.arange(1,100)
# The `iid` argument is not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, where passing it raises a TypeError.
gs = GridSearchCV(clf, param_grid={'max_depth':tree_depth}, cv=3)

# Nested cross-validation: each of the 3 outer folds runs the whole grid search
K_fold_prediction = cross_val_score(gs, X_train, y_train, cv=3)
print("Accuracy: %0.2f (+/- %0.2f)" % (K_fold_prediction.mean(), K_fold_prediction.std() * 2))
print("Cross validation scores are:", K_fold_prediction)

# Fit the search on the training split
gs.fit(X_train, y_train)
# best_score_ is the mean cross-validated score of the best max_depth,
# not the accuracy measured on the full training split.
print("The training set gave a best score of " + str(gs.best_score_))
    

Accuracy: 0.82 (+/- 0.00)
Cross validation scores are: [0.8163848  0.81814188 0.81880079]
The training set gave a best score of 0.8181052785709056


In [7]:
# Task 2 2a
# Accuracy of the fitted grid search (gs) on the training split and on the
# held-out split
tuned_train_accuracy = gs.score(X_train, y_train)
tuned_test_accuracy = gs.score(X_test, y_test)
print("Train accuracy:", tuned_train_accuracy)
print("Test accuracy:", tuned_test_accuracy)


Train accuracy: 0.8202284208214364
Test accuracy: 0.8115979161328892
