# Project: ML - Decision Tree Classifier, Random Forest Classifier (Predict Income from US census)

<br>

## Questions to address:
- From the data on the US census, predict whether someone has a high income or not

<br>

## Tools:
- Models: 
  - Decision Tree Classifier
  - Random Forest Classifier
- Error Metric: AUC (ideal for binary classification)
- Holdout Validation 

<br>

### load defaults

In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import requests 

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from matplotlib import rcParams
import matplotlib.dates as mdates
from datetime import datetime
from IPython.display import display, Math

from functions import *

plt.rcParams.update({'axes.titlepad': 20, 'font.size': 12, 'axes.titlesize':20})

colors = [(0/255,107/255,164/255), (255/255, 128/255, 14/255), 'red', 'green', '#9E80BA', '#8EDB8E', '#58517A']
Ncolors = 10
color_map = plt.cm.Blues_r(np.linspace(0.2, 0.5, Ncolors))
#color_map = plt.cm.tab20c_r(np.linspace(0.2, 0.5, Ncolors))


#specific to this project
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier


print("Defaults Loaded")

Defaults Loaded


<br>

## Dataset: US census, predict high or low income

In [5]:
# Set index_col to False to avoid pandas thinking that the first column is row indexes (it's age)
income = pd.read_csv("./data/income.csv", index_col=False)
display(income.iloc[:2,[0,1,2,3,4,5,6,7,8,9,10,11,-1]])

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,<=50K


In [6]:
# Convert text categories to numbers
cols = ['workclass', 'education', 'marital_status', 'occupation', 
        'relationship', 'race', 'sex', 'native_country', 'high_income']

for element in cols:
    #col = pandas.Categorical.from_array(income[element])
    col = pd.Categorical(income[element])
    income[element] = col.codes

display(income.iloc[:2,[0,1,2,3,4,5,6,7,8,9,10,11,-1]])

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,high_income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,0


<br>

### Decision Tree Classifier with holdout validation

In [13]:
#columns to train with (all have been converted to numeric)
columns = ["age", "workclass", "education_num", "marital_status", "occupation", 
           "relationship", "race", "sex", "hours_per_week", "native_country"]


# Shuffle the rows  
np.random.seed(1)
income = income.reindex(np.random.permutation(income.index))

#create train adn test set for holdout validation
train_max_row = int(np.floor(income.shape[0] * .8))
train = income[0:train_max_row]
test = income[train_max_row:]

# Instantiate the classifier (Set random_state to 1 to be able to replicate)
clf = DecisionTreeClassifier(random_state=1)
clf.fit(train[columns], train["high_income"])

predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test['high_income'], predictions)

predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train['high_income'], predictions)
print("AUC train:{:0.3f}, AUC test:{:0.3f}".format(train_auc, test_auc))

AUC train:0.947, AUC test:0.703


The predictions on train set are significantly better, hint of **overfitting**

To avoid overfittin restrict max_depth and min_samples_split

In [14]:
clf = DecisionTreeClassifier(random_state=1, max_depth=7, min_samples_split=13)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test["high_income"], predictions)

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train["high_income"], train_predictions)

print("AUC train:{:0.3f}, AUC test:{:0.3f}".format(train_auc, test_auc))

AUC train:0.756, AUC test:0.754


<br>

### Random Forest Classifier with holdout validation

In [25]:
#max_features='auto' = sqrt(n_features)
clf = RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=5, 
                             criterion='gini', max_features='auto')
clf.fit(train[columns], train["high_income"])

predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test["high_income"], predictions)

predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train["high_income"], predictions)

print("AUC train:{:0.3f}, AUC test:{:0.3f}".format(train_auc, test_auc))

AUC train:0.798, AUC test:0.758
