In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('clean_data.csv')

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 15 columns):
age               30161 non-null int64
workclass         30161 non-null object
fnlwgt            30161 non-null int64
education         30161 non-null object
education-num     30161 non-null int64
marital-status    30161 non-null object
occupation        30161 non-null object
relationship      30161 non-null object
race              30161 non-null object
sex               30161 non-null object
capital-gain      30161 non-null int64
capital-loss      30161 non-null int64
hours-per-week    30161 non-null int64
native-country    30161 non-null object
target            30161 non-null int64
dtypes: int64(7), object(8)
memory usage: 3.5+ MB


In [5]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,target
count,30161.0,30161.0,30161.0,30161.0,30161.0,30161.0,30161.0
mean,38.437883,189797.6,10.121216,1091.971984,88.375419,40.931269,0.248931
std,13.134882,105652.7,2.549983,7406.466659,404.304753,11.980182,0.432401
min,17.0,13769.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,117628.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,178429.0,10.0,0.0,0.0,40.0,0.0
75%,47.0,237630.0,13.0,0.0,0.0,45.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0


In [6]:
work = pd.get_dummies(df['workclass'],drop_first=True)
#education = pd.get_dummies(df['education'],drop_first=True)
marital = pd.get_dummies(df['marital-status'],drop_first=True)
occupation = pd.get_dummies(df['occupation'],drop_first=True) 
relation = pd.get_dummies(df['relationship'],drop_first=True)
race = pd.get_dummies(df['race'],drop_first=True)
sex = pd.get_dummies(df['sex'],drop_first=True)

In [7]:
new_data = pd.concat([df, work, marital, occupation, relation, race, sex], axis = 1)

In [8]:
cols = df.columns
num_cols = df._get_numeric_data().columns
cat_cols = list(set(cols) - set(num_cols))

DT_data = new_data.drop(cat_cols, axis=1)

In [9]:
DT_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 57 columns):
age                      30161 non-null int64
fnlwgt                   30161 non-null int64
education-num            30161 non-null int64
capital-gain             30161 non-null int64
capital-loss             30161 non-null int64
hours-per-week           30161 non-null int64
target                   30161 non-null int64
Local-gov                30161 non-null uint8
Private                  30161 non-null uint8
Self-emp-inc             30161 non-null uint8
Self-emp-not-inc         30161 non-null uint8
State-gov                30161 non-null uint8
Without-pay              30161 non-null uint8
11th                     30161 non-null uint8
12th                     30161 non-null uint8
1st-4th                  30161 non-null uint8
5th-6th                  30161 non-null uint8
7th-8th                  30161 non-null uint8
9th                      30161 non-null uint8
Assoc-acdm       

In [9]:
X = DT_data.drop('target', axis=1)
y = DT_data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
from sklearn.tree import DecisionTreeClassifier

In [11]:
DT_classifier = DecisionTreeClassifier()

In [12]:
DT_classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [13]:
pred = DT_classifier.predict(X_test)

In [14]:
print(confusion_matrix(y_test, pred))

[[5907  914]
 [ 855 1373]]


In [15]:
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

          0       0.87      0.87      0.87      6821
          1       0.60      0.62      0.61      2228

avg / total       0.81      0.80      0.81      9049



The result is slightly worse than KNN (83%)

<br/>

## Random forest model

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rf = RandomForestClassifier(n_estimators=10)        # a smaller number may work better but require more time

In [18]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
pred = rf.predict(X_test)

In [20]:
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[6336  485]
 [ 900 1328]]
             precision    recall  f1-score   support

          0       0.88      0.93      0.90      6821
          1       0.73      0.60      0.66      2228

avg / total       0.84      0.85      0.84      9049



Here we can see the random forest works better than decision tree.<br/>
<br/>

# predict on the test set

In [21]:
test = pd.read_csv('clean_test_data.csv')
test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,0


In [22]:
t_work = pd.get_dummies(test['workclass'],drop_first=True)
#t_education = pd.get_dummies(test['education'],drop_first=True)
t_marital = pd.get_dummies(test['marital-status'],drop_first=True)
t_occupation = pd.get_dummies(test['occupation'],drop_first=True) 
t_relation = pd.get_dummies(test['relationship'],drop_first=True)
t_race = pd.get_dummies(test['race'],drop_first=True)
t_sex = pd.get_dummies(test['sex'],drop_first=True)

new_data = pd.concat([test, t_work, t_marital, t_occupation, t_relation, t_race, t_sex], axis = 1)

test_data = new_data.drop(cat_cols, axis=1)

In [23]:
test_data.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,target,Local-gov,Private,Self-emp-inc,...,Not-in-family,Other-relative,Own-child,Unmarried,Wife,Asian-Pac-Islander,Black,Other,White,Male
0,25,226802,7,0,0,40,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1
1,38,89814,9,0,0,50,0,0,1,0,...,0,0,0,0,0,0,0,0,1,1
2,28,336951,12,0,0,40,1,1,0,0,...,0,0,0,0,0,0,0,0,1,1
3,44,160323,10,7688,0,40,1,0,1,0,...,0,0,0,0,0,0,1,0,0,1
4,34,198693,6,0,0,30,0,0,1,0,...,1,0,0,0,0,0,0,0,1,1


<br/>
## Predict using decision tree model

In [24]:
test_features = test_data.drop('target', axis=1)
target = test_data['target']

In [25]:
dt_pred = DT_classifier.predict(test_features)

In [26]:
print(confusion_matrix(target, dt_pred))
print(classification_report(target, dt_pred))

[[9762 1598]
 [1439 2261]]
             precision    recall  f1-score   support

          0       0.87      0.86      0.87     11360
          1       0.59      0.61      0.60      3700

avg / total       0.80      0.80      0.80     15060



<br/>
## Predict using random forests model

In [27]:
rf_pred = rf.predict(test_features)

In [28]:
print(confusion_matrix(target, rf_pred))
print(classification_report(target, rf_pred))

[[10561   799]
 [ 1558  2142]]
             precision    recall  f1-score   support

          0       0.87      0.93      0.90     11360
          1       0.73      0.58      0.65      3700

avg / total       0.84      0.84      0.84     15060



The random forests works better.