In [53]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
df = pd.read_csv('Data/income.csv')

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
df.apply(lambda x: sum(x.isnull()))

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [27]:
rep = {'<=50K':0, '>50K':1}
df.income = df.income.replace(rep)

In [28]:
df.apply(lambda x: sum(x.isnull()))

age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [29]:
df.replace(['Divorced', 'Married-AF-spouse', 
              'Married-civ-spouse', 'Married-spouse-absent', 
              'Never-married','Separated','Widowed'],
             ['divorced','married','married','married',
              'not married','not married','not married'], inplace = True)

In [30]:
category_col =['workclass', 'race', 'education','marital-status', 'occupation',
               'relationship', 'gender', 'native-country', 'income'] 

In [31]:
from sklearn.preprocessing import LabelEncoder

In [32]:
labelencoder = LabelEncoder()

In [40]:
df

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,11th,not married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,HS-grad,married,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,Assoc-acdm,married,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,Some-college,married,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,Private,Some-college,not married,,Own-child,White,Female,0,0,30,United-States,0
5,34,Private,10th,not married,Other-service,Not-in-family,White,Male,0,0,30,United-States,0
6,29,Private,HS-grad,not married,,Unmarried,Black,Male,0,0,40,United-States,0
7,63,Self-emp-not-inc,Prof-school,married,Prof-specialty,Husband,White,Male,3103,0,32,United-States,1
8,24,Private,Some-college,not married,Other-service,Unmarried,White,Female,0,0,40,United-States,0
9,55,Private,7th-8th,married,Craft-repair,Husband,White,Male,0,0,10,United-States,0


In [36]:
df.workclass.value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64

In [37]:
df.workclass = df.workclass.fillna('Private')

In [42]:
df.occupation.value_counts()

Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: occupation, dtype: int64

In [43]:
df.occupation = df.occupation.fillna('Farming-fishing')

In [48]:
df['native-country'].value_counts()
df['native-country'] = df['native-country'].fillna('United-States')

In [49]:
df.apply(lambda x: sum(x.isnull()))

age               0
workclass         0
education         0
marital-status    0
occupation        0
relationship      0
race              0
gender            0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [50]:
for c in category_col:
    df[c] = labelencoder.fit_transform(df[col])

In [52]:
df

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,3,3,3,3,3,3,3,0,0,40,3,3
1,38,3,3,3,3,3,3,3,0,0,50,3,3
2,28,1,1,1,1,1,1,1,0,0,40,1,1
3,44,3,3,3,3,3,3,3,7688,0,40,3,3
4,18,3,3,3,3,3,3,3,0,0,30,3,3
5,34,3,3,3,3,3,3,3,0,0,30,3,3
6,29,3,3,3,3,3,3,3,0,0,40,3,3
7,63,5,5,5,5,5,5,5,3103,0,32,5,5
8,24,3,3,3,3,3,3,3,0,0,40,3,3
9,55,3,3,3,3,3,3,3,0,0,10,3,3


In [54]:
X = df.values[:, 0:12]
Y = df.values[:,12]

X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)


In [57]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [58]:
dt_clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100,
                               max_depth=5, min_samples_leaf=5)
dt_clf_gini.fit(X_train, y_train)
### Desicion Tree with Information Gain ###

dt_clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100,
 max_depth=5, min_samples_leaf=5)

dt_clf_entropy.fit(X_train, y_train)

y_pred_gini = dt_clf_gini.predict(X_test)
y_pred_en = dt_clf_entropy.predict(X_test)

print ("Desicion Tree using Gini Index\nAccuracy is ", accuracy_score(y_test,y_pred_gini)*100 )
print ("Desicion Tree using Information Gain\nAccuracy is ", accuracy_score(y_test,y_pred_en)*100 )


Desicion Tree using Gini Index
Accuracy is  100.0
Desicion Tree using Information Gain
Accuracy is  100.0


In [59]:
import pickle

In [60]:
pickle.dump(dt_clf_gini, open("Data/model.pkl", "wb"))