In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

### **Training Data**

In [2]:
# Load the training CSV; the path is relative to the notebook's working directory.
train_data = pd.read_csv("Census_income_train.csv")

In [3]:
# Preview the first five rows of the raw training frame.
train_data.head(n=5)

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Number of rows in the raw training frame.
train_data.shape[0]

32560

### **Data Preprocessing**

In [5]:
# Count true NaN values per column ('?' placeholders are strings and are not counted here).
train_data.isna().sum()

Age               0
Workclass         0
fnlwgt            0
Education         0
Education-num     0
Marital status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital-gain      0
Capital-loss      0
Hours-per-week    0
Native-country    0
Income            0
dtype: int64

In [6]:
# Three columns mark missing values with a literal '?':
# - Workclass, Occupation, Native-country
# Drop every row that carries the placeholder in any of them. '?' is matched
# as a plain substring (regex=False), which avoids the invalid "\?" string
# escape that newer Python versions warn about. na=True keeps the previous
# behaviour of also discarding rows whose value is missing altogether.
clean_train_data = train_data
for column in ("Workclass", "Occupation", "Native-country"):
    is_placeholder = clean_train_data[column].str.contains("?", regex=False, na=True)
    clean_train_data = clean_train_data[~is_placeholder]

In [7]:
# Rows remaining after the '?' placeholder rows were removed.
clean_train_data.shape[0]

30161

In [8]:
# Filtering leaves gaps in the row labels; renumber from 0 and discard the old labels.
clean_train_data = clean_train_data.reset_index(drop=True)

In [9]:
# Sanity check: resetting the index must not change the row count.
clean_train_data.shape[0]

30161

#### **Preparing Categorical Data for scikit-learn**

In [10]:
# scikit-learn's decision trees and random forests cannot consume string
# categories directly, so every categorical column is one-hot encoded.
# The Income label is a string column too, so it is encoded here as well.
train_dummies = pd.get_dummies(data=clean_train_data, drop_first=False)
train_dummies.head(n=5)

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,...,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia,Income_ <=50K,Income_ >50K
0,39,77516,13,2174,0,40,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
1,50,83311,13,0,0,13,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
2,38,215646,9,0,0,40,False,False,True,False,...,False,False,False,False,False,True,False,False,True,False
3,53,234721,7,0,0,40,False,False,True,False,...,False,False,False,False,False,True,False,False,True,False
4,28,338409,13,0,0,40,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False


In [11]:
# 'Income_ <=50K' is redundant: it is the exact complement of 'Income_ >50K',
# which is kept as the single binary label column.
train_dummies = train_dummies.drop(columns=['Income_ <=50K'])

In [12]:
# Confirm that only the 'Income_ >50K' label column remains at the end.
train_dummies.head(n=5)

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,...,Native-country_ Puerto-Rico,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia,Income_ >50K
0,39,77516,13,2174,0,40,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,50,83311,13,0,0,13,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,38,215646,9,0,0,40,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
3,53,234721,7,0,0,40,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
4,28,338409,13,0,0,40,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


### ***Feature Selection***

In [13]:

# Features are every column except the last; the last column is the
# 'Income_ >50K' indicator and serves as the target.
train_input, train_target = train_dummies.iloc[:, :-1], train_dummies.iloc[:, -1]

In [14]:
# Preview the feature matrix (label column excluded).
train_input.head(n=5)

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,...,Native-country_ Portugal,Native-country_ Puerto-Rico,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia
0,39,77516,13,2174,0,40,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,50,83311,13,0,0,13,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,38,215646,9,0,0,40,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
3,53,234721,7,0,0,40,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
4,28,338409,13,0,0,40,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Preview the boolean target series.
train_target.head(n=5)

0    False
1    False
2    False
3    False
4    False
Name: Income_ >50K, dtype: bool

### ***Test Dataset***

In [16]:
# Load the test CSV (path relative to the notebook's working directory) and
# preview it. Note that the Income labels in this file end with a period.
test_data = pd.read_csv(filepath_or_buffer="Census_income_test.csv")
test_data.head(n=5)

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [17]:
# Number of rows in the raw test frame.
test_data.shape[0]

16281

In [18]:
# Apply the same cleaning as for the training data: drop every row that has
# the '?' placeholder in Workclass, Occupation or Native-country.
# '?' is matched as a plain substring (regex=False), which avoids the invalid
# "\?" string escape that newer Python versions warn about. na=True keeps the
# previous behaviour of also discarding rows whose value is missing altogether.
clean_test_data = test_data
for column in ("Workclass", "Occupation", "Native-country"):
    is_placeholder = clean_test_data[column].str.contains("?", regex=False, na=True)
    clean_test_data = clean_test_data[~is_placeholder]
# Renumber the rows from 0 after filtering and discard the old labels.
clean_test_data = clean_test_data.reset_index(drop=True)

In [19]:
# Rows remaining in the test frame after the '?' placeholder rows were removed.
clean_test_data.shape[0]

15060

In [20]:
# One-hot encode the cleaned test frame, then drop the redundant
# 'Income_ <=50K.' column so that 'Income_ >50K.' is the single label column.
# (The test labels carry a trailing period, unlike the training labels.)
test_dummies = (
    pd.get_dummies(data=clean_test_data, drop_first=False)
    .drop(columns=['Income_ <=50K.'])
)
test_dummies.head(n=5)

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,...,Native-country_ Puerto-Rico,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia,Income_ >50K.
0,25,226802,7,0,0,40,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
1,38,89814,9,0,0,50,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
2,28,336951,12,0,0,40,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True
3,44,160323,10,7688,0,40,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
4,34,198693,6,0,0,30,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False


In [21]:
# The last column is the 'Income_ >50K.' indicator and serves as the target.
test_target = test_dummies.iloc[:, -1]
# The test dummies were built independently of the training dummies, so a
# category that occurs in only one of the two files would produce different
# feature columns. Re-index onto the training feature columns: columns missing
# from the test set are added as all-False, columns unknown to the model are
# dropped, and the column order matches what the model was fitted on.
test_input = test_dummies.iloc[:, :-1].reindex(
    columns=train_input.columns, fill_value=False
)

### ***Implementing Decision Tree***

In [22]:
# Decision tree with default hyperparameters. The seed is fixed so that
# re-running the notebook reproduces the same tree and the same scores.
clf = tree.DecisionTreeClassifier(random_state=42)

In [23]:
# Fit the decision tree on the encoded training features and labels.
clf.fit(X=train_input, y=train_target)

### ***Testing the Model***

In [25]:
# Predict the income class for every row of the test features.
test_pred = clf.predict(X=test_input)

In [26]:
# Per-class precision / recall / F1 of the decision tree on the test set.
print(classification_report(y_true=test_target, y_pred=test_pred))

              precision    recall  f1-score   support

       False       0.88      0.87      0.87     11360
        True       0.60      0.62      0.61      3700

    accuracy                           0.81     15060
   macro avg       0.74      0.74      0.74     15060
weighted avg       0.81      0.81      0.81     15060



### ***Predicting***

In [27]:
# Same prediction as in the testing section above, repeated here so the raw
# predicted labels can be displayed.
test_pred = clf.predict(X=test_input)
test_pred

array([False, False,  True, ...,  True, False, False])

In [28]:
# Classification report for the predictions displayed above.
print(classification_report(y_true=test_target, y_pred=test_pred))

              precision    recall  f1-score   support

       False       0.88      0.87      0.87     11360
        True       0.60      0.62      0.61      3700

    accuracy                           0.81     15060
   macro avg       0.74      0.74      0.74     15060
weighted avg       0.81      0.81      0.81     15060



The model performs reasonably overall (81% accuracy), but it is noticeably weaker on the `>50K` class (F1 of 0.61), so it can be improved further.

# **Random Forest**

In [29]:
# Replace the decision tree with a random forest of 150 trees.
# RandomForestClassifier is imported in the notebook's import cell at the top.
# The seed is fixed so that re-running the notebook reproduces the same
# forest and the same scores.
clf = RandomForestClassifier(n_estimators=150, random_state=42)

In [30]:
# Fit the random forest on the encoded training features and labels.
clf.fit(X=train_input, y=train_target)

### ***Testing***

In [31]:
# Predict the income class for every test row with the forest and show the result.
test_pred = clf.predict(X=test_input)
test_pred

array([False, False,  True, ...,  True, False,  True])

In [33]:
# Per-class precision / recall / F1 of the random forest on the test set.
print(classification_report(y_true=test_target, y_pred=test_pred))

              precision    recall  f1-score   support

       False       0.88      0.92      0.90     11360
        True       0.71      0.62      0.66      3700

    accuracy                           0.85     15060
   macro avg       0.80      0.77      0.78     15060
weighted avg       0.84      0.85      0.84     15060



### ***Improving The Model via Pruning & Estimators***

In [35]:
# Same 150-tree forest, now with cost-complexity pruning (ccp_alpha) applied
# to each tree. The seed is fixed so that the comparison with the unpruned
# forest reflects the pruning rather than run-to-run randomness.
clf = RandomForestClassifier(n_estimators=150, ccp_alpha=0.0001, random_state=42)
clf.fit(train_input, train_target)
test_pred = clf.predict(test_input)
print(classification_report(test_target, test_pred))

              precision    recall  f1-score   support

       False       0.88      0.94      0.91     11360
        True       0.77      0.62      0.68      3700

    accuracy                           0.86     15060
   macro avg       0.83      0.78      0.80     15060
weighted avg       0.85      0.86      0.85     15060



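- Pruning gives a slight increase in accuracy (0.85 → 0.86), but the change is too small to be considered significant.
- With these features and models, performance appears to plateau at around 85–86% accuracy; further gains would likely require hyperparameter tuning or different features.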