In [None]:
###############################################
# Author: Pedro Igor Salvador Alves
# Project: Adult Income Prediction
# Start Date: 31/12/2021
# Type: Data Science - Machine Learning 
# Database Link: https://archive.ics.uci.edu/ml/datasets/adult
###############################################

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Adult income data from the project-relative CSV file.
dataset = pd.read_csv(filepath_or_buffer="data/dataset.csv")

In [3]:
# Preview the loaded frame; the display is truncated to the leading and
# trailing rows, so this is a sanity check rather than a full listing.
dataset

Unnamed: 0,age,wc,education,marital status,race,gender,hours per week,IncomeClass
0,38,Private,HS-grad,Divorced,White,Male,40,<=50K
1,28,Private,Bachelors,Married,Black,Female,40,<=50K
2,37,Private,Masters,Married,White,Female,40,<=50K
3,31,Private,Masters,Never-married,White,Female,50,>50K
4,42,Private,Bachelors,Married,White,Male,40,>50K
...,...,...,...,...,...,...,...,...
19782,53,Private,Masters,Married,White,Male,40,>50K
19783,22,Private,Some-college,Never-married,White,Male,40,<=50K
19784,40,Private,HS-grad,Married,White,Male,40,>50K
19785,58,Private,HS-grad,Widowed,White,Female,40,<=50K


In [4]:
# Count the missing values in each column.
dataset.isna().sum()

age               0
wc                0
education         0
marital status    0
race              0
gender            0
hours per week    0
IncomeClass       0
dtype: int64

In [6]:
# Inspect column dtypes: the `object` columns are the categorical ones that
# must be encoded before modelling.
dataset.dtypes

age                int64
wc                object
education         object
marital status    object
race              object
gender            object
hours per week     int64
IncomeClass       object
dtype: object

In [7]:
# One-hot encode every categorical (object) column, including the target
# `IncomeClass`. `drop_first=True` drops the first level of each column so the
# remaining indicators are not perfectly collinear.
# `dtype` is pinned because pandas >= 2.0 returns boolean dummies by default;
# uint8 matches the historical default, so the 0/1 indicator columns (and the
# 0/1 class labels in the reports below) are the same on any pandas version.
df = pd.get_dummies(dataset, drop_first=True, dtype=np.uint8)

In [8]:
# Preview the encoded frame: numeric columns are kept as-is and each
# categorical column has been replaced by its indicator columns.
df

Unnamed: 0,age,hours per week,wc_ Local-gov,wc_ Never-worked,wc_ Private,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital status_ Never-married,marital status_ Widowed,marital status_Married,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,gender_ Male,IncomeClass_ >50K
0,38,40,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
1,28,40,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2,37,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0
3,31,50,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
4,42,40,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19782,53,40,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1
19783,22,40,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0
19784,40,40,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,1,1
19785,58,40,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0


In [9]:
# Features are every column except the last one; the last column is the
# encoded target (IncomeClass).
X, Y = df.iloc[:, :-1], df.iloc[:, -1]

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; stratify on the target so both
# partitions keep the same class balance, and fix the seed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=157,
    stratify=Y,
)

In [12]:
from sklearn.tree import DecisionTreeClassifier

In [13]:
# Train a decision tree (fixed seed) on the training split, then predict the
# labels of the held-out rows.
dt = DecisionTreeClassifier(random_state=157).fit(X_train, Y_train)
Y_predict = dt.predict(X_test)

In [14]:
#evaluating the model
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=Y_test, y_pred=Y_predict)

array([[3847,  526],
       [ 794,  770]], dtype=int64)

In [15]:
# Accuracy of the decision tree on the held-out test split.
score = dt.score(X_test, Y_test)
# Bare expression so the notebook displays the value.
score

0.7776654876200101

In [16]:
#random forest
from sklearn.ensemble import RandomForestClassifier

# Train a random forest (fixed seed) on the same training split.
ranfor = RandomForestClassifier(random_state=157).fit(X_train, Y_train)
# NOTE: this overwrites the decision tree's predictions stored in Y_predict.
Y_predict = ranfor.predict(X_test)

In [17]:
# Confusion matrix for the predictions currently held in Y_predict
# (rows: true classes, columns: predicted classes).
confusion_matrix(y_true=Y_test, y_pred=Y_predict)

array([[3857,  516],
       [ 700,  864]], dtype=int64)

In [18]:
# Accuracy of the random forest on the held-out test split.
# NOTE: reuses the name `score`, replacing any value assigned to it earlier.
score = ranfor.score(X_test, Y_test)
# Bare expression so the notebook displays the value.
score

0.7951827522317669

In [20]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for the predictions currently held in
# Y_predict; printed so the text table keeps its line breaks.
print(classification_report(y_true=Y_test, y_pred=Y_predict))

              precision    recall  f1-score   support

           0       0.85      0.88      0.86      4373
           1       0.63      0.55      0.59      1564

    accuracy                           0.80      5937
   macro avg       0.74      0.72      0.73      5937
weighted avg       0.79      0.80      0.79      5937



In [None]:
#TODO: the precision for the >50K class (label 1) is only 0.63 and needs to be improved