In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings 
warnings.filterwarnings('ignore')

In [56]:
df = pd.read_csv("adult_dataset.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [57]:
df = df[df["workclass"] != "?"]
df = df[df["occupation"] != "?"]
df = df[df["native.country"] != "?"]

In [58]:
from sklearn import preprocessing

df_categorical = df.select_dtypes(include=["object"])
df_categorical.head()


Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
1,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,United-States,<=50K
3,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,United-States,<=50K
4,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,United-States,<=50K
5,Private,HS-grad,Divorced,Other-service,Unmarried,White,Female,United-States,<=50K
6,Private,10th,Separated,Adm-clerical,Unmarried,White,Male,United-States,<=50K


In [59]:
# // transform categorical data to numerical data
le = preprocessing.LabelEncoder()
df_categorical = df_categorical.apply(le.fit_transform)
df_categorical.head()

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
1,2,11,6,3,1,4,0,38,0
3,2,5,0,6,4,4,0,38,0
4,2,15,5,9,3,4,0,38,0
5,2,11,0,7,4,4,0,38,0
6,2,0,5,0,4,4,1,38,0


In [60]:
df = df.drop(df_categorical.columns,axis=1)

# // concatenate the transformed categorical values to data frame
df = pd.concat([df, df_categorical], axis=1)

# // to ensure all columns are int
df.head()


Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
1,82,132870,9,0,4356,18,2,11,6,3,1,4,0,38,0
3,54,140359,4,0,3900,40,2,5,0,6,4,4,0,38,0
4,41,264663,10,0,3900,40,2,15,5,9,3,4,0,38,0
5,34,216864,9,0,3770,45,2,11,0,7,4,4,0,38,0
6,38,150601,6,0,3770,40,2,0,5,0,4,4,1,38,0


In [61]:
# // split the data
from sklearn.model_selection import train_test_split

x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)


# // default tree model
from sklearn.tree import DecisionTreeClassifier

df_default = DecisionTreeClassifier(max_depth=5)
df_default.fit(x_train, y_train)

# // checking accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred_default = df_default.predict(x_test)
print(classification_report(y_test, y_pred_default))
print(confusion_matrix(y_test, y_pred_default))
print(accuracy_score(y_test, y_pred_default))
df_default.score(x_test, y_test)


# // tuning with gini
df_gini = DecisionTreeClassifier(
    criterion="gini", max_depth=10, min_samples_leaf=50, min_samples_split=50
)
df_gini.fit(x_train, y_train)
df_gini.score(x_test, y_test)

              precision    recall  f1-score   support

           0       0.85      0.95      0.90      6776
           1       0.79      0.51      0.62      2273

    accuracy                           0.84      9049
   macro avg       0.82      0.73      0.76      9049
weighted avg       0.84      0.84      0.83      9049

[[6470  306]
 [1104 1169]]
0.8441816775334291


0.8484915460271854