In [2]:
import pandas as pd
df = pd.read_csv("salary.csv")
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [3]:
df.shape

(32561, 15)

### features engineering: label and one hot encoding

In [4]:
df.education.unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

In [6]:
df['education'] = df['education'].map({' Bachelors':1, ' HS-grad':2, ' 11th':3, ' Masters':4, ' 9th':5,
       ' Some-college':6, ' Assoc-acdm':7, ' Assoc-voc':8, ' 7th-8th':9,
       ' Doctorate':10, ' Prof-school':11, ' 5th-6th':12, ' 10th':13, ' 1st-4th':14,
       ' Preschool':15, ' 12th':16})
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,1,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,1,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,2,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [7]:
df.drop(['marital-status','relationship','race','native-country'], axis="columns", inplace=True)
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,occupation,sex,capital-gain,capital-loss,hours-per-week,salary
0,39,State-gov,77516,1,13,Adm-clerical,Male,2174,0,40,<=50K
1,50,Self-emp-not-inc,83311,1,13,Exec-managerial,Male,0,0,13,<=50K
2,38,Private,215646,2,9,Handlers-cleaners,Male,0,0,40,<=50K


In [11]:
df_encoded = pd.get_dummies(df, columns=["workclass","occupation"])
df_encoded.head(4)

Unnamed: 0,age,fnlwgt,education,education-num,sex,capital-gain,capital-loss,hours-per-week,salary,workclass_ ?,...,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving
0,39,77516,1,13,Male,2174,0,40,<=50K,False,...,False,False,False,False,False,False,False,False,False,False
1,50,83311,1,13,Male,0,0,13,<=50K,False,...,False,False,False,False,False,False,False,False,False,False
2,38,215646,2,9,Male,0,0,40,<=50K,False,...,False,True,False,False,False,False,False,False,False,False
3,53,234721,3,7,Male,0,0,40,<=50K,False,...,False,True,False,False,False,False,False,False,False,False


### model training

In [18]:
from sklearn.tree import DecisionTreeClassifier

X = df_encoded.drop(['salary','sex'], axis="columns")
y = df_encoded['salary']

model = DecisionTreeClassifier()
model.fit(X, y)

In [20]:
from sklearn.metrics import classification_report

y_pred = model.predict(X)
report = classification_report(y , y_pred)
print(report)

              precision    recall  f1-score   support

       <=50K       1.00      1.00      1.00     24720
        >50K       1.00      1.00      1.00      7841

    accuracy                           1.00     32561
   macro avg       1.00      1.00      1.00     32561
weighted avg       1.00      1.00      1.00     32561



In [21]:
from sklearn.tree import export_text

print(export_text(model, feature_names=list(X.columns)))

|--- capital-gain <= 5119.00
|   |--- education-num <= 12.50
|   |   |--- age <= 33.50
|   |   |   |--- age <= 26.50
|   |   |   |   |--- age <= 23.50
|   |   |   |   |   |--- capital-loss <= 2468.00
|   |   |   |   |   |   |--- hours-per-week <= 45.50
|   |   |   |   |   |   |   |--- workclass_ Local-gov <= 0.50
|   |   |   |   |   |   |   |   |--- fnlwgt <= 546197.00
|   |   |   |   |   |   |   |   |   |--- age <= 21.50
|   |   |   |   |   |   |   |   |   |   |--- class:  <=50K
|   |   |   |   |   |   |   |   |   |--- age >  21.50
|   |   |   |   |   |   |   |   |   |   |--- occupation_ Other-service <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 5
|   |   |   |   |   |   |   |   |   |   |--- occupation_ Other-service >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 7
|   |   |   |   |   |   |   |   |--- fnlwgt >  546197.00
|   |   |   |   |   |   |   |   |   |--- fnlwgt <= 548826.00
|   |   |   |   |   |   |   |  

### using entropy

In [22]:
from sklearn.tree import DecisionTreeClassifier
X = df_encoded.drop(['salary','sex'], axis="columns")
y = df_encoded['salary']

model = DecisionTreeClassifier(criterion = "entropy")
model.fit(X, y)

In [23]:
from sklearn.metrics import classification_report

y_pred = model.predict(X)
report = classification_report(y , y_pred)
print(report)

              precision    recall  f1-score   support

       <=50K       1.00      1.00      1.00     24720
        >50K       1.00      1.00      1.00      7841

    accuracy                           1.00     32561
   macro avg       1.00      1.00      1.00     32561
weighted avg       1.00      1.00      1.00     32561



In [24]:
from sklearn.tree import export_text

print(export_text(model, feature_names=list(X.columns)))

|--- capital-gain <= 7073.50
|   |--- age <= 27.50
|   |   |--- age <= 23.50
|   |   |   |--- capital-loss <= 2218.00
|   |   |   |   |--- hours-per-week <= 45.50
|   |   |   |   |   |--- age <= 20.50
|   |   |   |   |   |   |--- class:  <=50K
|   |   |   |   |   |--- age >  20.50
|   |   |   |   |   |   |--- workclass_ Local-gov <= 0.50
|   |   |   |   |   |   |   |--- fnlwgt <= 545665.50
|   |   |   |   |   |   |   |   |--- hours-per-week <= 31.00
|   |   |   |   |   |   |   |   |   |--- class:  <=50K
|   |   |   |   |   |   |   |   |--- hours-per-week >  31.00
|   |   |   |   |   |   |   |   |   |--- hours-per-week <= 32.50
|   |   |   |   |   |   |   |   |   |   |--- fnlwgt <= 131921.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   |   |   |   |   |   |   |--- fnlwgt >  131921.50
|   |   |   |   |   |   |   |   |   |   |   |--- class:  <=50K
|   |   |   |   |   |   |   |   |   |--- hours-per-week >  32.50
|   |   |   |   |   |   |   | 

In [25]:
X.head(3)

Unnamed: 0,age,fnlwgt,education,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,...,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving
0,39,77516,1,13,2174,0,40,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,50,83311,1,13,0,0,13,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,38,215646,2,9,0,0,40,False,False,False,...,False,True,False,False,False,False,False,False,False,False


In [None]:
model.predict([
    