In [1]:
import pandas as pd

# Load the salary dataset; the path is relative to the notebook's working directory.
salaries_csv = "Datasets/salaries.csv"
df = pd.read_csv(salaries_csv)
df.head()

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0


In [2]:
# List the distinct degree labels before choosing an encoding for them.
df["degree"].unique()

array(['bachelors', 'masters'], dtype=object)

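ENCODING CATEGORICAL FEATURES: `degree` is mapped to an ordinal number (bachelors = 1, masters = 2); `company` and `job` are one-hot encoded further below.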

In [3]:
# Ordinal encoding of the degree: bachelors -> 1, masters -> 2.
degree_rank = {'bachelors': 1, 'masters': 2}
df['degree_number'] = df['degree'].map(degree_rank)
df.head(3)

Unnamed: 0,company,job,degree,salary_more_then_100k,degree_number
0,google,sales executive,bachelors,0,1
1,google,sales executive,masters,0,2
2,google,business manager,bachelors,1,1


In [4]:
# Drop the text column now that `degree_number` carries the same information.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because 'degree' is already gone.
df = df.drop(columns="degree", errors="ignore")
df.head(3)

Unnamed: 0,company,job,salary_more_then_100k,degree_number
0,google,sales executive,0,1
1,google,sales executive,0,2
2,google,business manager,1,1


In [5]:
# One-hot encode the nominal columns; drop_first=True omits the first level of
# each column so the remaining dummies are not redundant with each other.
categorical_cols = ['company', 'job']
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
df_encoded.head()

Unnamed: 0,salary_more_then_100k,degree_number,company_facebook,company_google,job_computer programmer,job_sales executive
0,0,1,False,True,False,True
1,0,2,False,True,False,True
2,1,1,False,True,False,False
3,1,2,False,True,False,False
4,0,1,False,True,True,False


In [6]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Separate the target column from the encoded feature columns.
X = df_encoded.drop('salary_more_then_100k', axis="columns")
y = df_encoded['salary_more_then_100k']

# Fix the seed: scikit-learn permutes the candidate features at every split,
# so an unseeded tree can differ between runs when several splits tie.
model = DecisionTreeClassifier(random_state=42)
model.fit(X, y)

# NOTE: predictions are made on the same rows the tree was fitted on, so the
# report below shows how well the tree memorised the data, not how well it
# generalises to unseen rows.
y_pred = model.predict(X)

report = classification_report(y, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        10

    accuracy                           1.00        16
   macro avg       1.00      1.00      1.00        16
weighted avg       1.00      1.00      1.00        16



In [7]:
# Optional: dump the fitted tree's decision rules as indented plain text.
from sklearn.tree import export_text

tree_rules = export_text(model, feature_names=X.columns.tolist())
print(tree_rules)

|--- company_facebook <= 0.50
|   |--- job_sales executive <= 0.50
|   |   |--- degree_number <= 1.50
|   |   |   |--- company_google <= 0.50
|   |   |   |   |--- class: 0
|   |   |   |--- company_google >  0.50
|   |   |   |   |--- job_computer programmer <= 0.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- job_computer programmer >  0.50
|   |   |   |   |   |--- class: 0
|   |   |--- degree_number >  1.50
|   |   |   |--- class: 1
|   |--- job_sales executive >  0.50
|   |   |--- class: 0
|--- company_facebook >  0.50
|   |--- class: 1



In [8]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Same features and target as before, but split on information gain (entropy)
# instead of the default criterion. The seed is fixed because scikit-learn
# permutes the candidate features at every split, so an unseeded tree can
# differ between runs when several splits tie.
# This rebinds `model`, so the cells below use the entropy tree.
model = DecisionTreeClassifier(criterion="entropy", random_state=42)
model.fit(X, y)

# NOTE: predictions are made on the same rows the tree was fitted on, so the
# report below shows how well the tree memorised the data, not how well it
# generalises to unseen rows.
y_pred = model.predict(X)

report = classification_report(y, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        10

    accuracy                           1.00        16
   macro avg       1.00      1.00      1.00        16
weighted avg       1.00      1.00      1.00        16



In [10]:
# Peek at the first two feature rows to recall the column order.
X.iloc[:2]

Unnamed: 0,degree_number,company_facebook,company_google,job_computer programmer,job_sales executive
0,1,False,True,False,True
1,2,False,True,False,True


In [11]:
# Query: bachelors (degree_number=1), google, sales executive.
# Wrapping the row in a DataFrame with the training columns binds each value to
# its feature by name rather than by position, and avoids scikit-learn's
# "X does not have valid feature names" warning for a model fitted on a DataFrame.
model.predict(pd.DataFrame([[1, 0, 1, 0, 1]], columns=X.columns))



array([0], dtype=int64)

In [12]:
# Query: masters (degree_number=2), google, computer programmer.
# Wrapping the row in a DataFrame with the training columns binds each value to
# its feature by name rather than by position, and avoids scikit-learn's
# "X does not have valid feature names" warning for a model fitted on a DataFrame.
model.predict(pd.DataFrame([[2, 0, 1, 1, 0]], columns=X.columns))



array([1], dtype=int64)