In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier


## For ordinal encoding we can use
## from sklearn.preprocessing import OrdinalEncoder

### To visualize the tree, we need to record the Gini value at each node (i.e. after each split), save the tree in .dot format, and then render it with separate software

## Getting categorical variables using label encoder and further model building

In [24]:
# Load the salary dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="salaries.csv")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Company,Job,Degree,Salary_more_than_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0
5,google,computer programmer,masters,1
6,abd pharma,sales executive,masters,0
7,abd pharma,computer programmer,bachelors,0
8,abd pharma,business manager,bachelors,0
9,abd pharma,business manager,masters,1


In [25]:
# Feature frame: every column of df except the target label.
inputs = df.drop(columns=['Salary_more_than_100k'])
inputs

Unnamed: 0,Company,Job,Degree
0,google,sales executive,bachelors
1,google,sales executive,masters
2,google,business manager,bachelors
3,google,business manager,masters
4,google,computer programmer,bachelors
5,google,computer programmer,masters
6,abd pharma,sales executive,masters
7,abd pharma,computer programmer,bachelors
8,abd pharma,business manager,bachelors
9,abd pharma,business manager,masters


In [26]:
# Target label, selected as a single column (Series) of df.
target = df.loc[:, 'Salary_more_than_100k']
target

0     0
1     0
2     1
3     1
4     0
5     1
6     0
7     0
8     0
9     1
10    1
11    1
12    1
13    1
14    1
15    1
Name: Salary_more_than_100k, dtype: int64

In [27]:
# Three independent LabelEncoder instances, named after the Company, Job and
# Degree columns.
le_company, le_job, le_degree = (LabelEncoder() for _ in range(3))

In [28]:
# Encode each categorical column with its OWN encoder. Re-fitting a single
# encoder on every column would leave it holding only the classes of the last
# column, so inverse_transform() and the encoding of new rows at prediction
# time would be wrong for the other columns. The integer codes produced here
# are the same either way; what changes is which fitted encoder is kept.
inputs['Company_n'] = le_company.fit_transform(inputs['Company'])
inputs['Job_n'] = le_job.fit_transform(inputs['Job'])
inputs['Degree_n'] = le_degree.fit_transform(inputs['Degree'])
inputs

Unnamed: 0,Company,Job,Degree,Company_n,Job_n,Degree_n
0,google,sales executive,bachelors,2,2,0
1,google,sales executive,masters,2,2,1
2,google,business manager,bachelors,2,0,0
3,google,business manager,masters,2,0,1
4,google,computer programmer,bachelors,2,1,0
5,google,computer programmer,masters,2,1,1
6,abd pharma,sales executive,masters,0,2,1
7,abd pharma,computer programmer,bachelors,0,1,0
8,abd pharma,business manager,bachelors,0,0,0
9,abd pharma,business manager,masters,0,0,1


In [29]:
# Drop the original string columns, leaving only the label-encoded ones.
inputs_n = inputs.drop(columns=['Company', 'Job', 'Degree'])
inputs_n

Unnamed: 0,Company_n,Job_n,Degree_n
0,2,2,0
1,2,2,1
2,2,0,0
3,2,0,1
4,2,1,0
5,2,1,1
6,0,2,1
7,0,1,0
8,0,0,0
9,0,0,1


### Please read the documentation of decision tree classifier

sklearn.tree.DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)

criterion = {“gini”, “entropy”}, default=”gini”

splitter = {“best”, “random”}, default=”best”

max_depth = int, default=None

min_samples_split = int or float, default=2

min_samples_leaf = int or float, default=1

max_leaf_nodes = int, default=None

min_impurity_decrease = float, default=0.0

In [30]:
# Fix random_state so the fitted tree is identical on every run: scikit-learn
# permutes the features at each split, so ties between equally good splits
# are otherwise broken at random.
model_without_splitting = DecisionTreeClassifier(random_state=42)

In [31]:
# Fit on the full dataset (no hold-out set); the fitted estimator is displayed.
model_without_splitting.fit(X=inputs_n, y=target)

DecisionTreeClassifier()

In [32]:
# Mean accuracy on the very rows the tree was fitted on. An unconstrained tree
# can memorise its training data, so this is typically 1 and says nothing
# about how well the model generalises.
model_without_splitting.score(X=inputs_n, y=target)

1.0

In [33]:
 # Hold out 20% of the rows for testing. random_state pins the shuffle so the
 # partition, and every score computed from it, is the same on each run.
 X_train, X_test, y_train, y_test = train_test_split(inputs_n, target, test_size = 0.2, random_state = 42)

In [34]:
# Fix random_state so the fitted tree is identical on every run: scikit-learn
# permutes the features at each split, so ties between equally good splits
# (and therefore the test predictions) can otherwise change between runs.
model_with_splitting = DecisionTreeClassifier(random_state=42)

In [35]:
# Fit on the training partition only; the fitted estimator is displayed.
model_with_splitting.fit(X=X_train, y=y_train)

DecisionTreeClassifier()

In [36]:
# Predicted class labels for the held-out rows.
predictions = model_with_splitting.predict(X=X_test)

In [37]:
# Mean accuracy on the held-out rows. The value depends heavily on which rows
# land in the test partition: with test_size = 0.2 on such a small dataset the
# test set is only a handful of rows, so one misclassification moves it a lot.
model_with_splitting.score(X=X_test, y=y_test)

0.5

In [38]:
# Confusion matrix on the test partition: rows are the true classes, columns
# are the predicted classes.
cm_with_split = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_with_split

array([[0, 1],
       [1, 2]], dtype=int64)

In [45]:
# Mean accuracy on the training partition. This is expected to be 1 because
# the tree is scored on the same rows it was fitted on.
model_with_splitting.score(X=X_train, y=y_train)

1.0

In [39]:
# Accuracy on the test partition. Because these rows were not seen during
# fitting, the value is usually below 1 (though it can still reach 1).
ac_with_split = accuracy_score(y_true=y_test, y_pred=predictions)
ac_with_split

0.5

## Getting categorical variables using pd.get_dummies and further model building

In [40]:
# Re-read the dataset for the pd.get_dummies section.
df = pd.read_csv(filepath_or_buffer="salaries.csv")
df

Unnamed: 0,Company,Job,Degree,Salary_more_than_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0
5,google,computer programmer,masters,1
6,abd pharma,sales executive,masters,0
7,abd pharma,computer programmer,bachelors,0
8,abd pharma,business manager,bachelors,0
9,abd pharma,business manager,masters,1


In [41]:
# One-hot encode the string columns of df. drop_first=True omits the first
# level of each encoded column, since it is implied when all the other
# indicator columns for that feature are 0.
df1 = pd.get_dummies(data=df, drop_first=True)
df1

Unnamed: 0,Salary_more_than_100k,Company_facebook,Company_google,Job_computer programmer,Job_sales executive,Degree_masters
0,0,0,1,0,1,0
1,0,0,1,0,1,1
2,1,0,1,0,0,0
3,1,0,1,0,0,1
4,0,0,1,1,0,0
5,1,0,1,1,0,1
6,0,0,0,0,1,1
7,0,0,0,1,0,0
8,0,0,0,0,0,0
9,1,0,0,0,0,1


In [42]:
# Feature matrix: all one-hot columns, with the target label removed.
X = df1.drop(columns='Salary_more_than_100k')
X

Unnamed: 0,Company_facebook,Company_google,Job_computer programmer,Job_sales executive,Degree_masters
0,0,1,0,1,0
1,0,1,0,1,1
2,0,1,0,0,0
3,0,1,0,0,1
4,0,1,1,0,0
5,0,1,1,0,1
6,0,0,0,1,1
7,0,0,1,0,0
8,0,0,0,0,0
9,0,0,0,0,1


In [43]:
# Target label taken from the encoded frame.
Y = df1.loc[:, 'Salary_more_than_100k']
Y

0     0
1     0
2     1
3     1
4     0
5     1
6     0
7     0
8     0
9     1
10    1
11    1
12    1
13    1
14    1
15    1
Name: Salary_more_than_100k, dtype: int64

### Here too, the model can be built both with and without splitting into training and testing sets, in the same manner as shown above.