In [1]:
#Importing required libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [2]:
#Loading and displaying the dataset
dataset = pd.read_csv('balance-scale.csv')
dataset

Unnamed: 0,Class,L-Weight,L-Distance,R-Weight,R-Distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5
...,...,...,...,...,...
620,L,5,5,5,1
621,L,5,5,5,2
622,L,5,5,5,3
623,L,5,5,5,4


In [3]:
dataset.describe()

Unnamed: 0,L-Weight,L-Distance,R-Weight,R-Distance
count,625.0,625.0,625.0,625.0
mean,3.0,3.0,3.0,3.0
std,1.415346,1.415346,1.415346,1.415346
min,1.0,1.0,1.0,1.0
25%,2.0,2.0,2.0,2.0
50%,3.0,3.0,3.0,3.0
75%,4.0,4.0,4.0,4.0
max,5.0,5.0,5.0,5.0


### Exploratory Data Analysis

In [4]:
#Displaying information about dataset.
dataset.info

<bound method DataFrame.info of     Class  L-Weight  L-Distance  R-Weight  R-Distance
0       B         1           1         1           1
1       R         1           1         1           2
2       R         1           1         1           3
3       R         1           1         1           4
4       R         1           1         1           5
..    ...       ...         ...       ...         ...
620     L         5           5         5           1
621     L         5           5         5           2
622     L         5           5         5           3
623     L         5           5         5           4
624     B         5           5         5           5

[625 rows x 5 columns]>

In [5]:
#Displaying the number of records having NULL values in the respective columns
dataset.isnull().sum()

Class         0
L-Weight      0
L-Distance    0
R-Weight      0
R-Distance    0
dtype: int64

In [6]:
dataset.shape

(625, 5)

In [7]:
dataset.head()

Unnamed: 0,Class,L-Weight,L-Distance,R-Weight,R-Distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [8]:
dataset.tail()

Unnamed: 0,Class,L-Weight,L-Distance,R-Weight,R-Distance
620,L,5,5,5,1
621,L,5,5,5,2
622,L,5,5,5,3
623,L,5,5,5,4
624,B,5,5,5,5


In [9]:
#Displaying the unique values in the dataset
dataset.nunique()

Class         3
L-Weight      5
L-Distance    5
R-Weight      5
R-Distance    5
dtype: int64

### Normalization of Data

In [10]:
dataset.columns

Index(['Class', 'L-Weight', 'L-Distance', 'R-Weight', 'R-Distance'], dtype='object')

In [11]:
#Taking columns having integer values 
column = ['L-Weight', 'L-Distance', 'R-Weight', 'R-Distance']

In [12]:
scaling = MinMaxScaler()
scaling.fit_transform(dataset[['L-Weight', 'L-Distance', 'R-Weight', 'R-Distance']])

array([[0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.25],
       [0.  , 0.  , 0.  , 0.5 ],
       ...,
       [1.  , 1.  , 1.  , 0.5 ],
       [1.  , 1.  , 1.  , 0.75],
       [1.  , 1.  , 1.  , 1.  ]])

### Categorical Encoding

In [13]:
#Converting categorical data to numerical data
data = pd.get_dummies(dataset['Class'])
data

Unnamed: 0,B,L,R
0,1,0,0
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
620,0,1,0
621,0,1,0
622,0,1,0
623,0,1,0


### Training and Testing of Data

In [14]:
X= dataset[column]
Y = dataset.Class

In [15]:
X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size=0.2)

In [16]:
X_train

Unnamed: 0,L-Weight,L-Distance,R-Weight,R-Distance
93,1,4,4,4
513,5,1,3,4
97,1,4,5,3
427,4,3,1,3
141,2,1,4,2
...,...,...,...,...
550,5,3,1,1
346,3,4,5,2
500,5,1,1,1
433,4,3,2,4


In [17]:
X_test

Unnamed: 0,L-Weight,L-Distance,R-Weight,R-Distance
88,1,4,3,4
563,5,3,3,4
277,3,2,1,3
458,4,4,2,4
151,2,2,1,2
...,...,...,...,...
284,3,2,2,5
85,1,4,3,1
175,2,3,1,1
187,2,3,3,3


In [18]:
Y_train

93     R
513    R
97     R
427    L
141    R
      ..
550    L
346    L
500    L
433    L
152    L
Name: Class, Length: 500, dtype: object

In [19]:
Y_test

88     R
563    L
277    L
458    L
151    L
      ..
284    R
85     L
175    L
187    R
470    L
Name: Class, Length: 125, dtype: object

### Descision Tree Classifier

In [20]:
dtc = DecisionTreeClassifier()

In [21]:
dtc.fit(X_train,Y_train)

DecisionTreeClassifier()

In [22]:
Y_predict = dtc.predict(X_test)

In [23]:
Y_predict

array(['R', 'B', 'L', 'L', 'B', 'R', 'R', 'L', 'R', 'B', 'R', 'L', 'L',
       'L', 'L', 'L', 'L', 'R', 'R', 'R', 'R', 'L', 'R', 'R', 'R', 'L',
       'L', 'L', 'R', 'L', 'R', 'B', 'L', 'R', 'L', 'R', 'R', 'R', 'R',
       'L', 'L', 'R', 'R', 'L', 'R', 'L', 'R', 'L', 'L', 'L', 'R', 'R',
       'R', 'R', 'L', 'R', 'R', 'R', 'R', 'L', 'R', 'L', 'B', 'L', 'R',
       'B', 'R', 'R', 'L', 'R', 'R', 'L', 'L', 'R', 'R', 'L', 'L', 'L',
       'L', 'R', 'L', 'R', 'R', 'L', 'L', 'L', 'L', 'L', 'R', 'L', 'B',
       'R', 'R', 'L', 'L', 'R', 'R', 'L', 'L', 'L', 'L', 'R', 'R', 'B',
       'L', 'R', 'R', 'L', 'B', 'R', 'R', 'L', 'R', 'L', 'R', 'L', 'R',
       'B', 'R', 'L', 'R', 'B', 'L', 'R', 'L'], dtype=object)

### Hyperparameter Tuning

In [24]:
confusion_matrix(Y_test, Y_predict)

array([[ 0,  2,  5],
       [ 8, 51,  4],
       [ 3,  2, 50]], dtype=int64)

In [25]:
accuracy_score(Y_test, Y_predict)

0.808

In [26]:
param_dist = {
    "criterion":["gini", "entropy"],
    "max_depth":[1,2,3,4,5,6,7]
}

In [28]:
grid = GridSearchCV(dtc, param_grid = param_dist, cv = 18, n_jobs = -1)

In [29]:
grid.fit(X_train, Y_train)

GridSearchCV(cv=18, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7]})

In [30]:
grid.best_params_

{'criterion': 'gini', 'max_depth': 5}

In [31]:
grid.best_estimator_

DecisionTreeClassifier(max_depth=5)

In [32]:
grid.best_score_

0.7661669606114052