In [1]:
#Importing required libraries
import numpy as np
import pandas as pd

In [2]:
#Loading and displaying the dataset
dataset = pd.read_csv('balance-scale.csv')
dataset

Unnamed: 0,Class,L-Weight,L-Distance,R-Weight,R-Distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5
...,...,...,...,...,...
620,L,5,5,5,1
621,L,5,5,5,2
622,L,5,5,5,3
623,L,5,5,5,4


In [3]:
dataset.describe()

Unnamed: 0,L-Weight,L-Distance,R-Weight,R-Distance
count,625.0,625.0,625.0,625.0
mean,3.0,3.0,3.0,3.0
std,1.415346,1.415346,1.415346,1.415346
min,1.0,1.0,1.0,1.0
25%,2.0,2.0,2.0,2.0
50%,3.0,3.0,3.0,3.0
75%,4.0,4.0,4.0,4.0
max,5.0,5.0,5.0,5.0


### Exploratory Data Analysis

In [4]:
#Displaying information about dataset.
dataset.info

<bound method DataFrame.info of     Class  L-Weight  L-Distance  R-Weight  R-Distance
0       B         1           1         1           1
1       R         1           1         1           2
2       R         1           1         1           3
3       R         1           1         1           4
4       R         1           1         1           5
..    ...       ...         ...       ...         ...
620     L         5           5         5           1
621     L         5           5         5           2
622     L         5           5         5           3
623     L         5           5         5           4
624     B         5           5         5           5

[625 rows x 5 columns]>

In [5]:
#Displaying the number of records having NULL values in the respective columns
dataset.isnull().sum()

Class         0
L-Weight      0
L-Distance    0
R-Weight      0
R-Distance    0
dtype: int64

In [6]:
dataset.shape

(625, 5)

In [7]:
dataset.head()

Unnamed: 0,Class,L-Weight,L-Distance,R-Weight,R-Distance
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [8]:
dataset.tail()

Unnamed: 0,Class,L-Weight,L-Distance,R-Weight,R-Distance
620,L,5,5,5,1
621,L,5,5,5,2
622,L,5,5,5,3
623,L,5,5,5,4
624,B,5,5,5,5


In [9]:
#Displaying the unique values in the dataset
dataset.nunique()

Class         3
L-Weight      5
L-Distance    5
R-Weight      5
R-Distance    5
dtype: int64

### Normalization of Data

In [10]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
dataset.columns

Index(['Class', 'L-Weight', 'L-Distance', 'R-Weight', 'R-Distance'], dtype='object')

In [12]:
#Taking columns having integer values 
column = ['L-Weight', 'L-Distance', 'R-Weight', 'R-Distance']

In [13]:
scaling = MinMaxScaler()
scaling.fit_transform(dataset[['L-Weight', 'L-Distance', 'R-Weight', 'R-Distance']])

array([[0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.25],
       [0.  , 0.  , 0.  , 0.5 ],
       ...,
       [1.  , 1.  , 1.  , 0.5 ],
       [1.  , 1.  , 1.  , 0.75],
       [1.  , 1.  , 1.  , 1.  ]])

### Categorical Encoding

In [14]:
dataset.Class

0      B
1      R
2      R
3      R
4      R
      ..
620    L
621    L
622    L
623    L
624    B
Name: Class, Length: 625, dtype: object

In [15]:
#Converting categorical data to numerical data
data = pd.get_dummies(dataset['Class'])
data

Unnamed: 0,B,L,R
0,1,0,0
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
620,0,1,0
621,0,1,0
622,0,1,0
623,0,1,0


### Training and Testing of Data

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X= dataset[column]
Y = dataset.Class

In [18]:
X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size=0.2,random_state=101,shuffle=False)

In [19]:
print(X.shape,X_train.shape,X_test.shape)
#Total 625 data points in which 80% of data which is 500 goes to training set and
# 20% of data which is 125 goes to testing set.

(625, 4) (500, 4) (125, 4)


In [20]:
X_train

Unnamed: 0,L-Weight,L-Distance,R-Weight,R-Distance
0,1,1,1,1
1,1,1,1,2
2,1,1,1,3
3,1,1,1,4
4,1,1,1,5
...,...,...,...,...
495,4,5,5,1
496,4,5,5,2
497,4,5,5,3
498,4,5,5,4


In [21]:
X_test

Unnamed: 0,L-Weight,L-Distance,R-Weight,R-Distance
500,5,1,1,1
501,5,1,1,2
502,5,1,1,3
503,5,1,1,4
504,5,1,1,5
...,...,...,...,...
620,5,5,5,1
621,5,5,5,2
622,5,5,5,3
623,5,5,5,4


In [22]:
Y_train

0      B
1      R
2      R
3      R
4      R
      ..
495    L
496    L
497    L
498    B
499    R
Name: Class, Length: 500, dtype: object

In [23]:
Y_test

500    L
501    L
502    L
503    L
504    B
      ..
620    L
621    L
622    L
623    L
624    B
Name: Class, Length: 125, dtype: object

### Descision Tree Classifier

In [24]:
from sklearn.tree import DecisionTreeClassifier

In [25]:
#Decision Tree Classifier helps in creating a model that predicts the value of a target variable
dtree = DecisionTreeClassifier()

In [26]:
# fitting model into training data
dtree.fit(X_train,Y_train)

DecisionTreeClassifier()

In [27]:
predictions = dtree.predict(X_test)

In [28]:
predictions

array(['L', 'L', 'L', 'B', 'R', 'L', 'B', 'R', 'R', 'R', 'L', 'R', 'R',
       'R', 'R', 'B', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'L',
       'L', 'L', 'L', 'L', 'L', 'L', 'L', 'B', 'R', 'L', 'L', 'R', 'R',
       'R', 'L', 'B', 'R', 'R', 'R', 'L', 'R', 'R', 'R', 'R', 'L', 'L',
       'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'B', 'R',
       'L', 'L', 'B', 'R', 'R', 'L', 'L', 'R', 'R', 'R', 'L', 'L', 'L',
       'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
       'L', 'L', 'B', 'R', 'L', 'L', 'L', 'R', 'R', 'L', 'L', 'L', 'L',
       'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
       'L', 'L', 'B', 'L', 'L', 'L', 'B', 'R'], dtype=object)

### Model Prediction and Evaluation

In [29]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [30]:
#classification report is used to measure the quality of prediction like whether it is true or false
classification_report(Y_test,predictions)

'              precision    recall  f1-score   support\n\n           B       0.00      0.00      0.00         9\n           L       1.00      0.88      0.93        88\n           R       0.74      1.00      0.85        28\n\n    accuracy                           0.84       125\n   macro avg       0.58      0.62      0.59       125\nweighted avg       0.87      0.84      0.85       125\n'

In [31]:
print("Accuracy:",metrics.accuracy_score(Y_test,predictions))

Accuracy: 0.84


In [32]:
confusion_matrix(Y_test,predictions)

array([[ 0,  0,  9],
       [10, 77,  1],
       [ 0,  0, 28]], dtype=int64)

### Hyperparameter Tuning using GridSearchCV

In [33]:
from sklearn.model_selection import GridSearchCV

In [34]:
param_dict = {
    "criterion":["gini", "entropy"],
    "max_depth":[1,2,3,4,None]
}

In [35]:
#basically GridSearchCV forms 2d coordinate space as we have given two parameters which is gini and entropy.
grid = GridSearchCV(dtree, param_grid = param_dict, cv =10, n_jobs = -1)

In [36]:
grid.fit(X_train, Y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, None]})

In [37]:
grid.best_params_

{'criterion': 'gini', 'max_depth': 4}

In [38]:
grid.best_estimator_

DecisionTreeClassifier(max_depth=4)

In [39]:
grid.best_score_

0.7

### RandomSearchCV

In [40]:
from sklearn.model_selection import RandomizedSearchCV 

In [41]:
random = RandomizedSearchCV(dtree, param_dict, cv=10,n_jobs = -1)

In [42]:
random.fit(X_train,Y_train)

RandomizedSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [1, 2, 3, 4, None]})

In [43]:
random.best_params_

{'max_depth': 4, 'criterion': 'gini'}

In [44]:
random.best_estimator_

DecisionTreeClassifier(max_depth=4)

In [45]:
random.best_score_

0.7

### Random Forest Classifier

Comparing the decision tree model to a random forest

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
rfc= RandomForestClassifier(n_estimators=400)

In [48]:
rfc.fit(X_train,Y_train)

RandomForestClassifier(n_estimators=400)

In [49]:
rfc_pred = rfc.predict(X_test)

In [50]:
classification_report(Y_test,rfc_pred)

'              precision    recall  f1-score   support\n\n           B       0.00      0.00      0.00         9\n           L       1.00      0.88      0.93        88\n           R       0.74      1.00      0.85        28\n\n    accuracy                           0.84       125\n   macro avg       0.58      0.62      0.59       125\nweighted avg       0.87      0.84      0.85       125\n'

In [51]:
confusion_matrix(Y_test,rfc_pred)

array([[ 0,  0,  9],
       [10, 77,  1],
       [ 0,  0, 28]], dtype=int64)

In [52]:
dataset['Class'].value_counts()

R    288
L    288
B     49
Name: Class, dtype: int64

In [53]:
accuracy_score(Y_test,predictions)

0.84

   ## Observations and Problem faced
   
    1.loading and displaying Balance Scale dataset.
    
    2.Statistical summary - This includes the count, mean, the min and max values and percentiles.
    we can see that all the numerical values have the same scale and similar ranges between 0 and 5.
    
    3.Summarizing and getting dimensions of dataset.
    
    4.Creating a validation dataset - estimating the accuracy of models using statistical method
    that is splitting this dataset into two, 80% of which is used for training and evaluating
    and rest 20% is hold back as validation dataset.
    
    5.Making predictions  by comparing  to the expected results in the validation set, 
    then calculating classification accuracy,confusion matrix and a classification report.
    
    6.we are getting accuracy 0.84 which is about 84% on the given dataset.
    
    7.On applying hyperparametertuning for better accuracy, accuracy score is decreased due to overfitting of data
    (high testing error)
    
    8.Applying Random Forest Classifier and then comparing it with descision tree model 
    same accuracy is obtained this is due to smaller dataset provided.
    
    