# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Load and Preview Dataset

In [3]:
df = pd.read_csv(r'E:\PYTHONCLASSJUPYTER\PrakashSenapati\2024_10_18_Decision Tree\Other Projects\car_evaluation.csv')

In [4]:
df.shape

(1728, 7)

In [5]:
df.head()

Unnamed: 0,buying_price,maint_price,no_of_doors,persons_capacity,lug_boot_size,car_safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [6]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df.columns = col_names

In [7]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [8]:
df.shape

(1728, 7)

# Summary of variables

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


<font color=green>__Frequency distribution of values in variables :__</font>

In [11]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
for col in col_names:
    print(df[col].value_counts())
    print('===================================')

buying
vhigh    432
high     432
med      432
low      432
Name: count, dtype: int64
maint
vhigh    432
high     432
med      432
low      432
Name: count, dtype: int64
doors
2        432
3        432
4        432
5more    432
Name: count, dtype: int64
persons
2       576
4       576
more    576
Name: count, dtype: int64
lug_boot
small    576
med      576
big      576
Name: count, dtype: int64
safety
low     576
med     576
high    576
Name: count, dtype: int64
class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64


- There are 7 variables in the dataset. All the variables are of categorical data type
- These are given by `buying`, `maint`, `doors`, `persons`, `lug_boot`, `safety` and `class`
- `class` is the target variable

# Explore `class` (Target) Variable

In [12]:
df['class'].value_counts()

class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

In [13]:
df.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

# Declare Feature Vector `(X)` and Target Variable `(y)`

In [14]:
X = df.drop(['class'], axis=1)
y = df['class']

In [15]:
X.shape

(1728, 6)

In [16]:
X.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med


In [17]:
y.shape

(1728,)

In [18]:
y.head()

0    unacc
1    unacc
2    unacc
3    unacc
4    unacc
Name: class, dtype: object

# Split data `(X, y)` into separate training and testing set

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [20]:
print( X_train.shape )
print( X_test.shape )
print( y_train.shape )
print( y_test.shape )

(1157, 6)
(571, 6)
(1157,)
(571,)


In [21]:
X_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
48,vhigh,vhigh,3,more,med,low
468,high,vhigh,3,4,small,low
155,vhigh,high,3,more,small,high
1721,low,low,5more,more,small,high
1208,med,low,2,more,small,high


In [22]:
X_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
599,high,high,4,2,med,high
1201,med,low,2,4,med,med
628,high,high,5more,2,big,med
1498,low,high,5more,4,med,med
1263,med,low,4,more,med,low


In [23]:
y_train.head()

48      unacc
468     unacc
155     unacc
1721     good
1208    unacc
Name: class, dtype: object

In [24]:
y_test.head()

599     unacc
1201      acc
628     unacc
1498      acc
1263    unacc
Name: class, dtype: object

# Feature Engineering

In [25]:
X_train.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
dtype: object

In [26]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
for col in col_names:
    print(X_train[col].value_counts())
    print('===================================')

buying
med      293
high     291
vhigh    287
low      286
Name: count, dtype: int64
maint
med      296
vhigh    289
high     288
low      284
Name: count, dtype: int64
doors
2        297
5more    292
3        286
4        282
Name: count, dtype: int64
persons
4       402
more    380
2       375
Name: count, dtype: int64
lug_boot
small    391
big      386
med      380
Name: count, dtype: int64
safety
high    389
low     386
med     382
Name: count, dtype: int64


In [27]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
for col in col_names:
    print(X_test[col].value_counts())
    print('===================================')

buying
low      146
vhigh    145
high     141
med      139
Name: count, dtype: int64
maint
low      148
high     144
vhigh    143
med      136
Name: count, dtype: int64
doors
4        150
3        146
5more    140
2        135
Name: count, dtype: int64
persons
2       201
more    196
4       174
Name: count, dtype: int64
lug_boot
med      196
big      190
small    185
Name: count, dtype: int64
safety
med     194
low     190
high    187
Name: count, dtype: int64


<font color=green>__Encode categorical variables :__</font>

In [28]:
encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [30]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
for col in col_names:
    print(X_train[col].value_counts())
    print('===================================')

buying
4    293
2    291
1    287
3    286
Name: count, dtype: int64
maint
4    296
1    289
2    288
3    284
Name: count, dtype: int64
doors
3    297
2    292
1    286
4    282
Name: count, dtype: int64
persons
2    402
1    380
3    375
Name: count, dtype: int64
lug_boot
2    391
3    386
1    380
Name: count, dtype: int64
safety
2    389
1    386
3    382
Name: count, dtype: int64


In [31]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
for col in col_names:
    print(X_test[col].value_counts())
    print('===================================')

buying
3    146
1    145
2    141
4    139
Name: count, dtype: int64
maint
3    148
2    144
1    143
4    136
Name: count, dtype: int64
doors
4    150
1    146
2    140
3    135
Name: count, dtype: int64
persons
3    201
1    196
2    174
Name: count, dtype: int64
lug_boot
1    196
3    190
2    185
Name: count, dtype: int64
safety
3    194
1    190
2    187
Name: count, dtype: int64


In [32]:
X_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
48,1,1,1,1,1,1
468,2,1,1,2,2,1
155,1,2,1,1,2,2
1721,3,3,2,1,2,2
1208,4,3,3,1,2,2


In [33]:
X_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
599,2,2,4,3,1,2
1201,4,3,3,2,1,3
628,2,2,2,3,3,3
1498,3,2,2,2,1,3
1263,4,3,4,1,1,1


# Decision Tree Classifier with criterion gini index

<font color=green>__Fit Model :__</font>

In [34]:
clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)
clf_gini.fit(X_train, y_train)

<font color=green>__Predict the Test set results with criterion gini index :__</font>

In [38]:
y_pred_gini = clf_gini.predict(X_test)

In [39]:
y_pred_gini.shape

(571,)

In [40]:
print(y_pred_gini)

['unacc' 'acc' 'unacc' 'acc' 'unacc' 'acc' 'unacc' 'unacc' 'acc' 'unacc'
 'acc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'acc'
 'unacc' 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc'
 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'acc' 'acc' 'acc' 'unacc' 'unacc'
 'unacc' 'unacc' 'unacc' 'acc' 'unacc' 'acc' 'acc' 'unacc' 'unacc' 'unacc'
 'unacc' 'unacc' 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc'
 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'acc' 'acc' 'unacc' 'unacc'
 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'acc' 'acc' 'unacc'
 'unacc' 'unacc' 'unacc' 'acc' 'unacc' 'unacc' 'acc' 'acc' 'acc' 'unacc'
 'unacc' 'acc' 'acc' 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'acc' 'unacc'
 'unacc' 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'acc' 'acc' 'acc'
 'unacc' 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc'
 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'acc' 'unacc' 'unacc' 'acc' 'unacc'
 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'acc' 

<font color=green>__Check accuracy score with criterion gini index :__</font>

In [48]:
print('Model (Testing-set) accuracy score with criterion gini index: {0:0.4f}'. format(accuracy_score(y_test, y_pred_gini)))

Model (Testing-set) accuracy score with criterion gini index: 0.8021


**`y_test` are the true class labels and `y_pred_gini` are the predicted class labels in the test-set**

<font color=green>__Compare the train-set and test-set accuracy :__</font>

In [44]:
y_pred_train_gini = clf_gini.predict(X_train)

In [46]:
print(y_pred_train_gini[0:10])

['unacc' 'unacc' 'unacc' 'acc' 'acc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc']


In [47]:
print('Training-set accuracy score with criterion gini index: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train_gini)))

Training-set accuracy score with criterion gini index: 0.7865


<font color=green>__Check for overfitting and underfitting :__</font>

In [49]:
print('Training set score: {:.4f}'.format(clf_gini.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(clf_gini.score(X_test, y_test)))

Training set score: 0.7865
Test set score: 0.8021


* Here, the training-set accuracy score is 0.7865 while the test-set accuracy to be 0.8021. These two values are quite comparable. So, there is no sign of overfitting.

# Decision Tree Classifier with criterion entropy

<font color=green>__Fit Model :__</font>

In [50]:
clf_en = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
clf_en.fit(X_train, y_train)

<font color=green>__Predict the Test set results with criterion entropy :__</font>

In [51]:
y_pred_en = clf_en.predict(X_test)

In [52]:
print(y_pred_en)

['unacc' 'acc' 'unacc' 'acc' 'unacc' 'acc' 'unacc' 'unacc' 'acc' 'unacc'
 'acc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'acc'
 'unacc' 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc'
 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'acc' 'acc' 'acc' 'unacc' 'unacc'
 'unacc' 'unacc' 'unacc' 'acc' 'unacc' 'acc' 'acc' 'unacc' 'unacc' 'unacc'
 'unacc' 'unacc' 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc'
 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'acc' 'acc' 'unacc' 'unacc'
 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'acc' 'acc' 'unacc'
 'unacc' 'unacc' 'unacc' 'acc' 'unacc' 'unacc' 'acc' 'acc' 'acc' 'unacc'
 'unacc' 'acc' 'acc' 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'acc' 'unacc'
 'unacc' 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'acc' 'acc' 'acc'
 'unacc' 'unacc' 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc'
 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'acc' 'unacc' 'unacc' 'acc' 'unacc'
 'acc' 'unacc' 'unacc' 'unacc' 'unacc' 'unacc' 'acc' 

<font color=green>__Check accuracy score with criterion entropy :__</font>

In [53]:
print('Model (Testing-set) accuracy score with criterion entropy: {0:0.4f}'. format(accuracy_score(y_test, y_pred_en)))

Model (Testing-set) accuracy score with criterion entropy: 0.8021


<font color=green>__Compare the train-set and test-set accuracy :__</font>

In [55]:
y_pred_train_en = clf_en.predict(X_train)
print(y_pred_train_en)

['unacc' 'unacc' 'unacc' ... 'unacc' 'unacc' 'acc']


<font color=green>__Check for overfitting and underfitting :__</font>

In [56]:
print('Training set score: {:.4f}'.format(clf_en.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(clf_en.score(X_test, y_test)))

Training set score: 0.7865
Test set score: 0.8021


* The training-set accuracy score is 0.7865 while the test-set accuracy to be 0.8021. These two values are quite comparable. So, there is no sign of overfitting.

# Confusion Matrix

In [57]:
cm = confusion_matrix(y_test, y_pred_en)
print('Confusion matrix\n\n', cm)

Confusion matrix

 [[ 73   0  56   0]
 [ 20   0   0   0]
 [ 12   0 385   0]
 [ 25   0   0   0]]


#  Classification Report

In [58]:
print(classification_report(y_test, y_pred_en))

              precision    recall  f1-score   support

         acc       0.56      0.57      0.56       129
        good       0.00      0.00      0.00        20
       unacc       0.87      0.97      0.92       397
       vgood       0.00      0.00      0.00        25

    accuracy                           0.80       571
   macro avg       0.36      0.38      0.37       571
weighted avg       0.73      0.80      0.77       571



# Results and conclusion

1.	In this project, I build a Decision-Tree Classifier model to predict the safety of the car. I build two models, one with criterion `gini index` and another one with criterion `entropy`. The model yields a very good performance as indicated by the model accuracy in both the cases which was found to be 0.8021.
2.	In the model with criterion `gini index`, the training-set accuracy score is 0.7865 while the test-set accuracy to be 0.8021. These two values are quite comparable. So, there is no sign of overfitting.
3.	Similarly, in the model with criterion `entropy`, the training-set accuracy score is 0.7865 while the test-set accuracy to be 0.8021.We get the same values as in the case with criterion `gini`. So, there is no sign of overfitting.
4.	In both the cases, the training-set and test-set accuracy score is the same. It may happen because of small dataset.
5.	The confusion matrix and classification report yields very good model performance.