In [33]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
%matplotlib inline

## Participation

Turn to your neighbor and discuss what datasets with high and low entropy look like. When you run some classification methods and look at the precision scores you get in class, does that correspond to high entropy or low entropy? If accuracy is low, what does that say about the entropy?

**Ans:** Entropy measures uncertainty in a dataset. High entropy indicates randomness, such as when classes are evenly distributed, making predictions harder. Low entropy suggests predictability, like when one class dominates, but this can also lead to imbalanced predictions.

If classification accuracy is low, the dataset might have high entropy, making it challenging for the model to distinguish patterns. However, low accuracy in a low-entropy dataset could point to poor model performance, underfitting, or insufficient features rather than data complexity.

The data comes from the UCI Machine Learning Repository and can also be found on [Kaggle](https://www.kaggle.com/datasets/elikplim/car-evaluation-data-set?resource=download).

In [None]:
!wget https://archive.ics.uci.edu/static/public/19/car+evaluation.zip -o /dev/null
!unzip car+evaluation.zip

Archive:  car+evaluation.zip
replace car.c45-names? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# car.data ships without a header row. header=None keeps the first record as data;
# the default (header=0) would consume it as column labels and silently drop one sample.
# Columns are therefore numbered 0..6 until descriptive names are assigned.
df = pd.read_csv('car.data', header=None)
df

In [None]:
# dimensions of the frame as a (n_rows, n_columns) tuple
frame_shape = df.shape
display(frame_shape)

# first five records of the dataset
df.head(n=5)

In [None]:
# Descriptive labels for the seven columns, in file order.
col_names = [
    'buying', 'maint', 'doors', 'persons',
    'lug_boot', 'safety', 'class',
]

# Attach the labels to the frame.
df.columns = col_names

# Echo the labels as the cell output.
col_names

In [None]:
# Preview the first five rows of the dataset once more.
df.head(n=5)

In [None]:
# print a concise summary of the frame (index, columns, dtypes, non-null counts)
df.info()

In [None]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# Print how often each distinct value occurs, one column at a time.
for column in col_names:
    counts = df[column].value_counts()
    print(counts)

# Observations:
# - doors and persons are categorical in nature, so they are treated as categorical variables.
# - The dataset has 7 variables, all of categorical data type:
#   buying, maint, doors, persons, lug_boot, safety and class.
# - class is the target variable.

In [None]:
# frequency of each label in the target column
df['class'].value_counts()

In [None]:
# number of missing values in each column
df.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
y = df['class']
X = df.drop(columns=['class'])

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# (rows, columns) of the training and test feature sets
X_train.shape, X_test.shape

In [None]:
# check the data type of every feature column in X_train

X_train.dtypes

In [None]:
# import category encoders
!pip install category_encoders
import category_encoders as ce
X_train.head()

In [None]:
# Ordinal-encode all six feature columns.
encoder = ce.OrdinalEncoder(
    cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'],
)

# Fit the encoder on the training rows only, then apply the same fitted
# encoder to the test rows.
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

X_train.head()

### Data is READY!!!!
Time to import our [DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [None]:
# import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# instantiate the DecisionTreeClassifier model with criterion gini index
# (tree depth is capped at 3; random_state is fixed for reproducibility)
clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)

# fit the model; fit() returns the estimator, so the fitted model's own repr is
# the cell output (the stale pasted repr of an old scikit-learn version was removed)
clf_gini.fit(X_train, y_train)

In [None]:
# predict class labels for the test set with the gini tree
y_pred_gini = clf_gini.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# y_test holds the true class labels of the test set; y_pred_gini holds the predicted ones.
gini_test_accuracy = accuracy_score(y_test, y_pred_gini)
print('Model accuracy score with criterion gini index: {0:0.4f}'.format(gini_test_accuracy))

In [None]:
# predict class labels for the training set with the gini tree
y_pred_train_gini = clf_gini.predict(X_train)

# show the predicted labels as the cell output
y_pred_train_gini

In [None]:
# accuracy of the gini tree's predictions on the training set
gini_train_accuracy = accuracy_score(y_train, y_pred_train_gini)
print('Training-set accuracy score: {0:0.4f}'.format(gini_train_accuracy))

In [None]:
# Accuracy of the gini tree on the training set and on the test set.
gini_train_score = clf_gini.score(X_train, y_train)
gini_test_score = clf_gini.score(X_test, y_test)

print('Training set score: {:.4f}'.format(gini_train_score))

print('Test set score: {:.4f}'.format(gini_test_score))

# In the recorded run the training-set accuracy was 0.7865 and the test-set accuracy 0.8021.
# The two values are quite comparable, so there is no sign of overfitting.

In [None]:
from sklearn import tree

# Draw the already-fitted gini tree. The model is NOT re-fitted here: a plotting
# cell should not retrain the estimator as a side effect.
fig, ax = plt.subplots(figsize=(12, 8))

tree.plot_tree(
    clf_gini,
    # plain lists are accepted by every scikit-learn version
    feature_names=list(X_train.columns),
    # classes_ holds the target labels in the sorted order the tree uses internally
    class_names=[str(label) for label in clf_gini.classes_],
    filled=True,
    ax=ax,
);  # trailing semicolon suppresses the list-of-annotations repr

In [None]:
import graphviz  # not part of every local install; run in Colab or `pip install graphviz`

# Export the fitted gini tree to DOT source.
dot_data = tree.export_graphviz(
    clf_gini,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=list(X_train.columns),
    # class_names must be the distinct target labels in sorted order (one per class),
    # not the full y_train Series; clf_gini.classes_ provides exactly that
    class_names=[str(label) for label in clf_gini.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = graphviz.Source(dot_data)

# render the graph as the cell output
graph

In [None]:
# Decision tree that selects splits with the entropy criterion
# (depth capped at 3, fixed random_state).
clf_en = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=0,
)

# Train on the training data; the fitted estimator is the cell output.
clf_en.fit(X_train, y_train)

In [None]:
# predict class labels for the test set with the entropy tree
y_pred_en = clf_en.predict(X_test)

In [None]:

# accuracy of the entropy tree's predictions on the test set
entropy_test_accuracy = accuracy_score(y_test, y_pred_en)
print('Model accuracy score with criterion entropy: {0:0.4f}'.format(entropy_test_accuracy))

In [None]:
# predict class labels for the training set with the entropy tree
y_pred_train_en = clf_en.predict(X_train)

# show the predicted labels as the cell output
y_pred_train_en

In [None]:
# accuracy of the entropy tree's predictions on the training set
entropy_train_accuracy = accuracy_score(y_train, y_pred_train_en)
print('Training-set accuracy score: {0:0.4f}'.format(entropy_train_accuracy))

In [None]:
# Accuracy of the entropy tree on the training set and on the test set.
entropy_train_score = clf_en.score(X_train, y_train)
entropy_test_score = clf_en.score(X_test, y_test)

print('Training set score: {:.4f}'.format(entropy_train_score))

print('Test set score: {:.4f}'.format(entropy_test_score))

In [None]:
from sklearn import tree

# Draw the already-fitted entropy tree. The model is NOT re-fitted here: a plotting
# cell should not retrain the estimator as a side effect.
fig, ax = plt.subplots(figsize=(12, 8))

tree.plot_tree(
    clf_en,
    # plain lists are accepted by every scikit-learn version
    feature_names=list(X_train.columns),
    # classes_ holds the target labels in the sorted order the tree uses internally
    class_names=[str(label) for label in clf_en.classes_],
    filled=True,
    ax=ax,
);  # trailing semicolon suppresses the list-of-annotations repr

In [None]:

# Export the fitted entropy tree to DOT source.
dot_data = tree.export_graphviz(
    clf_en,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=list(X_train.columns),
    # class_names must be the distinct target labels in sorted order (one per class),
    # not the full y_train Series; clf_en.classes_ provides exactly that
    class_names=[str(label) for label in clf_en.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = graphviz.Source(dot_data)

# render the graph as the cell output
graph

In [None]:
from sklearn.metrics import classification_report

# Text report of the main classification metrics for the entropy tree on the test set.
report_en = classification_report(y_test, y_pred_en)
print(report_en)

In this project, I built a Decision-Tree Classifier model to predict the evaluation class of a car (the `class` column) from its attributes. I built two models, one with the gini index criterion and another with the entropy criterion. The model yields very good performance, as indicated by the model accuracy, which was found to be 0.8021 in both cases.
In the model with the gini index criterion, the training-set accuracy score is 0.7865 while the test-set accuracy is 0.8021. These two values are quite comparable, so there is no sign of overfitting.
Similarly, in the model with the entropy criterion, the training-set accuracy score is 0.7865 while the test-set accuracy is 0.8021. We get the same values as in the case with the gini criterion, so there is no sign of overfitting.
Both models produce identical training-set and test-set accuracy scores. This may be because the dataset is small.
The confusion matrix and classification report indicate very good model performance.