# Assignment 6.1

In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, classification_report

## Activity 4.01: Car Data Classification

In [2]:
# Read the raw car data from disk
df = pd.read_csv('car.csv')

# Quick look at the data: first rows, per-column summary, and null counts
preview = df.head()
summary = df.describe()
missing_counts = df.isna().sum()

print(preview)
print(f"\n{summary}")
print(f"\nMissing values:\n{missing_counts}")


  buying maintenance doors persons luggage_boot safety  class
0  vhigh       vhigh     2       2        small    low  unacc
1  vhigh       vhigh     2       2        small    med  unacc
2  vhigh       vhigh     2       2        small   high  unacc
3  vhigh       vhigh     2       2          med    low  unacc
4  vhigh       vhigh     2       2          med    med  unacc

       buying maintenance doors persons luggage_boot safety  class
count    1728        1728  1728    1728         1728   1728   1728
unique      4           4     4       3            3      3      4
top     vhigh       vhigh     2       2        small    low  unacc
freq      432         432   432     576          576    576   1210

Missing values:
buying          0
maintenance     0
doors           0
persons         0
luggage_boot    0
safety          0
class           0
dtype: int64


Even though I don't believe a data leak is possible with this pipeline, as a matter of best practice I split the data into features and target, and into training and testing sets, as early as possible to avoid inadvertent data leaks.

In [3]:
# Split the data into features and target variable.
# Selecting/dropping the column (instead of df.pop) leaves `df` untouched, so this
# cell can be re-run without a KeyError for an already-removed 'class' column.
label = df['class']
features = df.drop(columns='class')

# Split the dataset into training and testing sets:
# 10% of the rows are held out for testing, and the fixed seed keeps the split reproducible
features_train, features_test, label_train, label_test = train_test_split(
    features, label, test_size=0.1, random_state=620)

In [4]:
# Learn the class labels from the training target only, then map them to integer codes
label_encoder = LabelEncoder().fit(label_train)
label_train_enc = label_encoder.transform(label_train)

# Likewise for the predictors: learn each column's categories from the
# training features, then encode those same rows
features_encoder = OrdinalEncoder().fit(features_train)
features_train_enc = features_encoder.transform(features_train)

In [5]:
# Verify the features encoder categories: one array per feature column, in column order.
# Note the arrays are in sorted (alphabetical) order rather than domain order,
# e.g. 'high' comes before 'low' in the output below.
features_encoder.categories_

[array(['high', 'low', 'med', 'vhigh'], dtype=object),
 array(['high', 'low', 'med', 'vhigh'], dtype=object),
 array(['2', '3', '4', '5more'], dtype=object),
 array(['2', '4', 'more'], dtype=object),
 array(['big', 'med', 'small'], dtype=object),
 array(['high', 'low', 'med'], dtype=object)]

In [6]:
# Train the Decision Tree Classifier.
# A fixed random_state makes the fitted tree (and the metrics below) reproducible
# across runs, matching the seed already used for the train/test split and the ensembles.
model = DecisionTreeClassifier(random_state=620)
model.fit(features_train_enc, label_train_enc)

# Evaluate the model

# Encode the test labels and features with the encoders fitted on the training data
label_test_enc = label_encoder.transform(label_test)
features_test_enc = features_encoder.transform(features_test)

# Predict once and reuse the predictions for the confusion matrix and the report
model_predictions = model.predict(features_test_enc)

# Accuracy Score
model_score = model.score(features_test_enc, label_test_enc)
print(f"Accuracy Score: {model_score:.3f}")

# Confusion Matrix (rows: true class, columns: predicted class)
model_confusion_matrix = confusion_matrix(label_test_enc, model_predictions)
print("\nConfusion Matrix:")
print(model_confusion_matrix)

# Classification Report, labelled with the original class names
model_classification_report = classification_report(
    label_test_enc, model_predictions, target_names=label_encoder.classes_)
print("\nClassification Report:")
print(model_classification_report)


Accuracy Score: 0.977

Confusion Matrix:
[[ 39   0   2   0]
 [  1   7   0   0]
 [  1   0 116   0]
 [  0   0   0   7]]

Classification Report:
              precision    recall  f1-score   support

         acc       0.95      0.95      0.95        41
        good       1.00      0.88      0.93         8
       unacc       0.98      0.99      0.99       117
       vgood       1.00      1.00      1.00         7

    accuracy                           0.98       173
   macro avg       0.98      0.95      0.97       173
weighted avg       0.98      0.98      0.98       173



## Activity 4.02: Random Forest Classification for Your Car Rental Company

### Random Forests

In [7]:
# Train the Random Forest Classifier (100 trees, depth capped at 15, fixed seed)
model_rfc = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=620)
model_rfc.fit(features_train_enc, label_train_enc)

# Evaluate the Random Forest model on the encoded test set
model_rfc_score = model_rfc.score(features_test_enc, label_test_enc)
print(f"Random Forest Accuracy Score: {model_rfc_score:.3f}")

# Predict once and reuse the predictions for the confusion matrix and the report
model_rfc_predictions = model_rfc.predict(features_test_enc)

# Confusion matrix (rows: true class, columns: predicted class)
model_rfc_confusion_matrix = confusion_matrix(label_test_enc, model_rfc_predictions)
print("\nRandom Forest Confusion Matrix:")
print(model_rfc_confusion_matrix)

# Per-class precision/recall/F1, labelled with the original class names
model_rfc_classification_report = classification_report(
    label_test_enc, model_rfc_predictions, target_names=label_encoder.classes_)
print("\nRandom Forest Classification Report:")
print(model_rfc_classification_report)


Random Forest Accuracy Score: 0.977

Random Forest Confusion Matrix:
[[ 40   0   1   0]
 [  0   7   0   1]
 [  2   0 115   0]
 [  0   0   0   7]]

Random Forest Classification Report:
              precision    recall  f1-score   support

         acc       0.95      0.98      0.96        41
        good       1.00      0.88      0.93         8
       unacc       0.99      0.98      0.99       117
       vgood       0.88      1.00      0.93         7

    accuracy                           0.98       173
   macro avg       0.95      0.96      0.95       173
weighted avg       0.98      0.98      0.98       173



In [8]:
# Pair each feature column with the importance reported by the fitted random forest
feature_names = features.columns
rfc_feature_importances = model_rfc.feature_importances_

rfc_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': rfc_feature_importances,
})

print("\nFeature Importances:")
print(rfc_importance_df)



Feature Importances:
        Feature  Importance
0        buying    0.196859
1   maintenance    0.159864
2         doors    0.061116
3       persons    0.230212
4  luggage_boot    0.086462
5        safety    0.265486


### Extremely Randomized Trees

In [9]:
# Train the Extra Trees Classifier (100 trees, depth capped at 15, fixed seed)
model_erfc = ExtraTreesClassifier(n_estimators=100, max_depth=15, random_state=620)
model_erfc.fit(features_train_enc, label_train_enc)

# Evaluate the Extra Random Forest model on the encoded test set
model_erfc_score = model_erfc.score(features_test_enc, label_test_enc)
print(f"Extra Random Forest Accuracy Score: {model_erfc_score:.3f}")

# Predict once and reuse the predictions for the confusion matrix and the report
model_erfc_predictions = model_erfc.predict(features_test_enc)

# Confusion matrix (rows: true class, columns: predicted class)
model_erfc_confusion_matrix = confusion_matrix(label_test_enc, model_erfc_predictions)
print("\nExtra Random Forest Confusion Matrix:")
print(model_erfc_confusion_matrix)

# Per-class precision/recall/F1, labelled with the original class names
model_erfc_classification_report = classification_report(
    label_test_enc, model_erfc_predictions, target_names=label_encoder.classes_)
print("\nExtra Random Forest Classification Report:")
print(model_erfc_classification_report)

Extra Random Forest Accuracy Score: 0.994

Extra Random Forest Confusion Matrix:
[[ 41   0   0   0]
 [  1   7   0   0]
 [  0   0 117   0]
 [  0   0   0   7]]

Extra Random Forest Classification Report:
              precision    recall  f1-score   support

         acc       0.98      1.00      0.99        41
        good       1.00      0.88      0.93         8
       unacc       1.00      1.00      1.00       117
       vgood       1.00      1.00      1.00         7

    accuracy                           0.99       173
   macro avg       0.99      0.97      0.98       173
weighted avg       0.99      0.99      0.99       173



In [10]:
# Tabulate the importances reported by the fitted extra-trees model, one row per feature
erfc_feature_importances = model_erfc.feature_importances_
erfc_importance_df = (
    pd.DataFrame({'Feature': feature_names})
    .assign(Importance=erfc_feature_importances)
)

print("\nExtra Random Forest Feature Importances:")
print(erfc_importance_df)


Extra Random Forest Feature Importances:
        Feature  Importance
0        buying    0.204091
1   maintenance    0.180577
2         doors    0.052922
3       persons    0.231161
4  luggage_boot    0.073799
5        safety    0.257449
