In [1]:
# Import pandas for data loading and manipulation
import pandas as pd


In [2]:
# URL of the dataset (car dealership dataset)
file_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Applied-Artificial-Intelligence-Workshop/master/Datasets/car.csv'

In [3]:
# Read the CSV file directly from the URL
df = pd.read_csv(file_url)

In [4]:
# Import label encoding tool from scikit-learn
from sklearn import preprocessing

In [5]:
# Function to encode a categorical column into numeric labels
def encode(data_frame, column):
    label_encoder = preprocessing.LabelEncoder()     # create label encoder
    label_encoder.fit(data_frame[column].unique())   # learn all unique values
    return label_encoder.transform(data_frame[column])  # transform into numbers

In [6]:
# Apply label encoding to every column in the dataframe
for column in df.columns:
    df[column] = encode(df, column)

In [7]:
# Separate the target label column ("class")
label = df.pop('class')

In [8]:
# Split the dataset into training and testing sets (90% train, 10% test)
from sklearn import model_selection
features_train, features_test, label_train, label_test = model_selection.train_test_split(
    df, label, test_size=0.1, random_state=88
)


In [9]:
# Import Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Create a Random Forest model with 100 trees and limited depth to avoid overfitting
random_forest_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    random_state=168
)

In [11]:
# Train the Random Forest model on training data
random_forest_classifier.fit(features_train, label_train)

In [12]:
# Predict the classes on the test set
rf_preds_test = random_forest_classifier.predict(features_test)

In [13]:
# Show the predictions
rf_preds_test

array([0, 0, 2, 0, 0, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 3, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 0, 2, 0, 0, 0, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0,
       0, 2, 2, 2, 0, 2, 3, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2,
       2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2,
       0, 2, 2, 0, 2, 2, 3, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 2, 2, 0, 3, 3, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 0, 2,
       2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 0, 2])

In [14]:
# Import classification report to evaluate performance
from sklearn.metrics import classification_report


In [15]:
# Display precision, recall, F1-score for each class
print(classification_report(label_test, rf_preds_test))

              precision    recall  f1-score   support

           0       0.67      0.76      0.71        42
           1       0.00      0.00      0.00         9
           2       0.92      0.96      0.94       114
           3       0.83      0.62      0.71         8

    accuracy                           0.84       173
   macro avg       0.60      0.59      0.59       173
weighted avg       0.80      0.84      0.82       173



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Import confusion matrix to view errors in classification
from sklearn.metrics import confusion_matrix


In [17]:
# Generate confusion matrix for the Random Forest predictions
confusion_matrix(label_test, rf_preds_test)

array([[ 32,   0,  10,   0],
       [  8,   0,   0,   1],
       [  5,   0, 109,   0],
       [  3,   0,   0,   5]])

In [18]:
# Get feature importance scores from Random Forest
rf_varimp = random_forest_classifier.feature_importances_

In [19]:
# Display which features contributed most to the decision-making
rf_varimp

array([0.12676384, 0.10366314, 0.02119621, 0.35266673, 0.05915769,
       0.33655239])

In [20]:
# Import Extra Trees classifier (more randomness than Random Forest)
from sklearn.ensemble import ExtraTreesClassifier

In [21]:
# Create Extra Trees model with same parameters
extra_trees_classifier = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=6,
    random_state=168
)

In [22]:
# Train the Extra Trees model
extra_trees_classifier.fit(features_train, label_train)

In [23]:
# Predict on the test dataset
et_preds_test = extra_trees_classifier.predict(features_test)


In [24]:
# Show Extra Trees predictions
et_preds_test

array([0, 0, 2, 0, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 0, 2, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0,
       0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2,
       2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2,
       0, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 0, 2,
       2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2])

In [25]:
# Classification report for Extra Trees
print(classification_report(label_test, et_preds_test))

              precision    recall  f1-score   support

           0       0.61      0.67      0.64        42
           1       0.00      0.00      0.00         9
           2       0.89      0.98      0.93       114
           3       1.00      0.12      0.22         8

    accuracy                           0.82       173
   macro avg       0.62      0.44      0.45       173
weighted avg       0.78      0.82      0.78       173



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
# Confusion matrix for Extra Trees
confusion_matrix(label_test, et_preds_test)

array([[ 28,   0,  14,   0],
       [  9,   0,   0,   0],
       [  2,   0, 112,   0],
       [  7,   0,   0,   1]])

In [27]:
# Get feature importances from Extra Trees
et_varimp = extra_trees_classifier.feature_importances_

In [28]:
# Display the feature importance scores
et_varimp

array([0.08844544, 0.0702334 , 0.01440408, 0.37662014, 0.05965896,
       0.39063797])