In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

This Model contains variables that are related to car evaluation.  
Things that would be related to buying a car and wether the car is accptable to buy or not.  

I will see if I can use a Decision Tree Classifier to produce a reasonable result.

# Load Data 

In [9]:
# Load dataset to a pandas DataFrame
path_to_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'accep']
df = pd.read_csv(path_to_data, names=column_names)
print(df)

     buying  maint  doors persons lug_boot safety  accep
0     vhigh  vhigh      2       2    small    low  unacc
1     vhigh  vhigh      2       2    small    med  unacc
2     vhigh  vhigh      2       2    small   high  unacc
3     vhigh  vhigh      2       2      med    low  unacc
4     vhigh  vhigh      2       2      med    med  unacc
...     ...    ...    ...     ...      ...    ...    ...
1723    low    low  5more    more      med    med   good
1724    low    low  5more    more      med   high  vgood
1725    low    low  5more    more      big    low  unacc
1726    low    low  5more    more      big    med   good
1727    low    low  5more    more      big   high  vgood

[1728 rows x 7 columns]


# Features and Labels

One hot encode the features and set the labels to binary values


In [3]:
target_column = 'accep'
raw_feature_columns = [col for col in column_names if col != target_column]

# Create dummy variables from the feature columns
X = pd.get_dummies(df[raw_feature_columns], drop_first=True)

# Convert target column to binary variable; 0 if 'unacc', 1 otherwise
df[target_column] = np.where(df[target_column] == 'unacc', 0, 1)
y = df[target_column]

# Split The Data
Train Test Split gives us something to test after we train our training set.  
This will give us something to test out predictions so we don't test on trained data. 

In [4]:
# Split the full dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size=0.3)

# Train Model

In [5]:
# 1. Create a decision stump base model using the Decision Tree Classifier and print its parameters
# a decision stump uses a depth of 1 and only makes 2 leaf nodes
decision_stump = DecisionTreeClassifier(max_depth=1)
decision_stump.fit(X_train, y_train)
y_pred = decision_stump.predict(X_test)
print(decision_stump.get_params())

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 1, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}


# Boost the Decision Tree

The Adaboostclassifier will give us better parameters that will produce a better score.

In [6]:
# 2. Create an Adaptive Boost Classifier and print its parameters
ada_classifier = AdaBoostClassifier(estimator=decision_stump, n_estimators=5)
print(ada_classifier.get_params())

{'algorithm': 'SAMME.R', 'base_estimator': 'deprecated', 'estimator__ccp_alpha': 0.0, 'estimator__class_weight': None, 'estimator__criterion': 'gini', 'estimator__max_depth': 1, 'estimator__max_features': None, 'estimator__max_leaf_nodes': None, 'estimator__min_impurity_decrease': 0.0, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__min_weight_fraction_leaf': 0.0, 'estimator__random_state': None, 'estimator__splitter': 'best', 'estimator': DecisionTreeClassifier(max_depth=1), 'learning_rate': 1.0, 'n_estimators': 5, 'random_state': None}


# Predict

In [7]:
# 3. Fit the Adaptive Boost Classifier to the training data and get the list of predictions
ada_classifier.fit(X_train, y_train)
y_pred = ada_classifier.predict(X_test)

# 4. Calculate the accuracy, precision, recall, and f1-score on the testing data
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Prediction Outcome

In [8]:
# 5. Remove the comments from the following code block to print the confusion matrix
test_conf_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=[1, 0]), 
    index=['actual yes', 'actual no'], 
    columns=['predicted yes', 'predicted no']
)
print(f'Confusion Matrix:\n{test_conf_matrix.to_string()}')

print(f'Accuracy score : {accuracy}')
print(f'Precision score : {precision}')
print(f'Recall score : {recall}')
print(f'f1 score : {f1}')

Confusion Matrix:
            predicted yes  predicted no
actual yes            129            25
actual no              49           316
Accuracy score : 0.8574181117533719
Precision score : 0.7247191011235955
Recall score : 0.8376623376623377
f1 score : 0.7771084337349398


# Conclusion

The accuracy is at 85% which isnt too bad for a simple model like this.  
looking at the f1 score will give a better representation on how the model is performing.  
With a f1 score of .77 its not performing good enough to be super useful.  
We could use other models like Randomized Decision Tree, or SVM to see if they might perform better. 