In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn import preprocessing

#read file
df = pd.read_csv("car_eval_train.csv")

In [18]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,med,med,3,4,med,med,acc
1,med,high,3,2,small,med,unacc
2,low,high,5more,more,small,med,acc
3,med,low,2,4,med,high,good
4,vhigh,low,3,2,small,high,unacc


In [None]:
names = list(df.columns.values[:-1])
print(names)

#identifying columns   
for item in names:
    uniqueVals = df[item].unique()
    print(item+": ")
    print(uniqueVals)



['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
buying: 
['med' 'low' 'vhigh' 'high']
maint: 
['med' 'high' 'low' 'vhigh']
doors: 
['3' '5more' '2' '4']
persons: 
['4' '2' 'more']
lug_boot: 
['med' 'small' 'big']
safety: 
['med' 'high' 'low']


In [None]:
class_types = df.columns.values[-1]
df[class_types].value_counts()

class
unacc    968
acc      307
good      55
vgood     52
Name: count, dtype: int64

In [None]:
#encode into numbers for model

df_en = pd.get_dummies(df, columns=names, drop_first=True)
df_en = df_en.replace({True: 1, False: 0})
class_mapping = {"unacc": 0,
                 "acc": 1,
                 "good": 2,
                 "vgood": 3}
df_en['class'] = df_en['class'].map(class_mapping)

class_col = df_en['class']
df_en.drop(columns=['class'], inplace=True)
last_col_pos = df_en.columns.get_loc('safety_med') + 1
df_en.insert(last_col_pos, 'class', class_col)
df_en.head()

  df_en = df_en.replace({True: 1, False: 0})


Unnamed: 0,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med,class
0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,1,1
1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0
2,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,1
3,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,2
4,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0


# Training the model
- Random Forest

In [40]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix

# Assume df_en is your one-hot encoded DataFrame with a 'class' target column.
X = df_en.drop(columns=["class"])  # Features
y = df_en["class"]                 # Target

# Split data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Gradient Boosting Classifier with the best-found hyperparameters.
gb_clf = GradientBoostingClassifier(
    n_estimators=700,       # Number of boosting stages
    learning_rate=0.1,      # Step size shrinkage
    max_depth=3,            # Maximum depth of individual estimators
    subsample=0.9,          # Fraction of samples used for fitting each base learner
    random_state=42         # Ensures reproducibility
)

# Train the model on the full training set.
gb_clf.fit(X_train, y_train)

# Evaluate the model on the validation set.
val_preds = gb_clf.predict(X_val)
val_accuracy = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# Identify misclassified samples
misclassified_indices = np.where(y_val != val_preds)[0]
misclassified_samples = X_val.iloc[misclassified_indices]
misclassified_true_labels = y_val.iloc[misclassified_indices]
misclassified_pred_labels = val_preds[misclassified_indices]

# Print misclassified samples with their true and predicted labels
print("\nMisclassified Samples:")
for i in range(len(misclassified_samples)):
    print(f"Sample index: {misclassified_samples.index[i]}")
    print(f"True label: {misclassified_true_labels.iloc[i]}")
    print(f"Predicted label: {misclassified_pred_labels[i]}")
    print(f"Sample features:\n{misclassified_samples.iloc[i]}\n")

# If you have a test DataFrame (df_test) with an 'id' column, generate a submission file.
if 'df_test' in globals():
    # Ensure df_test is preprocessed similarly to df_en.
    test_preds = gb_clf.predict(df_test.drop(columns=["id"]))
    submission = pd.DataFrame({
        "id": df_test["id"],
        "class": test_preds
    })
    submission.to_csv("submission.csv", index=False)
    print("Submission file saved as submission.csv")




Validation Accuracy: 0.97

Misclassified Samples:
Sample index: 192
True label: 3
Predicted label: 2
Sample features:
buying_low        1
buying_med        0
buying_vhigh      0
maint_low         0
maint_med         1
maint_vhigh       0
doors_3           0
doors_4           1
doors_5more       0
persons_4         1
persons_more      0
lug_boot_med      1
lug_boot_small    0
safety_low        0
safety_med        0
Name: 192, dtype: int64

Sample index: 477
True label: 0
Predicted label: 2
Sample features:
buying_low        1
buying_med        0
buying_vhigh      0
maint_low         0
maint_med         1
maint_vhigh       0
doors_3           0
doors_4           0
doors_5more       0
persons_4         0
persons_more      1
lug_boot_med      0
lug_boot_small    1
safety_low        0
safety_med        0
Name: 477, dtype: int64

Sample index: 425
True label: 0
Predicted label: 1
Sample features:
buying_low        0
buying_med        0
buying_vhigh      1
maint_low         0
maint_med       