In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, StackingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#read file
df = pd.read_csv("car_eval_train.csv")
df_test = pd.read_csv("car_eval_test.csv")

In [96]:
df_test.head()

Unnamed: 0,id,buying,maint,doors,persons,lug_boot,safety
0,0,low,vhigh,3,more,big,high
1,1,med,med,4,2,big,high
2,2,med,low,2,more,big,high
3,3,med,low,3,4,small,med
4,4,high,low,2,4,med,low


In [97]:
names = list(df.columns.values[:-1])
print(names)

#identifying columns   
for item in names:
    uniqueVals = df[item].unique()
    print(item+": ")
    print(uniqueVals)



['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
buying: 
['med' 'low' 'vhigh' 'high']
maint: 
['med' 'high' 'low' 'vhigh']
doors: 
['3' '5more' '2' '4']
persons: 
['4' '2' 'more']
lug_boot: 
['med' 'small' 'big']
safety: 
['med' 'high' 'low']


In [98]:
class_types = df.columns.values[-1]
df[class_types].value_counts()

class
unacc    968
acc      307
good      55
vgood     52
Name: count, dtype: int64

In [99]:
#encode into numbers for model

df_en = pd.get_dummies(df, columns=names, drop_first=True)
df_en = df_en.replace({True: 1, False: 0})
class_mapping = {"unacc": 0,
                 "acc": 1,
                 "good": 2,
                 "vgood": 3}
df_en['class'] = df_en['class'].map(class_mapping)

class_col = df_en['class']
df_en.drop(columns=['class'], inplace=True)
last_col_pos = df_en.columns.get_loc('safety_med') + 1
df_en.insert(last_col_pos, 'class', class_col)
df_en.head()

  df_en = df_en.replace({True: 1, False: 0})


Unnamed: 0,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med,class
0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,1,1
1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0
2,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,1
3,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,2
4,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0


# Training the model
- ... to be updated


In [100]:
"""
# Prepare training data
X = df_en.drop(columns=["class"])  # Features
y = df_en["class"]                 # Target

# Train Gradient Boosting model
best_gb_clf = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=3,
    n_estimators=700,
    subsample=1.0,
    random_state=42
)
best_gb_clf.fit(X, y)

# Process test dataset
if 'df_test' in globals():
    test_ids = df_test["id"]  # Preserve test IDs
    X_test = pd.get_dummies(df_test.drop(columns=["id"]), columns=names, drop_first=True)
    
    # Ensure all features match the training set
    missing_cols = set(X.columns) - set(X_test.columns)
    for col in missing_cols:
        X_test[col] = 0  # Add missing columns with default value 0
    
    X_test = X_test[X.columns]  # Reorder columns to match training data

    # Predict on the test data
    test_preds = best_gb_clf.predict(X_test)

    # Convert numerical predictions back to original class labels
    reverse_class_mapping = {0: "unacc", 1: "acc", 2: "good", 3: "vgood"}
    test_preds = [reverse_class_mapping[pred] for pred in test_preds]

    # Create submission file with correct format
    submission = pd.DataFrame({"id": test_ids, "output": test_preds})
    submission.to_csv("submission.csv", index=False)
    print("Submission file saved as submission.csv")

"""

'\n# Prepare training data\nX = df_en.drop(columns=["class"])  # Features\ny = df_en["class"]                 # Target\n\n# Train Gradient Boosting model\nbest_gb_clf = GradientBoostingClassifier(\n    learning_rate=0.1,\n    max_depth=3,\n    n_estimators=700,\n    subsample=1.0,\n    random_state=42\n)\nbest_gb_clf.fit(X, y)\n\n# Process test dataset\nif \'df_test\' in globals():\n    test_ids = df_test["id"]  # Preserve test IDs\n    X_test = pd.get_dummies(df_test.drop(columns=["id"]), columns=names, drop_first=True)\n    \n    # Ensure all features match the training set\n    missing_cols = set(X.columns) - set(X_test.columns)\n    for col in missing_cols:\n        X_test[col] = 0  # Add missing columns with default value 0\n    \n    X_test = X_test[X.columns]  # Reorder columns to match training data\n\n    # Predict on the test data\n    test_preds = best_gb_clf.predict(X_test)\n\n    # Convert numerical predictions back to original class labels\n    reverse_class_mapping = {0:

In [101]:
# Assume df_en is your one-hot encoded DataFrame with a 'class' target column.
X = df_en.drop(columns=["class"])  # Features
y = df_en["class"]  # Target

# Split data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

#Fine tuning extra trees classifiers

# Set up the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Extra Trees Classifier (More randomness, reduces overfitting)


#    n_estimators=1000,
  #  max_depth=None,
 #   min_samples_split=5,
  #  min_samples_leaf=2,
  #  random_state=42,
  #  n_jobs=-1

# Instantiate and fit the grid search
grid = GridSearchCV(
    ExtraTreesClassifier(), 
    param_grid, 
    cv=5, 
    n_jobs=-1
)
grid.fit(X_train, y_train)


#best model for Extra Trees
et_clf = grid.best_estimator_
# Print the best parameters
print(f"Best parameters: {grid.best_params_}")

# Gradient Boosting with RandomizedSearchCV for better hyperparameter tuning
param_dist = {
    'n_estimators': [600, 1000, 2000],
    'learning_rate': [0.08, 0.05, 0.01],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}

random_search = RandomizedSearchCV(
    estimator = GradientBoostingClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=20,  # Try 20 different settings
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

random_search.fit(X_train, y_train) # Trains the Gradient Boosting Model using random combos of hyperparameters defined in param_dist
best_params = random_search.best_params_
gb_clf = random_search.best_estimator_ # best model for gradient boosting
print("Best Parameters for Gradient Boosting:", best_params)


# Stacking Classifier (Combines multiple models for better accuracy)
stacked_clf = StackingClassifier(
    estimators=[
        ('et', et_clf),
        ('gb', gb_clf),
    ],
    final_estimator=RandomForestClassifier(n_estimators=100,random_state=42),
    n_jobs=-1
)

# Train stacked classifier
stacked_clf.fit(X_train, y_train)

# Evaluate on validation set
val_preds = stacked_clf.predict(X_val)
val_accuracy = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy: {val_accuracy:.4f}")

scores = cross_val_score(stacked_clf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Stacked Classifier CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")



Best parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] END learning_rate=0.05, max_depth=7, min_samples_leaf=5, min_samples_split=5, n_estimators=600, subsample=0.8; total time=   9.0s
[CV] END learning_rate=0.05, max_depth=7, min_samples_leaf=5, min_samples_split=5, n_estimators=600, subsample=0.8; total time=   9.3s
[CV] END learning_rate=0.05, max_depth=7, min_samples_leaf=5, min_samples_split=5, n_estimators=600, subsample=0.8; total time=   9.3s
[CV] END learning_rate=0.05, max_depth=7, min_samples_leaf=5, min_samples_split=5, n_estimators=600, subsample=0.8; total time=   9.5s
[CV] END learning_rate=0.05, max_depth=7, min_samples_leaf=5, min_samples_split=5, n_estimators=600, subsample=0.8; total time=   9.4s
[CV] END learning_rate=0.05, max_depth=7, min_samples_leaf=5, min_samples_split=5, n_estimators=600, subsample=0.8; total time=   9.5s
[CV] END learning_rate=0.05, max_depth=7, min_s