In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
#read file
df = pd.read_csv("car_eval_train.csv")
df_test = pd.read_csv("car_eval_test.csv")

In [10]:
df_test.head()

Unnamed: 0,id,buying,maint,doors,persons,lug_boot,safety
0,0,low,vhigh,3,more,big,high
1,1,med,med,4,2,big,high
2,2,med,low,2,more,big,high
3,3,med,low,3,4,small,med
4,4,high,low,2,4,med,low


In [11]:
names = list(df.columns.values[:-1])
print(names)

#identifying columns   
for item in names:
    uniqueVals = df[item].unique()
    print(item+": ")
    print(uniqueVals)



['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
buying: 
['med' 'low' 'vhigh' 'high']
maint: 
['med' 'high' 'low' 'vhigh']
doors: 
['3' '5more' '2' '4']
persons: 
['4' '2' 'more']
lug_boot: 
['med' 'small' 'big']
safety: 
['med' 'high' 'low']


In [12]:
class_types = df.columns.values[-1]
df[class_types].value_counts()

class
unacc    968
acc      307
good      55
vgood     52
Name: count, dtype: int64

In [13]:
#encode into numbers for model

df_en = pd.get_dummies(df, columns=names, drop_first=True)
df_en = df_en.replace({True: 1, False: 0})
class_mapping = {"unacc": 0,
                 "acc": 1,
                 "good": 2,
                 "vgood": 3}
df_en['class'] = df_en['class'].map(class_mapping)

class_col = df_en['class']
df_en.drop(columns=['class'], inplace=True)
last_col_pos = df_en.columns.get_loc('safety_med') + 1
df_en.insert(last_col_pos, 'class', class_col)
df_en.head()

  df_en = df_en.replace({True: 1, False: 0})


Unnamed: 0,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med,class
0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,1,1
1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0
2,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,1
3,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,2
4,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0


# Training the model
- Random Forest

In [16]:
# Prepare training data
X = df_en.drop(columns=["class"])  # Features
y = df_en["class"]                 # Target

# Train Gradient Boosting model
best_gb_clf = GradientBoostingClassifier(
    learning_rate=0.2,
    max_depth=3,
    n_estimators=700,
    subsample=1.0,
    random_state=42
)
best_gb_clf.fit(X, y)

# Process test dataset
if 'df_test' in globals():
    test_ids = df_test["id"]  # Preserve test IDs
    X_test = pd.get_dummies(df_test.drop(columns=["id"]), columns=names, drop_first=True)
    
    # Ensure all features match the training set
    missing_cols = set(X.columns) - set(X_test.columns)
    for col in missing_cols:
        X_test[col] = 0  # Add missing columns with default value 0
    
    X_test = X_test[X.columns]  # Reorder columns to match training data

    # Predict on the test data
    test_preds = best_gb_clf.predict(X_test)

    # Convert numerical predictions back to original class labels
    reverse_class_mapping = {0: "unacc", 1: "acc", 2: "good", 3: "vgood"}
    test_preds = [reverse_class_mapping[pred] for pred in test_preds]

    # Create submission file with correct format
    submission = pd.DataFrame({"id": test_ids, "output": test_preds})
    submission.to_csv("submission.csv", index=False)
    print("Submission file saved as submission.csv")



Submission file saved as submission.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

# Assume df_en is your one-hot encoded DataFrame with a 'class' target column.
X = df_en.drop(columns=["class"])  # Features
y = df_en["class"]                 # Target

# Split data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.0001, random_state=42)

# Define a parameter grid for tuning:
param_grid = {
    'n_estimators': [300, 500, 700],      # Number of trees
    'learning_rate': [0.01, 0.05, 0.1],     # Contribution of each tree
    'max_depth': [3, 5, 7],                 # Maximum depth of each tree
    'subsample': [0.8, 0.9, 1.0]            # Fraction of samples for fitting individual trees
}

# Initialize the Gradient Boosting Classifier with a fixed random_state for reproducibility.
gb_clf = GradientBoostingClassifier(random_state=42)

# Use GridSearchCV to search for the best parameter combination.
grid_search = GridSearchCV(
    estimator=gb_clf,
    param_grid=param_grid,
    cv=5,                    # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)

# Retrieve the best estimator from grid search.
best_gb_clf = grid_search.best_estimator_

# Optionally, evaluate with 5-fold cross-validation on the training set.
cv_scores = cross_val_score(best_gb_clf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")

# Train the best model on the full training set.
best_gb_clf.fit(X_train, y_train)

# Evaluate the model on the validation set.
val_preds = best_gb_clf.predict(X_val)
val_accuracy = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# If you have a test DataFrame (df_test) with an 'id' column, generate a submission file.
if 'df_test' in globals():
    # Make sure df_test is preprocessed in the same way as df_en.
    test_preds = best_gb_clf.predict(df_test.drop(columns=["id"]))
    submission = pd.DataFrame({
        "id": df_test["id"],
        "class": test_preds
    })
    submission.to_csv("submission.csv", index=False)
    print("Submission file saved as submission.csv")

