In [1]:
import pandas as pd
import numpy as np
import sys
import os

In [2]:
# Make the project root (the parent of the working directory) importable so
# that the `src` package resolves.  The membership check keeps the cell
# idempotent: re-running it does not stack duplicate entries on sys.path.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
# The file has no header row.  With header=None pandas does not consume the
# first record as column names and instead labels the columns 0, 1, 2, ...
raw_data = pd.read_csv(
    "../data/car_evaluation/car.data",
    header=None,
)

In [4]:
# Replace the positional labels with descriptive names for the seven columns;
# the last one ("class") is the label column.
raw_data.columns = [
    "buying",
    "maint",
    "doors",
    "persons",
    "lug_boot",
    "safety",
    "class",
]

In [5]:
# Inspect the labelled frame; pandas abbreviates the rendering of long frames
# to the leading and trailing rows.
raw_data

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [6]:
from src.preprocess import split_feature_target ,encode_and_split
from src.train import (train_logistics,train_svm,train_decision_tree,train_random_forest)
from src.evaluate import evaluate_model

In [7]:
# Plain assignment: df is a second name for the same DataFrame object as
# raw_data, not a copy.
df=raw_data

In [8]:
# Preprocess: separate the predictors from the target, then encode and split
# into train/test parts.  The fitted encoder is kept alongside the splits.
# NOTE(review): the encoding scheme and split ratio are defined in
# src.preprocess and are not visible here -- confirm there.
x, y = split_feature_target(df)
(x_train, x_test, y_train, y_test, encoder) = encode_and_split(x, y)

In [9]:
def main():
    """Train the four baseline classifiers and rank them on the held-out split.

    Reads ``x_train``, ``y_train``, ``x_test`` and ``y_test`` from the notebook
    namespace, so the preprocessing cell must have been executed first.  All
    models are fitted before any of them is evaluated, in the order logistic
    regression, SVM, decision tree, random forest.

    Returns:
        pandas.DataFrame: one row per model, built from whatever
        ``evaluate_model`` returns for it, sorted by ``macro_f1`` in descending
        order.  The same table is printed under a "Model Ranking" banner.
    """
    trainers = [
        ("Logistic Regression", train_logistics),
        ("SVM", train_svm),
        ("Decision Tree", train_decision_tree),
        ("Random Forest", train_random_forest),
    ]

    # Fit every model first, then score them, preserving the call order.
    fitted = [(label, train(x_train, y_train)) for label, train in trainers]
    results = [
        evaluate_model(model, x_test, y_test, label) for label, model in fitted
    ]

    ranking = pd.DataFrame(results).sort_values(by="macro_f1", ascending=False)
    print("\n===== Model Ranking =====")
    print(ranking)
    return ranking


if __name__ == "__main__":
    main()


===== Model Ranking =====
                 model  accuracy  macro_f1  macro_precision  macro_recall
3        Random Forest  0.985549  0.971130         0.988327      0.955386
2        Decision Tree  0.976879  0.966746         0.960148      0.976240
1                  SVM  0.973988  0.944619         0.941159      0.949880
0  Logistic Regression  0.901734  0.787257         0.803379      0.775293


In [10]:
from src.train import (
    tune_logistic,
    tune_svm,
    tune_decision_tree,
    tune_random_forest
)


In [11]:
# Run the tuning routine for each model family on the training split.
# Each tune_* helper returns a (model, score) pair; the score is reported
# downstream under the "best_cv_macro_f1" column.
# NOTE(review): the search space and cross-validation scheme live in
# src.train and are not visible here -- confirm there.
log_model, log_cv_score = tune_logistic(x_train, y_train)
svm_model, svm_cv_score = tune_svm(x_train, y_train)
tree_model, tree_cv_score = tune_decision_tree(x_train, y_train)
rf_model, rf_cv_score = tune_random_forest(x_train, y_train)


In [12]:
# Collect the best score reported by each tuning run and rank the models by it.
# NOTE(review): in the recorded run Random Forest's best CV macro-F1 (~0.84) is
# well below its untuned test macro-F1 (~0.97) -- worth checking the search
# grid used by tune_random_forest.
tuning_results = (
    pd.DataFrame(
        {
            "model": [
                "Logistic Regression",
                "SVM",
                "Decision Tree",
                "Random Forest",
            ],
            "best_cv_macro_f1": [
                log_cv_score,
                svm_cv_score,
                tree_cv_score,
                rf_cv_score,
            ],
        }
    )
    .sort_values(by="best_cv_macro_f1", ascending=False)
    .reset_index(drop=True)
)

display(tuning_results)


Unnamed: 0,model,best_cv_macro_f1
0,SVM,0.993671
1,Decision Tree,0.941528
2,Logistic Regression,0.8627
3,Random Forest,0.843538
