# Title

#### Subtitle

In [21]:
# imports

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
GridSearchCV,
RandomizedSearchCV,
cross_validate,
train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

## Summary

## Introduction

## Methods





#### Data

In [2]:
# Load the raw Car Evaluation data set.
# Source: https://archive.ics.uci.edu/dataset/19/car+evaluation
# The file is read with header=None, so the column names are supplied here.

colnames = [
    'buying',
    'maint',
    'doors',
    'persons',
    'lug_boot',
    'safety',
    'class',
]
car_data = pd.read_csv(
    '../data/raw/car.data',
    header=None,
    names=colnames,
)

car_data

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [3]:
# Split the raw data 80/20 into train and test sets, stratified on the target
# column so both splits keep the same class proportions, then save each split.
np.random.seed(522)

car_train, car_test = train_test_split(
    car_data,
    train_size=0.8,
    stratify=car_data['class'],
    random_state=522,
)

car_train.to_csv('../data/processed/car_train.csv')
car_test.to_csv('../data/processed/car_test.csv')

#### Preprocessing

In [4]:
# preprocessing

# Encode every categorical feature as ordered integers. Each `categories` list
# spells out the order explicitly, so the integer codes follow that order
# rather than alphabetical order.
car_preprocessor = make_column_transformer(
    (OrdinalEncoder(categories=[['low','med','high','vhigh']]), ['buying']),
    (OrdinalEncoder(categories=[['low','med','high','vhigh']]), ['maint']),
    (OrdinalEncoder(categories=[['2','3','4','5more']]), ['doors']),
    (OrdinalEncoder(categories=[['2','4','more']]), ['persons']),
    (OrdinalEncoder(categories=[['small','med','big']]), ['lug_boot']),
    (OrdinalEncoder(categories=[['low','med','high']]), ['safety']),
    # Columns not listed above (here: 'class') are passed through unchanged.
    remainder='passthrough',
    # Keep the original column names instead of prefixing the transformer name.
    verbose_feature_names_out=False
)

# Fit on the training split only, then apply the same encoding to both splits.
car_preprocessor.fit(car_train)
encoded_car_train = car_preprocessor.transform(car_train)
encoded_car_test = car_preprocessor.transform(car_test)

# The transformer returns plain arrays; rebuild labelled DataFrames from them.
names = car_preprocessor.get_feature_names_out()
encoded_car_train = pd.DataFrame(encoded_car_train, columns=names)
encoded_car_test = pd.DataFrame(encoded_car_test, columns=names)

# Write each split to its own file so the test set does not overwrite the
# train set on disk.
encoded_car_train.to_csv('../data/encoded_car_train.csv')
encoded_car_test.to_csv('../data/encoded_car_test.csv')

In [5]:
# EDA: bar chart of record counts per encoded `lug_boot` level, coloured by
# the target class.
import altair as alt

lug_boot_chart = (
    alt.Chart(encoded_car_train)
    .mark_bar()
    .encode(
        x=alt.X('lug_boot'),
        y='count()',
        color=alt.Color('class'),
    )
)
lug_boot_chart

# TODO: draft of a repeated chart with one panel per categorical column
# (left disabled, exactly as written):
# alt.Chart(encoded_car_train).mark_histogram().encode(
#      alt.Y(alt.repeat('column')).type('quantitative'),
#     color='class'
# ).properties(
#     width=200,
#     height=200
# ).repeat(
#     column=['buying', 'maint', 'doors','persons','lug_boot','safety']
# )

In [8]:
# Separate the feature columns from the target column ('class') for each split.
X_train = car_train.drop(columns=['class'])
y_train = car_train['class']

X_test = car_test.drop(columns=['class'])
y_test = car_test['class']

#### Analysis

Before we jump into the analysis, we outline our plan. Since we already have `X_train`, `y_train`, `X_test`, and `y_test`, the next step is to choose the best classifier. First, let's put all the candidate classifiers into a dictionary.

In [19]:
# Candidate classifiers, keyed by the display name used in the results table.
# Every estimator that accepts a seed is given random_state=123.
models = {
    # Baseline to compare the real models against.
    "dummy": DummyClassifier(random_state=123),
    # Tree depth is capped at 5.
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=123),
    "KNN": KNeighborsClassifier(),
    "RBF SVM": SVC(random_state=123),
    "Naive Bayes": MultinomialNB(),
    # max_iter is raised to 2000 for the solver.
    "Logistic Regression": LogisticRegression(random_state=123, max_iter=2000),
}

Next, we apply cross-validation to every model to find out which one performs best.

In [20]:
# Run 5-fold cross-validation for every candidate model, each wrapped in a
# pipeline with the shared preprocessor, and collect the mean and standard
# deviation of the train and test scores into one summary table.
cv_results = []
for model_name, model in models.items():
    pipe = make_pipeline(car_preprocessor, model)
    scores = cross_validate(
        pipe,
        X_train,
        y_train,
        cv=5,
        n_jobs=-1,
        return_train_score=True,
    )
    train_scores = scores['train_score']
    test_scores = scores['test_score']
    summary_row = {
        "Model": model_name,
        "Mean Train Score": train_scores.mean(),
        "Std Train Score": train_scores.std(),
        "Mean Test Score": test_scores.mean(),
        "Std Test Score": test_scores.std(),
    }
    cv_results.append(summary_row)

cv_results_df = pd.DataFrame(cv_results)
cv_results_df


Unnamed: 0,Model,Mean Train Score,Std Train Score,Mean Test Score,Std Test Score
0,dummy,0.700434,0.00033,0.700434,0.001324
1,Decision Tree,0.872649,0.003011,0.855295,0.017305
2,KNN,0.96961,0.002833,0.942848,0.010523
3,RBF SVM,0.971239,0.00358,0.95226,0.018302
4,Naive Bayes,0.711288,0.00172,0.70767,0.003501
5,Logistic Regression,0.838458,0.006947,0.833584,0.018662


The cross-validation results above show the mean train and test (validation) scores for each model. RBF SVM has the highest mean test score, suggesting it is the most accurate model on unseen data. Therefore, we choose RBF SVM as our best model.

Next, we tune the hyperparameters of the chosen model using a randomized search.

In [27]:
# Search space for the SVC step of the pipeline: powers of ten from 1e-5 to
# 1e4 for both gamma and C, i.e. 10 x 10 = 100 combinations. The "svc__"
# prefix routes each parameter to the SVC step created by make_pipeline.
param_grid = {
    "svc__gamma": 10.0 ** np.arange(-5, 5, 1),
    "svc__C": 10.0 ** np.arange(-5, 5, 1)
}
svc_pipe = make_pipeline(car_preprocessor, SVC(random_state=123))

# random_state is fixed so the order in which candidates are sampled is the
# same on every run, matching the seeded estimators used elsewhere.
# NOTE(review): n_iter=100 equals the number of combinations above, so the
# search appears to cover the whole grid; lower n_iter for a cheaper search.
random_search = RandomizedSearchCV(
    svc_pipe,
    param_distributions=param_grid,
    n_iter=100,
    n_jobs=-1,
    return_train_score=True,
    random_state=123,
)
random_search.fit(X_train, y_train)

## Results & Discussion

## References

In [29]:
# Tabulate the per-candidate cross-validation results of the hyperparameter
# search (fit/score times, sampled parameters, per-split and mean scores, rank).
pd.DataFrame(random_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__gamma,param_svc__C,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.134107,0.007397,0.074012,0.005648,0.00001,0.00001,"{'svc__gamma': 1e-05, 'svc__C': 1e-05}",0.700361,0.700361,0.702899,...,0.700434,0.001324,26,0.700452,0.700452,0.699819,0.700723,0.700723,0.700434,0.00033
1,0.122372,0.009256,0.054228,0.017799,0.0001,0.00001,"{'svc__gamma': 0.0001, 'svc__C': 1e-05}",0.700361,0.700361,0.702899,...,0.700434,0.001324,26,0.700452,0.700452,0.699819,0.700723,0.700723,0.700434,0.00033
2,0.104630,0.004075,0.038698,0.003644,0.001,0.00001,"{'svc__gamma': 0.001, 'svc__C': 1e-05}",0.700361,0.700361,0.702899,...,0.700434,0.001324,26,0.700452,0.700452,0.699819,0.700723,0.700723,0.700434,0.00033
3,0.098189,0.006239,0.045601,0.005651,0.01,0.00001,"{'svc__gamma': 0.01, 'svc__C': 1e-05}",0.700361,0.700361,0.702899,...,0.700434,0.001324,26,0.700452,0.700452,0.699819,0.700723,0.700723,0.700434,0.00033
4,0.102840,0.018634,0.061386,0.006250,0.1,0.00001,"{'svc__gamma': 0.1, 'svc__C': 1e-05}",0.700361,0.700361,0.702899,...,0.700434,0.001324,26,0.700452,0.700452,0.699819,0.700723,0.700723,0.700434,0.00033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.403379,0.006370,0.088072,0.008127,1.0,10000.0,"{'svc__gamma': 1.0, 'svc__C': 10000.0}",0.985560,0.967509,0.978261,...,0.979744,0.008730,2,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.00000
96,0.460897,0.024513,0.093791,0.010817,10.0,10000.0,"{'svc__gamma': 10.0, 'svc__C': 10000.0}",0.700361,0.700361,0.702899,...,0.700434,0.001324,26,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.00000
97,0.454729,0.014131,0.106966,0.003182,100.0,10000.0,"{'svc__gamma': 100.0, 'svc__C': 10000.0}",0.700361,0.700361,0.702899,...,0.700434,0.001324,26,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.00000
98,0.320104,0.025537,0.077932,0.012568,1000.0,10000.0,"{'svc__gamma': 1000.0, 'svc__C': 10000.0}",0.700361,0.700361,0.702899,...,0.700434,0.001324,26,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.00000
