### Import Statement

In [1]:
import os
import sys
import random
from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer

from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.tree import DecisionTreeClassifier

from tqdm.notebook import tqdm

#### Configuration 

In [2]:
# Plot styling in ONE call. sns.set is an alias of sns.set_theme, and every
# call resets the arguments that are not passed back to their defaults, so the
# former rc-only call was silently replacing palette='rainbow' with the
# default palette.
sns.set_theme(
    style='darkgrid',
    context='notebook',
    palette='rainbow',
    rc={'figure.figsize': (4, 3)},
)

# Show long string cells in full instead of truncating them.
pd.set_option("display.max_colwidth", 1000)

# Render scikit-learn estimators as diagrams when a cell displays them.
set_config(display="diagram")

# Seed both global RNGs; seeding only `random` leaves NumPy-based randomness
# unseeded.
random.seed(100)
np.random.seed(100)

# Enable tqdm's pandas integration (progress_apply and related methods).
tqdm.pandas()

#### Data Source

In [3]:
# Location of the car-evaluation data file and of the model output folder.
# NOTE(review): both are absolute, machine-specific paths; consider deriving
# them from a configurable base directory so the notebook runs elsewhere.
datasource = 'D:/codespace/python/datascience-ml-handson/data/car_evaluation/car.data'
modelpath = 'D:/codespace/python/datascience-ml-handson/model'

# Fail early, with a readable message, when the data file is absent.
if not os.path.isfile(datasource):
    raise FileNotFoundError(f'data source {datasource} was not found')

# The file has no header row, so column names are assigned after loading.
# Assigning .columns also raises if the file does not have exactly 7 columns.
dataframe = pd.read_csv(datasource, header=None)

dataframe.columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

print(f'datafame shape {dataframe.shape}')

dataframe.head()

datafame shape (1728, 7)


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [4]:
# Print the level frequencies of each listed column as a plain dict.
# NOTE(review): 'safety' is not part of this list - confirm that the omission
# is intentional.
for column_name in ('buying', 'maint', 'doors', 'persons', 'lug_boot', 'class'):
    frequencies = dataframe[column_name].value_counts()
    pprint(frequencies.to_dict())

{'high': 432, 'low': 432, 'med': 432, 'vhigh': 432}
{'high': 432, 'low': 432, 'med': 432, 'vhigh': 432}
{'2': 432, '3': 432, '4': 432, '5more': 432}
{'2': 576, '4': 576, 'more': 576}
{'big': 576, 'med': 576, 'small': 576}
{'acc': 384, 'good': 69, 'unacc': 1210, 'vgood': 65}


In [5]:
# 'doors' and 'persons' are kept as strings: they contain the non-numeric
# levels '5more' and 'more', so a plain integer cast is not possible.
features_column = [
    'buying',
    'maint',
    'doors',
    'persons',
    'lug_boot',
    'safety',
]
target_column = 'class'

# Column dtypes and non-null counts.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [6]:
# Replace the string class labels with integer codes.
# The fitted encoder is kept in `label_encoder` (it used to be thrown away) so
# that the code -> label mapping stays available through
# `label_encoder.classes_` and `label_encoder.inverse_transform(...)`.
# NOTE: re-running this cell on the already encoded column refits the encoder
# on integers and loses the original labels; reload the data first.
label_encoder = LabelEncoder()
dataframe['class'] = label_encoder.fit_transform(dataframe['class'])
dataframe.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,2
1,vhigh,vhigh,2,2,small,med,2
2,vhigh,vhigh,2,2,small,high,2
3,vhigh,vhigh,2,2,med,low,2
4,vhigh,vhigh,2,2,med,med,2


In [20]:
# Split the frame into the feature matrix and the target, reusing the column
# names declared once in `features_column` / `target_column` instead of
# repeating the literals here.
X_data = dataframe[features_column]
# Double brackets keep the target as a one-column DataFrame, shape (n, 1).
Y_data = dataframe[[target_column]]

print(f'X_data shape {X_data.shape} , Y_data Shape {Y_data.shape}')


X_data shape (1728, 6) , Y_data Shape (1728, 1)


In [8]:
def strip_then_lower_case(data):
    """Return ``data`` without surrounding whitespace and in lower case.

    ``data`` must provide ``strip()`` whose result provides ``lower()``
    (i.e. a string).
    """
    trimmed = data.strip()
    return trimmed.lower()

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts a fixed set of columns from its input.

    Parameters
    ----------
    attribute_names : list
        Column labels used to index the input in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names].values`` (the selected columns)."""
        selected = X[self.attribute_names]
        return selected.values


# Preprocessing applied to the feature columns, in order:
#   select columns -> fill missing values with the most frequent level
#   -> strip/lower-case every cell -> ordinal-encode the categories.
# The column list comes from `features_column` rather than being repeated.
# NOTE(review): the ColumnTransformer below already passes only these columns
# to this pipeline, so the 'selector' step looks redundant; it is kept so
# existing parameter paths remain valid.
column_preprocessor = Pipeline(
    steps=[
        ('selector', DataFrameSelector(features_column)),
        ("imputer", SimpleImputer(strategy='most_frequent')),
        (
            "toLower",
            FunctionTransformer(
                # np.vectorize applies the scalar helper to every array cell.
                np.vectorize(strip_then_lower_case),
                validate=False,
            ),
        ),
        ("encoder", OrdinalEncoder()),
    ]
)

# Route the feature columns through the preprocessor; every other column
# (including the target) is dropped.
column_transformers = ColumnTransformer(
    transformers=[
        ("x_trf", column_preprocessor, features_column),
    ],
    remainder="drop",
)

# Full model: preprocessing followed by a decision tree with a fixed seed.
# NOTE(review): the step name "calssifier" is misspelled, but grid-search
# parameter keys are built from it, so it must be renamed everywhere at once.
pipe = Pipeline(
    steps=[
        ("trf", column_transformers),
        ("calssifier", DecisionTreeClassifier(random_state=1024)),
    ]
)

pipe

In [9]:
# Fit the whole pipeline (preprocessing + decision tree) on all rows.
# Pipeline.fit returns the fitted pipeline itself, so - despite its name -
# this variable holds the fitted model, not transformed data.
# The 'class' column still present in `dataframe` is discarded by the
# ColumnTransformer's remainder="drop".
pre_processed_data = pipe.fit(X=dataframe, y=Y_data)


In [10]:
# Candidate hyper-parameters for the tree. Keys use the
# '<step name>__<parameter>' form and must match the pipeline's
# "calssifier" step name exactly.
param_grid = {
    'calssifier__ccp_alpha': [0.1, 0.01, 0.001],
    'calssifier__max_depth': list(range(2, 10)),
    'calssifier__criterion': ['gini', 'entropy'],
}

# 3 * 8 * 2 = 48 candidates, each scored with 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    verbose=True,
)

grid_search.fit(dataframe, Y_data)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [22]:
# Report the winning hyper-parameters with labels that name what is printed
# (the earlier label "Best Number Of Components" did not describe ccp_alpha,
# and max_depth was printed without any label).
best_params = grid_search.best_params_
print("Best ccp_alpha:", best_params["calssifier__ccp_alpha"])
print("Best max_depth:", best_params["calssifier__max_depth"])

# best_estimator_ is the pipeline configured with the best parameters found
# by the search.
final_model = grid_search.best_estimator_
final_model

Best Number Of Components: 0.001
9
