# Import Libraries

In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Import data

In [86]:
df= pd.read_csv('E:/Python Projects/shohreh/Shohreh_GitHub_Repository/Data-Analysis-And-Machine-Learning-Projects/3. Business Intelligence and Sales Analysis/Car_evaluation/car_evaluation.csv', header= None)

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1728 non-null   object
 1   1       1728 non-null   object
 2   2       1728 non-null   object
 3   3       1728 non-null   object
 4   4       1728 non-null   object
 5   5       1728 non-null   object
 6   6       1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [54]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


# Data Review

## Car Evaluation Dataset Overview

The provided dataset, `car_evaluation.csv`, contains evaluations of cars based on various attributes. Each row represents a unique combination of car features and the corresponding evaluation outcome.
#
## **Columns (Features)**
1. **`buying`**: The buying price of the car.  
   - Possible values: `vhigh` (very high), `high`, `med` (medium), `low`  
2. **`maint`**: The maintenance cost of the car.  
   - Possible values: `vhigh`, `high`, `med`, `low`  
3. **`doors`**: The number of doors.  
   - Possible values: `2`, `3`, `4`, `5more` (5 or more)  
4. **`persons`**: The seating capacity.  
   - Possible values: `2`, `4`, `more` (more than 4)  
5. **`lug_boot`**: The size of the luggage boot.  
   - Possible values: `small`, `med` (medium), `big`  
6. **`safety`**: The estimated safety of the car.  
   - Possible values: `low`, `med`, `high`  
7. **`class`**: The evaluation outcome (target variable).  
   - Possible values: `unacc` (unacceptable), `acc` (acceptable), `good`, `vgoforest model to predict car acceptability.

# Data cleaning

In [87]:
# change columns name with appropriate name
# df= df.rename(columns={0: 'buying', 1: 'maint', 2: 'doors', 3: 'persons', 4: 'lug_boot', 5: 'safety', 6: 'class'})
df.columns= ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

In [88]:
df.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'], dtype='object')

In [89]:
pd.DataFrame(df.nunique(), columns=['Number of unique values'])

Unnamed: 0,Number of unique values
buying,4
maint,4
doors,4
persons,3
lug_boot,3
safety,3
class,4


In [21]:
def valuecounts(df, column):
    return pd.DataFrame(df[column].value_counts(), columns=['count'])

In [22]:
for col in df.columns:
    print(valuecounts(df, col))
    print(15*'*')

        count
buying       
vhigh     432
high      432
med       432
low       432
***************
       count
maint       
vhigh    432
high     432
med      432
low      432
***************
       count
doors       
2        432
3        432
4        432
5more    432
***************
         count
persons       
2          576
4          576
more       576
***************
          count
lug_boot       
small       576
med         576
big         576
***************
        count
safety       
low       576
med       576
high      576
***************
       count
class       
unacc   1210
acc      384
good      69
vgood     65
***************


In [23]:
valuecounts(df,'buying')

Unnamed: 0_level_0,count
buying,Unnamed: 1_level_1
vhigh,432
high,432
med,432
low,432


In [90]:
encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        le= LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le

# Data spliting

In [91]:
X= df.drop(['class'], axis=1)
y=df['class']
#X.shape, y.shape
X_train, X_test, y_train, y_test= train_test_split(X, y, train_size= 0.7, random_state= 42)

In [92]:
# # standardization
# scaler= StandardScaler()
# X_train= scaler.fit_transform(X_train)
# X_test= scaler.transform(X_test)

In [93]:
# Normalization
scaler= MinMaxScaler()
X_train= scaler.fit_transform(X_train)
X_test= scaler.transform(X_test)

# Modeling_ DecisionTreeClassifier

In [66]:
model= DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred= model.predict(X_test)
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')
print(f'f1_score: {f1_score(y_test, y_pred, average= "micro")}')
print(f'Classifiction_report: {classification_report(y_test, y_pred, target_names= encoders["class"].classes_)}')

Confusion Matrix: [[111   5   1   1]
 [  1  17   0   1]
 [  4   0 354   0]
 [  3   2   0  19]]
f1_score: 0.9653179190751445
Classifiction_report:               precision    recall  f1-score   support

         acc       0.93      0.94      0.94       118
        good       0.71      0.89      0.79        19
       unacc       1.00      0.99      0.99       358
       vgood       0.90      0.79      0.84        24

    accuracy                           0.97       519
   macro avg       0.89      0.90      0.89       519
weighted avg       0.97      0.97      0.97       519



## Hyperparameters_ DecisionTreeClassifications

In [84]:

model= DecisionTreeClassifier()
param_distribution= {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [2, 5, 10 , 20],
    'min_samples_leaf': [1, 5, 10, 20],
    'max_features': [None, "sqrt", "log2", 0.5, 'auto'],
    'random_state' : [None, 42],
    'max_leaf_nodes': [None, 10, 50],
    'class_weight' : [None, "balanced"]
    
}
random_search= RandomizedSearchCV(model, param_distribution, n_iter= 100, cv= 5, random_state= 42,scoring='accuracy', n_jobs=-1)
random_search.fit(X_train, y_train)
print(random_search.best_params_)

{'splitter': 'random', 'random_state': 42, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_leaf_nodes': None, 'max_features': None, 'max_depth': None, 'criterion': 'entropy', 'class_weight': None}


105 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ebsor\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ebsor\anaconda3\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\ebsor\anaconda3\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ebsor\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParame

In [83]:
model= DecisionTreeClassifier(splitter= 'random',
                            random_state= 42,
                            min_samples_split= 2,
                            min_samples_leaf= 1,
                            max_leaf_nodes= None,
                            max_features= None,
                            max_depth= None,
                            criterion= 'entropy',
                            class_weight= None)
model.fit(X_train, y_train)
y_pred= model.predict(X_test)
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')
print(f'f1_score: {f1_score(y_test, y_pred, average= "micro")}')
print(f'Classifiction_report: {classification_report(y_test, y_pred, target_names= encoders["class"].classes_)}')

Confusion Matrix: [[107  10   0   1]
 [  1  17   0   1]
 [  2   0 356   0]
 [  1   2   0  21]]
f1_score: 0.9653179190751445
Classifiction_report:               precision    recall  f1-score   support

         acc       0.96      0.91      0.93       118
        good       0.59      0.89      0.71        19
       unacc       1.00      0.99      1.00       358
       vgood       0.91      0.88      0.89        24

    accuracy                           0.97       519
   macro avg       0.87      0.92      0.88       519
weighted avg       0.97      0.97      0.97       519



# Modeling _ RandomForestClassifications

In [94]:
model= RandomForestClassifier()
model.fit(X_train, y_train)
y_pred= model.predict(X_test)
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')
print(f'f1_score: {f1_score(y_test, y_pred, average= "micro")}')
print(f'Classifiction_report: {classification_report(y_test, y_pred, target_names= encoders["class"].classes_)}')

Confusion Matrix: [[108   6   3   1]
 [  2  16   0   1]
 [  2   0 356   0]
 [  2   0   0  22]]
f1_score: 0.9672447013487476
Classifiction_report:               precision    recall  f1-score   support

         acc       0.95      0.92      0.93       118
        good       0.73      0.84      0.78        19
       unacc       0.99      0.99      0.99       358
       vgood       0.92      0.92      0.92        24

    accuracy                           0.97       519
   macro avg       0.90      0.92      0.91       519
weighted avg       0.97      0.97      0.97       519



## Hyperparameters _ RandomForestClassifier

In [98]:
model= RandomForestClassifier()
params={
     'n_estimators': [10, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 5, 10, 20],
    'max_features': [None, "sqrt", "log2", 0.5, 'auto'],
    'bootstrap': [True, False],
    'random_state': [42, None],
    'n_jobs': [-1, None],
    'class_weight': [None, "balanced"],
    'oob_score': [True, False]  
}
random_search = RandomizedSearchCV(model, params, n_iter=100, cv= 5, n_jobs= -1,random_state= 42, scoring='accuracy')
random_search.fit(X_train, y_train)
print(f'dest_parameters: {random_search.best_params_}')
print(f'dest_parameters: {random_search.best_params_}')

150 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
53 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ebsor\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ebsor\anaconda3\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\ebsor\anaconda3\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ebsor\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamet

dest_parameters: {'random_state': 42, 'oob_score': False, 'n_jobs': None, 'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'max_depth': None, 'class_weight': 'balanced', 'bootstrap': False}


In [99]:
model= RandomForestClassifier(
n_estimators=100,           # More trees
    max_depth=None,              # Limit tree depth
    min_samples_split=2,       # Require more samples to split
    min_samples_leaf=1,        # Require more samples at leaf
    max_features= 0.5,       # Feature sampling
    bootstrap=False,            # Bootstrap sampling
    random_state=42,           # Reproducibility
    n_jobs=None,                 # Use all cores
    class_weight='balanced',   # Handle imbalanced classes
    oob_score=False            # Out-of-bag scoring
)
model.fit(X_train, y_train)
y_pred= model.predict(X_test)
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')
print(f'f1_score: {f1_score(y_test, y_pred, average= "micro")}')
print(f'Classifiction_report: {classification_report(y_test, y_pred, target_names= encoders["class"].classes_)}')

Confusion Matrix: [[110   6   1   1]
 [  2  16   0   1]
 [  2   0 356   0]
 [  1   0   0  23]]
f1_score: 0.9730250481695568
Classifiction_report:               precision    recall  f1-score   support

         acc       0.96      0.93      0.94       118
        good       0.73      0.84      0.78        19
       unacc       1.00      0.99      1.00       358
       vgood       0.92      0.96      0.94        24

    accuracy                           0.97       519
   macro avg       0.90      0.93      0.91       519
weighted avg       0.97      0.97      0.97       519

