In [1]:
import warnings
# Suppress the FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning)

# Import Libraries

In [2]:
# Import essential libraries for data processing, machine learning, visualization, and evaluation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Import Dataset

In [3]:
# Load the dataset
df = pd.read_csv('./Car_Details.csv')

# Preview dataset
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [4]:
print(df.columns)

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')


In [5]:
print(df.shape ,"\n")

print(df.info())

(4340, 8) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB
None


In [6]:
df['selling_price'].max()

8900000

In [7]:
df['selling_price'].describe()

count    4.340000e+03
mean     5.041273e+05
std      5.785487e+05
min      2.000000e+04
25%      2.087498e+05
50%      3.500000e+05
75%      6.000000e+05
max      8.900000e+06
Name: selling_price, dtype: float64

In [8]:
df['selling_price'].min() 

20000

In [9]:
# One-liner using nested if-else to create the 'selling_price_range' column
df['selling_price_range'] = df['selling_price'].apply(
    lambda x: 'Less than 100000' if x <= 100000 else
              '100001 - 1000000' if x <= 1000000 else
              '1000001 - 2000000' if x <= 2000000 else
              '2000001 - 3000000' if x <= 3000000 else
              '3000001 - 4000000' if x <= 4000000 else
              '4000001 - 5000000' if x <= 5000000 else
              '5000001 - 6000000' if x <= 6000000 else
              '6000001 - 7000000' if x <= 7000000 else
              '7000001 - 8000000' if x <= 8000000 else
              '8000001 - 9000000'
)

# Display the first few rows of the updated DataFrame with the new classification column
print(df.head())

                       name  year  selling_price  km_driven    fuel  \
0             Maruti 800 AC  2007          60000      70000  Petrol   
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol   
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel   
3    Datsun RediGO T Option  2017         250000      46000  Petrol   
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel   

  seller_type transmission         owner selling_price_range  
0  Individual       Manual   First Owner    Less than 100000  
1  Individual       Manual   First Owner    100001 - 1000000  
2  Individual       Manual   First Owner    100001 - 1000000  
3  Individual       Manual   First Owner    100001 - 1000000  
4  Individual       Manual  Second Owner    100001 - 1000000  


# PreProcessing

In [10]:
## Unique Values and their counts in 'Fuel Type'
print(df['fuel'].value_counts())

print("\n ============================= \n")

## Unique Values and their counts in 'Seller Type'
print(df['seller_type'].value_counts())

print("\n ============================= \n")

## Unique Values and their counts in 'Transmission Type'
print(df['transmission'].value_counts())

print("\n ============================= \n")

## Unique Values and their counts in 'Owner Type'
print(df['owner'].value_counts())

print("\n ============================= \n")

## Unique Values and their counts in 'Owner Type'
print(df['selling_price_range'].value_counts())

print("\n ============================= \n")


fuel
Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: count, dtype: int64


seller_type
Individual          3244
Dealer               994
Trustmark Dealer     102
Name: count, dtype: int64


transmission
Manual       3892
Automatic     448
Name: count, dtype: int64


owner
First Owner             2832
Second Owner            1106
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: count, dtype: int64


selling_price_range
100001 - 1000000     3619
Less than 100000      379
1000001 - 2000000     245
2000001 - 3000000      48
3000001 - 4000000      26
4000001 - 5000000      20
8000001 - 9000000       2
5000001 - 6000000       1
Name: count, dtype: int64




## Convert Categorical features to Numerical Values

| Features | Values : replacements |
| -------- | --------------------- |
| **Fuel** |  Diesel : 1   ;   Petrol : 2   ;   CNG : 3   ;   LPG : 4   ;   Electric : 5  |
| **Seller_Type** | Individual : 1   ;   Dealer : 2   ;   Trustmark Dealer : 3 |
| **Transmission** | Manual : 1   ;   Automatic : 2 |
| **Owner** | First Owner : 1   ;   Second Owner : 2   ;   Third Owner : 3   ;   Fourth & Above Owner : 4   ;   Test Drive Car: 5 |
| **Selling_price_range** | Less than 100000 : 1 ; 100001 - 1000000 : 2 ; 1000001 - 2000000 : 3 ; 2000001 - 3000000 : 4 ; 3000001 - 4000000 : 5 ; 4000001 - 5000000 : 6 ; 5000001 - 6000000 : 7 ; 6000001 - 7000000 : 8 ; 7000001 - 8000000 : 9 ; 8000001 - 9000000 : 10 |


In [11]:
print("=================================================")

# Conversion of 'fuel' column from categorical to numerical using a mapping dictionary
mapping = {'Diesel': 1, 'Petrol': 2, 'CNG': 3, 'LPG': 4, 'Electric': 5}
df['fuel'] = df['fuel'].map(mapping)

# Printing the conversion
print("Converted 'fuel' column:")
print(df['fuel'].value_counts())
print("=================================================")

# -------------------------------------------------------------------------------------

# Conversion of 'fuel' column from categorical to numerical using a mapping dictionary
mapping = {'Individual': 1, 'Dealer': 2, 'Trustmark Dealer': 3}
df['seller_type'] = df['seller_type'].map(mapping)

# Printing the conversion
print("Converted 'seller_type' column:")
print(df['seller_type'].value_counts())
print("=================================================")

# -------------------------------------------------------------------------------------

# Conversion of 'fuel' column from categorical to numerical using a mapping dictionary
mapping = {'Manual': 1, 'Automatic': 2}
df['transmission'] = df['transmission'].map(mapping)

# Printing the conversion
print("Converted 'transmission' column:")
print(df['transmission'].value_counts())
print("=================================================")

# -------------------------------------------------------------------------------------

# Conversion of 'fuel' column from categorical to numerical using a mapping dictionary
mapping = {'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3,  'Fourth & Above Owner': 4, 'Test Drive Car': 5 }
df['owner'] = df['owner'].map(mapping)

# Printing the conversion
print("Converted 'owner' column:")
print(df['owner'].value_counts())
print("=================================================")

# -------------------------------------------------------------------------------------

# Conversion of 'fuel' column from categorical to numerical using a mapping dictionary
mapping = {
    'Less than 100000' : 1 ,
    '100001 - 1000000' : 2 , 
    '1000001 - 2000000' : 3 , 
    '2000001 - 3000000' : 4 , 
    '3000001 - 4000000' : 5 , 
    '4000001 - 5000000' : 6 , 
    '5000001 - 6000000' : 7 , 
    '6000001 - 7000000' : 8 , 
    '7000001 - 8000000' : 9 , 
    '8000001 - 9000000' : 10 ,
}
df['selling_price_range'] = df['selling_price_range'].map(mapping)

# Printing the conversion
print("Converted 'selling_price_range' column:")
print(df['selling_price_range'].value_counts())
print("=================================================")



 

Converted 'fuel' column:
fuel
1    2153
2    2123
3      40
4      23
5       1
Name: count, dtype: int64
Converted 'seller_type' column:
seller_type
1    3244
2     994
3     102
Name: count, dtype: int64
Converted 'transmission' column:
transmission
1    3892
2     448
Name: count, dtype: int64
Converted 'owner' column:
owner
1    2832
2    1106
3     304
4      81
5      17
Name: count, dtype: int64
Converted 'selling_price_range' column:
selling_price_range
2     3619
1      379
3      245
4       48
5       26
6       20
10       2
7        1
Name: count, dtype: int64


In [12]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,selling_price_range
0,Maruti 800 AC,2007,60000,70000,2,1,1,1,1
1,Maruti Wagon R LXI Minor,2007,135000,50000,2,1,1,1,2
2,Hyundai Verna 1.6 SX,2012,600000,100000,1,1,1,1,2
3,Datsun RediGO T Option,2017,250000,46000,2,1,1,1,2
4,Honda Amaze VX i-DTEC,2014,450000,141000,1,1,1,2,2


# Selecting Features (Inputs)

In [13]:
# Features(X) 
X = df[['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']]

In [14]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   year          4340 non-null   int64
 1   km_driven     4340 non-null   int64
 2   fuel          4340 non-null   int64
 3   seller_type   4340 non-null   int64
 4   transmission  4340 non-null   int64
 5   owner         4340 non-null   int64
dtypes: int64(6)
memory usage: 203.6 KB


# Selecting Label (Output)

In [15]:
Y = df['selling_price'] 
Y.head()

0     60000
1    135000
2    600000
3    250000
4    450000
Name: selling_price, dtype: int64

In [16]:
Y_classification = df['selling_price_range'] 
Y_classification.head()

0    1
1    2
2    2
3    2
4    2
Name: selling_price_range, dtype: int64

## Normalize Data

In [17]:
from sklearn import preprocessing

X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[-1.44507431,  0.08113906,  0.86482829, -0.5555905 , -0.33927557,
        -0.63031847],
       [-1.44507431, -0.3476891 ,  0.86482829, -0.5555905 , -0.33927557,
        -0.63031847],
       [-0.2587948 ,  0.7243813 , -0.95365755, -0.5555905 , -0.33927557,
        -0.63031847],
       [ 0.92748471, -0.43345473,  0.86482829, -0.5555905 , -0.33927557,
        -0.63031847],
       [ 0.215717  ,  1.60347903, -0.95365755, -0.5555905 , -0.33927557,
         0.7205863 ]])

In [18]:
from sklearn.preprocessing import LabelEncoder

# Assuming y is your target label
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(Y)
y[0:5]

array([ 17,  60, 236, 114, 193], dtype=int64)

In [19]:
from sklearn.preprocessing import LabelEncoder

# Assuming y is your target label
label_encoder = LabelEncoder()
y_classification = label_encoder.fit_transform(Y_classification)
y_classification[0:5]

array([0, 1, 1, 1, 1], dtype=int64)

## Split the Data into Training and Testing Set

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, 
                                                    y, 
                                                    test_size = 0.2,
                                                    random_state = 10)
print ('Train set:', X_train.shape, y_train.shape)
print ('Test set:', X_test.shape, y_test.shape)

Train set: (3472, 6) (3472,)
Test set: (868, 6) (868,)


In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_classification_train, y_classification_test = train_test_split( X, 
                                                    y_classification, 
                                                    test_size = 0.2,
                                                    random_state = 10)
print ('Train set:', X_train.shape, y_classification_train.shape)
print ('Test set:', X_test.shape, y_classification_test.shape)

Train set: (3472, 6) (3472,)
Test set: (868, 6) (868,)


# Model Building

# Evaluating the Best Model

## 1. Regression

**Algorithms I'm using**

A) Linear Regression 

B) Random Forest Regressor

C) Gradient Boosting Regressor

D) XGBoost Regressor

In [22]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Dictionary to store the evaluation results
regression_results = {}

# Function to evaluate a regression model
def evaluate_regression_model(model, param_grid, X_train, X_test, y_train, y_test, model_name):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    
    # Store results
    regression_results[model_name] = {
        'Best Params': grid_search.best_params_,
        'R-squared Score': r2
    }
    print(f"{model_name} Best Params: {grid_search.best_params_}")
    print(f"{model_name} R-squared Score: {r2}")

# Example parameters for regression models
X_train, X_test, y_train, y_test = train_test_split( X, 
                                                    y, 
                                                    test_size = 0.2,
                                                    random_state = 10)
print ('Train set:', X_train.shape, y_train.shape)
print ('Test set:', X_test.shape, y_test.shape)

# Linear Regression
evaluate_regression_model(
    LinearRegression(), 
    param_grid={'fit_intercept': [True, False]},
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
    model_name='Linear Regression'
)

# Random Forest Regressor
evaluate_regression_model(
    RandomForestRegressor(), 
    param_grid={'n_estimators': [50, 100], 'max_depth': [None, 10, 20]},
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
    model_name='Random Forest Regressor'
)

# Gradient Boosting Regressor
evaluate_regression_model(
    GradientBoostingRegressor(), 
    param_grid={'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]},
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
    model_name='Gradient Boosting Regressor'
)

# XGBoost Regressor
evaluate_regression_model(
    XGBRegressor(), 
    param_grid={'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]},
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
    model_name='XGBoost Regressor'
)


Train set: (3472, 6) (3472,)
Test set: (868, 6) (868,)
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Linear Regression Best Params: {'fit_intercept': True}
Linear Regression R-squared Score: 0.6260495361618581
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Random Forest Regressor Best Params: {'max_depth': 10, 'n_estimators': 100}
Random Forest Regressor R-squared Score: 0.6918184367045845
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Gradient Boosting Regressor Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Gradient Boosting Regressor R-squared Score: 0.6979828214528354
Fitting 5 folds for each of 8 candidates, totalling 40 fits
XGBoost Regressor Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
XGBoost Regressor R-squared Score: 0.6846190469114681


## 2. Classification

**Algorithms I'm using**

A) SVM Classifier

B) Decision Tree Classifier

C) Random Forest Classifier

D) XGBoost Classifier

In [23]:
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Dictionary to store the evaluation results
classification_results = {}

# Function to evaluate a classification model
def evaluate_classification_model(model, param_grid, X_train, X_test, y_classification_train, y_classification_test, model_name):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_classification_train)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    f1 = f1_score(y_classification_test, y_pred, average='weighted')
    
    # Store results
    classification_results[model_name] = {
        'Best Params': grid_search.best_params_,
        'F1 Score': f1
    }
    print(f"{model_name} Best Params: {grid_search.best_params_}")
    print(f"{model_name} F1 Score: {f1}")

# Example parameters for classification models
X_train, X_test, y_classification_train, y_classification_test = train_test_split( X, 
                                                    y_classification, 
                                                    test_size = 0.2,
                                                    random_state = 10)
print ('Train set:', X_train.shape, y_classification_train.shape)
print ('Test set:', X_test.shape, y_classification_test.shape)

# SVM Classifier
evaluate_classification_model(
    SVC(), 
    param_grid={'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']},
    X_train=X_train, X_test=X_test, y_classification_train=y_classification_train, y_classification_test=y_classification_test,
    model_name='SVM Classifier'
)

# Decision Tree Classifier
evaluate_classification_model(
    DecisionTreeClassifier(), 
    param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [None, 10, 20]},
    X_train=X_train, X_test=X_test, y_classification_train=y_classification_train, y_classification_test=y_classification_test,
    model_name='Decision Tree Classifier'
)

# Random Forest Classifier
evaluate_classification_model(
    RandomForestClassifier(), 
    param_grid={'n_estimators': [50, 100], 'max_depth': [None, 10, 20]},
    X_train=X_train, X_test=X_test, y_classification_train=y_classification_train, y_classification_test=y_classification_test,
    model_name='Random Forest Classifier'
)

# XGBoost Classifier
evaluate_classification_model(
    XGBClassifier(), 
    param_grid={'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]},
    X_train=X_train, X_test=X_test, y_classification_train=y_classification_train, y_classification_test=y_classification_test,
    model_name='XGBoost Classifier'
)


Train set: (3472, 6) (3472,)
Test set: (868, 6) (868,)
Fitting 5 folds for each of 12 candidates, totalling 60 fits




SVM Classifier Best Params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
SVM Classifier F1 Score: 0.87669526272569
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Decision Tree Classifier Best Params: {'criterion': 'gini', 'max_depth': 10}
Decision Tree Classifier F1 Score: 0.8777478622086794
Fitting 5 folds for each of 6 candidates, totalling 30 fits




Random Forest Classifier Best Params: {'max_depth': 10, 'n_estimators': 100}
Random Forest Classifier F1 Score: 0.8887260767397489
Fitting 5 folds for each of 8 candidates, totalling 40 fits


8 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\HP\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HP\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\HP\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3

XGBoost Classifier Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
XGBoost Classifier F1 Score: 0.8625311777220561


## 3. Clustering :

**Algorithms I'm using**

A) K-Means Clustering

B) Hierarchical Clustering


In [None]:
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import KMeans, AgglomerativeClustering

# Dictionary to store the evaluation results
clustering_results = {}

# Function to evaluate a clustering model
def evaluate_clustering_model(model, X, model_name):
    model.fit(X)
    labels = model.labels_
    
    # Calculate Davies-Bouldin Index
    db_index = davies_bouldin_score(X, labels)
    
    # Store results
    clustering_results[model_name] = {
        'Davies-Bouldin Index': db_index
    }
    print(f"{model_name} Davies-Bouldin Index: {db_index}")

# Example data (X for clustering, no need for train-test split)
X = df[['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']]

# K-Means Clustering
evaluate_clustering_model(
    KMeans(n_clusters=5), 
    X=X, 
    model_name='K-Means Clustering'
)

# Hierarchical Clustering
evaluate_clustering_model(
    AgglomerativeClustering(n_clusters=5), 
    X=X, 
    model_name='Hierarchical Clustering'
)


## Comparing All Models

In [None]:
# Function to compare and find the best model
def compare_models():
    # Compare regression models based on R-squared score
    best_regression_model = max(regression_results.items(), key=lambda x: x[1]['R-squared Score'])
    print(f"Best Regression Model: {best_regression_model[0]} with R-squared Score: {best_regression_model[1]['R-squared Score']}")
    
    # Compare classification models based on F1 Score
    best_classification_model = max(classification_results.items(), key=lambda x: x[1]['F1 Score'])
    print(f"Best Classification Model: {best_classification_model[0]} with F1 Score: {best_classification_model[1]['F1 Score']}")
    
    # Compare clustering models based on Davies-Bouldin Index (lower is better)
    best_clustering_model = min(clustering_results.items(), key=lambda x: x[1]['Davies-Bouldin Index'])
    print(f"Best Clustering Model: {best_clustering_model[0]} with Davies-Bouldin Index: {best_clustering_model[1]['Davies-Bouldin Index']}")
    
    # Store and display all best models
    best_models = {
        'Regression': best_regression_model,
        'Classification': best_classification_model,
        'Clustering': best_clustering_model
    }
    
    return best_models

# Run the comparison
best_models = compare_models()
print("\n\nMost Suitable Models:")
for i in best_models:
    print(best_models[i])


## Save Best Model (Random Forest Classifier)

In [None]:
import joblib

# Assuming `best_model` is your trained model
joblib.dump(best_model, 'best_model.pkl')

print("Model saved successfully!")
