# Car Price prediction v3



### About Dataset
> This dataset contains information about used cars.
> This data can be used for a lot of purposes such as price prediction to exemplify the use of linear regression in Machine Learning.
> The columns in the given dataset are as follows:

+ name
+ year
+ selling_price
+ km_driven
+ fuel
+ seller_type
+ transmission
+ Owner

<br>[Car price](https://www.kaggle.com/datasets/nehalbirla/vehicle-dataset-from-cardekho)



## Démarche de travail en Machine Learning
#### Exploratory Data Analysis : 
> comprendre au maximum les données dont on dispose pour définir une stratégie de modélisation.
Checklist (non-exhaustive)

+ Analyse de forme :
    + Identification de la target
    + Nombre de lignes et de colonnes
    + Types de variables
    + Identification des valeurs manquantes


In [None]:
## Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# import datasets
url = "Car details v3.csv"
car_df = pd.read_csv(url)

car_df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [3]:
# déterminer les types
columns_type = car_df.dtypes
print(columns_type)

name              object
year               int64
selling_price      int64
km_driven          int64
fuel              object
seller_type       object
transmission      object
owner             object
mileage           object
engine            object
max_power         object
torque            object
seats            float64
dtype: object


### Identification de la target

In [None]:
# on crée X et Y
X = car_df.drop(["selling_price"], axis=1)
X_numeric_columns = X.drop(
    ["name", "fuel", "seller_type", "transmission", "owner"], axis=1
)
y = car_df["selling_price"]

In [None]:
# on crée des expressions régulières pour découper torque en 2 colonnes
torque_patterns = [
    r"(\d+\.?\d*)\s*Nm\s*@\s*(\d+\.?\d*)\s*rpm",  # handle 250Nm@ 1500rpm format
    r"(\d+\.?\d*)\s*nm\s*@\s*(\d+\.?\d*)\s*rpm",  # handle 250nm@ 1500rpm format (lowercase 'nm')
    r"(\d+\.?\d*)\s*Nm\s*at\s*(\d+)\s*rpm",  # handle 96 Nm at 3000 rpm format
    r"(\d+\.?\d*)\s*NM\s*@\s*(\d+)\s*rpm",  # handle 151NM@ 4850rpm format (uppercase 'NM')
    r"(\d+\.?\d*)\s*Nm\s*@\s*(\d+)\s*~\s*(\d+)\s*rpm",  # handle 250Nm@ 1500~4500rpm format
    r"(\d+\.?\d*)\s*Nm\s*@\s*(\d+\.?\d*)\s*-\s*(\d+\.?\d*)\s*rpm",  # handle 250Nm@ 1500-2750rpm format
    r"(\d+\.?\d*)\s*@\s*(\d+\.?\d*)\s*-\s*(\d+\.?\d*)\s*rpm",  # handle 250@ 1500-2750rpm format
    r"(\d+\.?\d*)\s*nm\s*@\s*(\d+)\s*-\s*(\d+)\s*rpm",  # handle 190.24nm@ 1750-2250rpm format
    r"(\d+\.?\d*)\s*kgm\s*@\s*(\d+\.?\d*)\s*rpm",  # handle 22.4 kgm@ 1750rpm format
    r"(\d+\.?\d*)\s*kgm\s*at\s*(\d+)\s*rpm",  # handle 11.4 kgm at 4000 rpm format
    r"(\d+\.?\d*)\s*kgm\s*at\s*(\d+,\d*)\s*rpm",  # handle 11.4 kgm at 4,000 rpm format
    r"(\d+\.?\d*)\s*@\s*(\d+\.?\d*)\s*\(kgm\s*@\s*rpm\)",  # handle kgm@rpm format
    r"(\d+\.?\d*)\s*@\s*(\d+,\d*)\s*\(kgm\s*@\s*rpm\)",  # handle 12.7@ 2,700(kgm@ rpm) format
    r"(\d+\.?\d*)\s*kgm\s*at\s*(\d+)\s*-\s*(\d+)\s*rpm",  # handle 22.4 kgm at 1750-2750rpm format
    r"(\d+\.?\d*)\s*@\s*(\d+)\s*-\s*(\d+)\s*\(kgm\s*@\s*rpm\)",  # handle 20.4@ 1400-3400(kgm@ rpm) format
    r"(\d+\.?\d*)\s*@\s*(\d+,\d*)\s*-\s*(\d+,\d*)\s*\(kgm\s*@\s*rpm\)",  # handle 24@ 1,900-2,750(kgm@ rpm) format
]

In [None]:
# Initialiser les colonnes torqueNm et torqueRPM avec NaN
X_numeric_columns["torqueNm"] = float("nan")
X_numeric_columns["torqueRPM"] = float("nan")

# Appliquer chaque pattern et remplir les colonnes torqueNm et torqueRPM
for pattern in torque_patterns:
    extracted = X_numeric_columns["torque"].str.extract(pattern)
    if not extracted.empty:
        if "kgm" in pattern:
            X_numeric_columns["torqueNm"] = X_numeric_columns["torqueNm"].combine_first(
                extracted[0].str.replace(",", "").astype(float) * 9.80665
            )
        else:
            X_numeric_columns["torqueNm"] = X_numeric_columns["torqueNm"].combine_first(
                extracted[0].str.replace(",", "").astype(float)
            )
        if extracted.shape[1] > 1:
            if "-" in pattern or "~" in pattern:
                X_numeric_columns["torqueRPM"] = X_numeric_columns[
                    "torqueRPM"
                ].combine_first(
                    extracted[[1, 2]].max(axis=1).str.replace(",", "").astype(float)
                )
            else:
                X_numeric_columns["torqueRPM"] = X_numeric_columns[
                    "torqueRPM"
                ].combine_first(extracted[1].str.replace(",", "").astype(float))

# Remplacer les NaN par -1
X_numeric_columns["torqueNm"] = X_numeric_columns["torqueNm"].fillna(-1)
X_numeric_columns["torqueRPM"] = X_numeric_columns["torqueRPM"].fillna(-1)

X_numeric_columns = X_numeric_columns.drop(["torque"], axis=1)

In [None]:
# Découper les champs max_power, mileage et engine et remplacer les NaN par -1
X_numeric_columns["max_power"] = (
    X_numeric_columns["max_power"]
    .str.extract(r"(\d+\.?\d*)\s*bhp")
    .astype(float)
    .fillna(-1)
)

X_numeric_columns["engine"] = (
    X_numeric_columns["engine"]
    .str.extract(r"(\d+\.?\d*)\s*CC")
    .astype(float)
    .fillna(-1)
)
X_numeric_columns["seats"] = X_numeric_columns["seats"].fillna(-1)

X_numeric_columns["mileage"] = X_numeric_columns["mileage"].astype(str)
mileage_values = X_numeric_columns["mileage"].str.extract(r"(\d+\.?\d*)\s*(kmpl|kmpkg)")
mileage_values.columns = ["value", "unit"]
mileage_values["value"] = mileage_values["value"].astype(float).fillna(-1)

# Convertir les valeurs en kmpl (kilometer per liter)
# Supposons que 1 kmpkg = 1.4 kmpl
conversion_factor = 1.4
mileage_values["value"] = np.where(
    mileage_values["unit"] == "kmpkg",
    mileage_values["value"] * conversion_factor,
    mileage_values["value"],
)

# Remplacer la colonne mileage dans X_numeric_columns par les valeurs converties
X_numeric_columns["mileage"] = mileage_values["value"]

In [8]:
display(X_numeric_columns)

Unnamed: 0,year,km_driven,mileage,engine,max_power,seats,torqueNm,torqueRPM
0,2014,145500,23.40,1248.0,74.00,5.0,190.000000,2000.0
1,2014,120000,21.14,1498.0,103.52,5.0,250.000000,2500.0
2,2006,140000,17.70,1497.0,78.00,5.0,124.544455,2700.0
3,2010,127000,23.00,1396.0,90.00,5.0,219.668960,2750.0
4,2007,120000,16.10,1298.0,88.20,5.0,112.776475,4500.0
...,...,...,...,...,...,...,...,...
8123,2013,110000,18.50,1197.0,82.85,5.0,113.700000,4000.0
8124,2007,119000,16.80,1493.0,110.00,5.0,235.359600,2750.0
8125,2009,120000,19.30,1248.0,73.90,5.0,190.000000,2000.0
8126,2013,25000,23.57,1396.0,70.00,5.0,140.000000,3000.0


In [9]:
# # Ajouter la colonne cible y au DataFrame X_numeric_columns
# X_numeric_columns["selling_price"] = y
# # Calculer la matrice de corrélation
# correlation_matrix = X_numeric_columns.corr()
# # Extraire la dernière ligne de la matrice de corrélation
# # Extraire la colonne de corrélation avec selling_price et trier par ordre décroissant
# corr_with_target = correlation_matrix["selling_price"].sort_values(ascending=False)
# # Exclure la corrélation de selling_price avec lui-même
# corr_with_target = corr_with_target.drop(labels=["selling_price"])
# display(corr_with_target)
# # Afficher la matrice de corrélation
# plt.figure(figsize=(12, 8))
# sns.heatmap(correlation_matrix[-1:], annot=True, cmap="coolwarm", fmt=".2f")
# plt.title("Matrice de Corrélation")
# plt.show()


#### Preprocessing : 
> transformer le data pour le mettre dans un format propice au ML.

Checklist (non-exhaustive)
+ Preparation des donnees
    + Création du Train / Test sets
    + Elimination des NaN : dropna(), imputation, colonnes « vides »
    + Encodage (var. catégorielles => dummies)
    + Suppression des outliers néfastes au modèle (à faire plutôt après avoir créer un 1e modèle de Machine Learning)
    + Feature Selection (dans un 1e temps, éliminer les var. avec une variance nulle et les variables redondantes)
    + Feature Engineering
    + Feature Scaling (MinMaxScaler/StandardScaler/RobustScaler)
    Modeling : développer un modèle de Machine Learning qui réponde à l’objectif
    final.
Checklist (non-exhaustive)
    + Définir une fonction d’évaluation (que cherche t-on à mesurer ?)
    + Entrainement de différents modèles
    + Optimisation avec GrisSearchCV
    + (Optionnel) Analyse des erreurs et retour au Preprocessing/EDA
    + Learning Curve (savoir si récolter plus de données permettrait d’améliorer notre modèle)
    + Prise de décision

#### Suppression des outliers


### Elimination des Nans

### Feature Importance Analysis

1. Correlation Analysis: check correlation between Features Target
1. Model-Based Featue importance : Use Random Forrest ===> score of features
1. RFE: Deatermine feature importance by recursively removing less importance features

In [10]:
# Supposons que car_df soit votre DataFrame
X = car_df.drop(["selling_price"], axis=1)
y = car_df["selling_price"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=101
)

In [12]:
# Identifier les colonnes numériques et catégorielles
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

# Appliquer StandardScaler aux colonnes numériques
scaler = StandardScaler()
X_train[X_numeric_columns] = scaler.fit_transform(X_train[X_numeric_columns])
X_test[X_numeric_columns] = scaler.transform(X_test[X_numeric_columns])

# Appliquer OneHotEncoder aux colonnes catégorielles
encoder = OneHotEncoder(handle_unknown="ignore")

ValueError: Boolean array expected for the condition, not int64

In [None]:
# Remettre les index des DataFrames encodés pour les concaténer correctement
X_train_encoded.index = X_train.index
X_test_encoded.index = X_test.index

# Concaténer les colonnes numériques et catégorielles encodées
X_train = pd.concat([X_train[numeric_features], X_train_encoded], axis=1)
X_test = pd.concat([X_test[numeric_features], X_test_encoded], axis=1)

# Convertir tous les noms de colonnes en chaînes de caractères
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Entraîner un modèle de régression linéaire
model = LinearRegression()
model.fit(X_train, y_train)

# Prédire les valeurs
y_pred = model.predict(X_test)

ValueError: setting an array element with a sequence.

In [None]:
rmse = root_mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R²: {r2}")

In [None]:
plt.figure(figsize=(12, 8))
sns.regplot(
    x=y_test, y=y_pred, color="r", scatter_kws={"s": 10}, line_kws={"color": "blue"}
)
plt.xlabel("Actual Selling Price")
plt.ylabel("Predicted Selling Price")
plt.title("Scatter Plot of Actual vs. Predicted Selling Price")

# Afficher le graphique
plt.show()