In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd




In [2]:
# Load the dataset from a CSV file
csv_path = 'data.csv'
new_data = pd.read_csv(csv_path)

In [3]:
# Function to encode categorical columns
def encode_categorical_columns(df, categorical_columns):
    """Return a copy of ``df`` with the selected text columns replaced by integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is left unmodified; all changes are made on a copy.
    categorical_columns : iterable
        Labels of the candidate columns. A candidate is encoded only when its
        dtype is ``object`` or a pandas string dtype; any other candidate is
        passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every encoded column holds integer codes
        ``0..k-1``, assigned in sorted order of the column's distinct values
        after those values were cast with ``astype(str)``.

    Raises
    ------
    KeyError
        If a label in ``categorical_columns`` is not a column of ``df``.

    Notes
    -----
    Each column is coded independently, so the same code in two columns does
    not mean the same category.
    NOTE(review): on pandas versions where ``astype(str)`` turns NaN into the
    text ``'nan'``, missing entries form their own category - confirm that
    this is the intended treatment of missing categorical values.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    encoded = df.copy()
    for column in categorical_columns:
        values = encoded[column]
        is_text = (
            pd.api.types.is_object_dtype(values)
            or pd.api.types.is_string_dtype(values)
        )
        if not is_text:
            continue
        # sort=True numbers the categories in sorted order of their string form.
        codes, _ = pd.factorize(values.astype(str), sort=True)
        encoded[column] = codes
    return encoded


In [4]:
# Names of every object-dtype column; these are the ones treated as categorical
categorical_columns = (
    new_data
    .select_dtypes(include=['object'])
    .columns
)

# Encode a copy of the frame so that new_data keeps its original values
encoded_data = encode_categorical_columns(
    df=new_data.copy(),
    categorical_columns=categorical_columns,
)


In [5]:
# Build the training material for the 'Levy' imputation model.
# Rows that have a 'Levy' value are used to fit and evaluate the model;
# rows that lack one are set aside to be predicted later.
levy_present = encoded_data['Levy'].notnull()
data_with_levy = encoded_data[levy_present]
data_without_levy = encoded_data[~levy_present]

# Features are every column except the target and 'Engine volume'
# ('Engine volume' is left out because it has missing values as well)
X = data_with_levy.drop(columns=['Levy', 'Engine volume'])
y = data_with_levy['Levy']

# Hold out 20% of the labelled rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:

# Fit a seeded 100-tree random forest on the 'Levy' training split
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate on the held-out rows with the mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse


55845.84279976396

In [7]:
# Predicting the missing 'Levy' values
# The feature set must match the one the model was fitted on, so the same
# two columns ('Levy' and 'Engine volume') are removed here.
X_missing_levy = data_without_levy.drop(['Levy', 'Engine volume'], axis=1)

if len(X_missing_levy) > 0:
    predicted_levy = model.predict(X_missing_levy)
    # Filling in the missing 'Levy' values in the original dataset.
    # Rows are addressed by label: encoded_data was built from a copy of
    # new_data, so both frames share the same index. Unlike re-deriving a
    # null mask from new_data, this keeps the cell safe to re-run after the
    # gaps have already been filled.
    # NOTE(review): assumes the index labels are unique (true for the default
    # index produced by read_csv) - confirm if the loading step changes.
    new_data.loc[X_missing_levy.index, 'Levy'] = predicted_levy
else:
    # Nothing to impute; predict() would raise on an empty feature matrix.
    predicted_levy = np.array([])

# Verifying if the missing values in 'Levy' have been filled
new_data.isnull().sum()


ID                     0
Price                  0
Levy                   0
Manufacturer           0
Model                  0
Prod. year             0
Category               0
Leather interior       0
Fuel type              0
Engine volume       4782
Mileage(km)            0
Cylinders              0
Gear box type          0
Drive wheels           0
Doors                  0
Wheel                  0
Color                  0
Airbags                0
dtype: int64

In [9]:
# Second imputation model: fill the gaps in 'Engine volume' from the other
# features of the dataset.

# Rows with a known 'Engine volume' are used for fitting and evaluation;
# rows without one are kept aside for prediction.
engine_vol_present = encoded_data['Engine volume'].notnull()
data_with_engine_vol = encoded_data[engine_vol_present]
data_without_engine_vol = encoded_data[~engine_vol_present]

# Features are every column except the target and 'Levy'
# ('Levy' is dropped to mirror the feature set of the 'Levy' model)
X_engine_vol = data_with_engine_vol.drop(columns=['Engine volume', 'Levy'])
y_engine_vol = data_with_engine_vol['Engine volume']

# Hold out 20% of the labelled rows for evaluation, with a fixed seed
(
    X_train_engine_vol,
    X_test_engine_vol,
    y_train_engine_vol,
    y_test_engine_vol,
) = train_test_split(
    X_engine_vol,
    y_engine_vol,
    test_size=0.2,
    random_state=42,
)


In [10]:

# Fit a seeded 100-tree random forest on the 'Engine volume' training split
model_engine_vol = RandomForestRegressor(random_state=42, n_estimators=100)
model_engine_vol.fit(X_train_engine_vol, y_train_engine_vol)

# Evaluate on the held-out rows with the mean squared error
y_pred_engine_vol = model_engine_vol.predict(X_test_engine_vol)
mse_engine_vol = mean_squared_error(
    y_true=y_test_engine_vol,
    y_pred=y_pred_engine_vol,
)
mse_engine_vol


0.07418512560207484

In [11]:
# Predicting the missing 'Engine volume' values
# The feature set must match the one the model was fitted on, so the same
# two columns ('Engine volume' and 'Levy') are removed here.
X_missing_engine_vol = data_without_engine_vol.drop(['Engine volume', 'Levy'], axis=1)

if len(X_missing_engine_vol) > 0:
    predicted_engine_vol = model_engine_vol.predict(X_missing_engine_vol)
    # Filling in the missing 'Engine volume' values in the original dataset.
    # Rows are addressed by label: encoded_data was built from a copy of
    # new_data, so both frames share the same index. Unlike re-deriving a
    # null mask from new_data, this keeps the cell safe to re-run after the
    # gaps have already been filled.
    # NOTE(review): assumes the index labels are unique (true for the default
    # index produced by read_csv) - confirm if the loading step changes.
    new_data.loc[X_missing_engine_vol.index, 'Engine volume'] = predicted_engine_vol
else:
    # Nothing to impute; predict() would raise on an empty feature matrix.
    predicted_engine_vol = np.array([])

# Verifying if the missing values in 'Engine volume' have been filled
new_data.isnull().sum()


ID                  0
Price               0
Levy                0
Manufacturer        0
Model               0
Prod. year          0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage(km)         0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64

In [14]:
# Display the working DataFrame (the notebook renders a truncated preview of its rows)
new_data

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage(km),Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399.00,LEXUS,RX 450,2010,Jeep,True,Hybrid,3.500,186005,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018.00,CHEVROLET,Equinox,2011,Jeep,False,Petrol,3.030,192000,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,3743.99,HONDA,FIT,2006,Hatchback,False,Petrol,1.300,200000,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862.00,FORD,Escape,2011,Jeep,True,Hybrid,2.500,168966,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446.00,HONDA,FIT,2014,Hatchback,True,Petrol,1.300,91901,4.0,Automatic,Front,04-May,Left wheel,Silver,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18271,45798355,8467,1602.53,MERCEDES-BENZ,CLK 200,1999,Coupe,True,CNG,2.000,300000,4.0,Manual,Rear,02-Mar,Left wheel,Silver,5
18272,45778856,15681,831.00,HYUNDAI,Sonata,2011,Sedan,True,Petrol,2.400,161600,4.0,Tiptronic,Front,04-May,Left wheel,Red,8
18273,45804997,26108,836.00,HYUNDAI,Tucson,2010,Jeep,True,Diesel,2.250,116365,4.0,Automatic,Front,04-May,Left wheel,Grey,4
18274,45793526,5331,1288.00,CHEVROLET,Captiva,2007,Jeep,True,Diesel,1.976,51258,4.0,Automatic,Front,04-May,Left wheel,Black,4
