In [7]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv("scraped_data.csv")

# Data preprocessing
# Handling missing values and scaling for numeric features
numeric_features = ['Annee']
categorical_features = ['Couleur', 'Finition', 'Modele', 'Marque']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Custom preprocessing for 'Kilometrage' column
def extract_numeric(text):
    try:
        # Refine the regular expression to capture numeric values along with units
        match = re.search(r'(\d+(\.\d+)?)', text)
        if match:
            return float(match.group(1))
        else:
            return None
    except:
        return None  # Return None if conversion fails


data['Kilometrage'] = data['Kilometrage'].apply(extract_numeric)  # Apply custom preprocessing
data['price'] = data['price'].apply(extract_numeric)  # Apply custom preprocessing

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Train-test split
X = data.drop(columns=['price'])
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Find indices of rows with NaN target values
nan_indices = np.isnan(y_train)

# Remove rows with NaN target values
X_train = X_train[~nan_indices]
y_train = y_train[~nan_indices]


In [1]:
import pandas as pd
# Load the dataset
data = pd.read_csv("scraped_data.csv")
data

Unnamed: 0,options,price
0,,950 Millions
1,,1 340 Millions
2,,1 Millions
3,,795 Millions
4,,690 Millions
...,...,...
91,"Climatisation,ABS,ESP,Radar de recul,Direction...",380 Millions
92,"Climatisation,ABS,ESP,Radar de recul,Direction...",1 Millions
93,"Climatisation,Toit ouvrant,ABS,ESP,Radar de re...",200 Millions
94,,1 Millions


In [8]:
# Model training
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor())])  # Example: RandomForestRegressor
model.fit(X_train, y_train)


# Model evaluation
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)

print(f"Train RMSE: {train_rmse}, Train MSE: {train_mse}")
print(f"Test RMSE: {test_rmse}, Test MSE: {test_mse}")

Train RMSE: 77.78206842742532, Train MSE: 6050.050168848674
Test RMSE: 211.21030935916883, Test MSE: 44609.7947795958


