# <span style="background-color: pink;">Shlomo Kleinman 209345578</span>
# <span style="background-color: pink;">Eilon Dadon 315784611</span>
### https://github.com/ShlomoKleinman/Mercedes-Project.git

# Building ML Model On Used Cars

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectFromModel
from scipy.stats import uniform
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler

In [2]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence two warning categories for the rest of the notebook:
# scikit-learn's ConvergenceWarning and pandas' PerformanceWarning.
for noisy_category in (ConvergenceWarning, pd.errors.PerformanceWarning):
    warnings.filterwarnings("ignore", category=noisy_category)

# <span style="background-color: grey;">Loading The Data</span>

In [3]:
# Read the raw car listings into a DataFrame; the bare `data` expression on the
# last line makes the notebook display the frame as this cell's output.
data = pd.read_csv('dataset.csv')
data

Unnamed: 0,manufactor,Year,model,Hand,Gear,capacity_Engine,Engine_type,Prev_ownership,Curr_ownership,Area,City,Price,Pic_num,Cre_date,Repub_date,Description,Color,Km,Test,Supply_score
0,יונדאי,2015,i35,2,אוטומטית,1600,בנזין,פרטית,פרטית,רעננה - כפר סבא,רעננה,51000.0,2.0,11/07/2023,11/07/2023,['רכב שמור בקנאות\nמוכרת עקב קבלת רכב חברה'],כחול כהה מטאלי,144000,,
1,ניסאן,2018,ניסאן מיקרה,1,אוטומטית,1200,בנזין,פרטית,פרטית,מושבים בשרון,אבן יהודה,49000.0,0.0,06/04/2022,22/05/2022,['שמורה כל התוספות'],כחול בהיר,69000,,
2,סוזוקי,2010,סוזוקי סוויפט,1,אוטומטית,1450,בנזין,,,רמת,רמת,22500.0,1.0,29/10/2022,29/10/2022,['רכב במצב מתוחזק ברמה גבוהה טסט עד אפריל 2023'],,145000,,
3,טויוטה,2016,אוריס,1,טיפטרוניק,1600,בנזין,פרטית,פרטית,נס ציונה - רחובות,רחובות,63000.0,5.0,16/05/2024,16/05/2024,['אוטו במצב חדש!! שמור בקנאות!! נהג יחיד מטופל...,אפור מטאלי,27300,,
4,קיה,2012,פיקנטו,1,אוטומטית,1248,בנזין,,,"ראשל""צ והסביבה",ראשון לציון,37000.0,1.0,13/06/2022,13/06/2022,['שמור'],,70000,,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,סקודה,2016,סקודה אוקטביה (2016),2,אוטומטית,1400,בנזין,,,ראש העין והסביבה,כפר קאסם,60000.0,1.0,18/07/2022,18/07/2022,['רכב שמור &lt;br/&gt;&lt;br/&gt;פירוט:&lt;br/...,,180,,
1496,אלפא רומיאו,2013,אלפא רומיאו ג'ולייטה,4,ידנית,1400,בנזין,,,חיפה וחוף הכרמל,חיפה,45000.0,0.0,44791,44791,['פרטית \r\nמכונית יפיפיה ללא שום תאונות ואו ת...,,160000,,
1497,סקודה,2014,סקודה ראפיד (2014),1,אוטומטית,1400,בנזין,,,,כפר מנדא,30000.0,1.0,30/08/2022,30/08/2022,['רכב נקי גיר שילדה מנוע במצב מעולה &lt;br/&gt...,,,,
1498,ניסאן,2011,ניסאן ג'וק JUKE,3,אוטומטית,1600,בנזין,פרטית,פרטית,אשדוד - אשקלון,אשדוד,28000.0,0.0,31/08/2023,09/11/2023,"['רכב מתוחזק היטב ללא תאונות או תקלות, טסט עד ...",אפור מטאלי,118000,81,


## Prepare data function for both train and test data

In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

def prepare_data(data, current_year=2024):
    """Clean a raw car-listings frame and turn it into a numeric feature matrix.

    Steps, in order: drop unused columns, normalise manufacturer/model names,
    coerce and impute ``capacity_Engine`` and ``Km``, add age/usage and log
    features, impute whatever is still missing, one-hot encode ``manufactor``
    and ``model``, add pairwise interaction terms, and min-max scale the base
    numeric columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw listings. Must contain ``manufactor``, ``model``, ``Year``,
        ``Hand``, ``capacity_Engine`` and ``Km``. Any other numeric column
        that is not dropped (e.g. ``Price``) is mean-imputed and passed through.
    current_year : int, default 2024
        Reference year used to derive the age of each car.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. The input frame is not modified.

    Notes
    -----
    NOTE(review): the imputers, the dummy encoding and the MinMaxScaler are all
    fitted on whatever frame is passed in, so a test frame is imputed and scaled
    with its own statistics rather than the training ones. Sharing fitted
    transformers would change this function's return contract, so it is only
    flagged here.
    """
    # Drop columns the model does not use; tolerate frames that lack some of them
    columns_to_drop = ['Pic_num', 'Cre_date', 'Repub_date', 'Description', 'Test', 'Supply_score', 'Gear', 'Engine_type', 'Prev_ownership', 'Curr_ownership', 'Area', 'City', 'Color']
    data = data.drop(columns=columns_to_drop, errors='ignore')

    # Replace same name in different language
    data['manufactor'] = data['manufactor'].str.replace('Lexsus', 'לקסוס', regex=False)

    # Remove manufacturer names from the model text and keep only its first word.
    # Missing / non-string manufacturers are skipped so they cannot break str.replace.
    manufacturers = [name for name in data['manufactor'].dropna().unique()
                     if isinstance(name, str) and name]
    data['model'] = data['model'].map(lambda model: _main_model_name(model, manufacturers))

    # Clean capacity_Engine, then fill gaps from progressively coarser group means.
    # fillna(group mean) only touches missing cells, so known values are never overwritten.
    data['capacity_Engine'] = _to_numeric(data['capacity_Engine'])
    for group_keys in (['manufactor', 'model', 'Year'], ['manufactor', 'model'], ['manufactor', 'Year']):
        group_mean = data.groupby(group_keys)['capacity_Engine'].transform('mean')
        data['capacity_Engine'] = data['capacity_Engine'].fillna(group_mean)

    # Clean Km, then fill gaps with the mean Km of cars from the same year
    data['Km'] = _to_numeric(data['Km'])
    data['Km'] = data['Km'].fillna(data.groupby(['Year'])['Km'].transform('mean'))

    # Feature engineering
    age = current_year - data['Year']
    # A car from the reference year has age 0; dividing by it would give inf,
    # which SimpleImputer rejects. Treat such cars as one year old for the ratio only.
    data['km_per_year'] = data['Km'] / age.clip(lower=1)
    data['age'] = age
    data['km_age_ratio'] = data['km_per_year']  # same quantity, kept so the output columns are unchanged

    # Log transform of continuous variables
    data['log_km'] = np.log1p(data['Km'])
    data['log_capacity_Engine'] = np.log1p(data['capacity_Engine'])

    # Separate numeric and non-numeric columns so each gets a suitable imputer
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    non_numeric_columns = data.select_dtypes(exclude=[np.number]).columns

    # Any remaining +/-inf (e.g. log1p of -1) becomes NaN so the imputer can handle it
    data[numeric_columns] = data[numeric_columns].replace([np.inf, -np.inf], np.nan)
    data[numeric_columns] = SimpleImputer(strategy='mean').fit_transform(data[numeric_columns])
    data[non_numeric_columns] = SimpleImputer(strategy='most_frequent').fit_transform(data[non_numeric_columns])

    # One-hot encoding
    categorical_columns = ['manufactor', 'model']
    data = pd.get_dummies(data, columns=[col for col in categorical_columns if col in data.columns])

    # Pairwise interaction terms (no squares, no bias column) of the base numeric columns
    required_columns = ['Year', 'Hand', 'capacity_Engine', 'Km', 'km_per_year', 'age']
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    poly_features = poly.fit_transform(data[required_columns])
    poly_feature_names = poly.get_feature_names_out(required_columns)
    poly_df = pd.DataFrame(poly_features, columns=poly_feature_names)
    # The originals are dropped because the polynomial output already contains them
    data = pd.concat([data.drop(columns=required_columns).reset_index(drop=True), poly_df], axis=1)

    # Normalization: the degree-1 terms come back under their original names,
    # so required_columns exist again here; only they are scaled, not the interactions.
    data[required_columns] = MinMaxScaler().fit_transform(data[required_columns])

    return data


def _to_numeric(series):
    """Return ``series`` as numbers, accepting strings with thousands separators ("1,600")."""
    if pd.api.types.is_numeric_dtype(series):
        return series  # already numeric: nothing to clean
    without_commas = series.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(without_commas, errors='coerce')  # unparseable text becomes NaN


def _main_model_name(model, manufacturers):
    """Strip every manufacturer name from ``model`` and return the first remaining word."""
    if not isinstance(model, str):
        return model  # missing model: left for the most-frequent imputer
    stripped = model
    for manufacturer in manufacturers:
        stripped = stripped.replace(manufacturer, '').strip()
    words = stripped.split()
    if words:
        return words[0]
    # The model text was nothing but manufacturer name(s): fall back to its own first word
    fallback_words = model.split()
    return fallback_words[0] if fallback_words else np.nan


## Preparing train data

In [5]:
# Run the full cleaning / feature-engineering pipeline on the raw listings;
# `data` itself is left untouched and the result is bound to a new name.
prepared_data = prepare_data(data)

## Splitting the data

In [6]:
# Separate the target (Price) from the predictors, then hold out 20% of the
# rows for testing with a fixed seed so the split is reproducible.
y = prepared_data['Price']
X = prepared_data.drop(columns=['Price'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Choosing Best Parameters for ElasticNet Model

In [7]:
# Exhaustive grid search over the ElasticNet regularisation settings:
# 4 alpha values x 6 l1_ratio values = 24 candidates, each scored by 10-fold CV.
param_grid = dict(
    alpha=[0.005, 0.01, 0.05, 0.1],
    l1_ratio=[0.9, 0.93, 0.94, 0.95, 0.97, 0.99],
)
elastic_net = ElasticNet(random_state=42)
grid_search = GridSearchCV(
    estimator=elastic_net,
    param_grid=param_grid,
    cv=10,
    scoring='neg_mean_squared_error',
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the estimator fitted with the best-scoring parameter combination
best_model = grid_search.best_estimator_

Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [8]:
# Print the selected estimator; its repr shows the chosen alpha and l1_ratio
print("Best parameters:", best_model)

Best parameters: ElasticNet(alpha=0.005, l1_ratio=0.97, random_state=42)


## Top 5 Most Important Features

In [9]:
# Feature importance: rank features by the magnitude of their fitted coefficient
feature_names = X.columns
feature_importance = np.abs(best_model.coef_)

# Signed coefficient per feature name; the first occurrence wins, which matches
# what a list.index() lookup by name would return.
signed_coef = {}
for col_name, coef in zip(feature_names, best_model.coef_):
    signed_coef.setdefault(col_name, coef)

# (importance, name) tuples sort by importance first; keep the five largest
important_5features = sorted(zip(feature_importance, feature_names), reverse=True)[:5]

print('Top 5 Features:')
for importance, name in important_5features:
    # Direction comes from the signed coefficient; the printed number is its absolute value
    sign = 'Positive' if signed_coef[name] > 0 else 'Negative'
    print(f'{name}: {sign} impact with coefficient {importance}')

Top 5 Features:
model_קורבט: Positive impact with coefficient 72327.91477459692
model_I-MIEV: Positive impact with coefficient 43078.69154092309
model_לנסר: Negative impact with coefficient 39781.987233122534
model_מוסטנג: Positive impact with coefficient 38413.96393526294
model_חיפושית: Positive impact with coefficient 38184.565924657625


## 10-fold cross-validation on the entire dataset

In [10]:
# Score the tuned estimator with 10-fold cross-validation over all of X / y
cv_scores = cross_val_score(
    best_model,
    X,
    y,
    cv=10,
    scoring='neg_mean_squared_error',
)
# The scorer reports negated MSE, so flip the sign before taking the root
cv_rmse = np.sqrt(-cv_scores)
for label, value in (
    ("Cross-validation RMSE scores:", cv_rmse),
    ("Mean CV RMSE:", cv_rmse.mean()),
    ("Standard deviation of CV RMSE:", cv_rmse.std()),
):
    print(label, value)

Cross-validation RMSE scores: [15247.38047989  9739.91099429 13976.72034649 10480.01408969
 14177.52718768 10676.4931994  12365.66038514 12646.3055417
 13107.11114852 10045.63475868]
Mean CV RMSE: 12246.275813148022
Standard deviation of CV RMSE: 1826.3268298902422


## Creating Predict Function for the test data

In [11]:
def predict_price(test_data):
    """Predict prices for an already-prepared frame using the tuned model.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Output of ``prepare_data``. It may lack some training columns (for
        example dummy columns for models absent from this frame) and may carry
        extra ones (for example ``Price``).

    Returns
    -------
    numpy.ndarray
        One predicted price per row of ``test_data``.
    """
    # Align to the training layout in one step: columns the model was trained on
    # but that are missing here are added as 0, extra columns are dropped, and the
    # training column order is restored. reindex returns a new frame, so the
    # caller's DataFrame is not modified.
    aligned = test_data.reindex(columns=X.columns, fill_value=0)

    # Make predictions using the best trained model
    return best_model.predict(aligned)

## Loading sample data

In [12]:
# Read the sample listings used to try out the prediction function; the bare
# `sample` expression displays the frame as this cell's output.
sample = pd.read_csv('sample_test_data.csv')
sample

Unnamed: 0,manufactor,Year,model,Hand,Gear,capacity_Engine,Engine_type,Prev_ownership,Curr_ownership,Area,City,Price,Pic_num,Cre_date,Repub_date,Description,Color,Km,Test,Supply_score
0,יונדאי,2015,i35,2,אוטומטית,1600,בנזין,פרטית,פרטית,רעננה - כפר סבא,רעננה,51000,2.0,11/07/2023,11/07/2023,בננה,כחול כהה מטאלי,144000.0,,
1,ניסאן,2018,ניסאן מיקרה,1,אוטומטית,1200,בנזין,פרטית,פרטית,מושבים בשרון,אבן יהודה,49000,0.0,06/04/2022,22/05/2022,בננה,כחול בהיר,69000.0,,
2,סוזוקי,2010,סוזוקי סוויפט,1,אוטומטית,1450,בנזין,,,רמת,רמת,22500,1.0,29/10/2022,29/10/2022,בננה,,145000.0,,
3,טויוטה,2016,אוריס,1,טיפטרוניק,1600,בנזין,פרטית,פרטית,נס ציונה - רחובות,רחובות,63000,5.0,16/05/2024,16/05/2024,בננה,אפור מטאלי,27300.0,,
4,קיה,2012,פיקנטו,1,אוטומטית,1248,בנזין,,,"ראשל""צ והסביבה",ראשון לציון,37000,1.0,13/06/2022,13/06/2022,בננה,,70000.0,,4.0
5,אאודי,2015,אאודי A1,3,אוטומטית,1400,בנזין,,,פתח תקוה והסביבה,פתח תקווה,60000,1.0,23/06/2022,23/06/2022,בננה,,120000.0,,357.0
6,סובארו,2009,אימפרזה,3,ידנית,2400,בנזין,פרטית,פרטית,,ירכא,95000,16.0,04/08/2022,03/08/2023,בננה,שחור,125000.0,83.0,
7,מיצובישי,2018,ASX,1,אוטומטית,2000,בנזין,,,"ראשל""צ והסביבה",בית דגן,90000,,15/07/2022,15/07/2022,בננה,,102000.0,,5.0
8,מרצדס,2015,220,1,אוטומטית,2200,דיזל,,,חיפה וחוף הכרמל,חיפה,70000,1.0,30/06/2022,30/06/2022,בננה,,,,
9,ב.מ.וו,2003,525,3,אוטומטית,2500,דיזל,,,חולון - בת ים,בת ים,45000,1.0,17/06/2022,17/06/2022,בננה,,400000.0,,57.0


## Using the same prepare_data function on the sample data

In [13]:
# NOTE(review): this rebinds `sample` from the raw frame to the prepared one, so
# re-running this cell alone would feed already-prepared data back into
# prepare_data. Re-run the loading cell first.
sample = prepare_data(sample)

In [14]:
# Listed prices of the sample rows, displayed for comparison with the model's predictions
sample['Price']

0     51000.0
1     49000.0
2     22500.0
3     63000.0
4     37000.0
5     60000.0
6     95000.0
7     90000.0
8     70000.0
9     45000.0
10    65000.0
11    22000.0
12    19500.0
13    77500.0
14    45000.0
15    23000.0
Name: Price, dtype: float64

## Predicting sample data

In [15]:
# Predict a price for every prepared sample row; the returned array is the cell output
predict_price(sample)

array([50074.60165799, 47315.20772333, 24928.36155119, 73341.60754011,
       30583.16846535, 58450.25085123, 39473.11411535, 83742.07390504,
       67978.99496789, 44717.30363159, 62936.05492058, 27750.75730703,
       16229.12989831, 69896.5038318 , 40314.72126993, 24698.79608024])

In [16]:
# Save the final model
# NOTE(review): predict_price also reads X.columns to align its input, and that
# column list is not written here. Code that loads this pickle needs the
# training column layout from somewhere else.
import joblib
joblib.dump(best_model, 'final_model.pkl')

['final_model.pkl']