In [650]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import make_scorer, root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
import re
from datetime import datetime

In [None]:
file_train_set = pd.read_csv()
file_test_set = pd.read_csv()

In [None]:
train_set = prepare_data(file_train_set)
test_set = prepare_data(file_test_set)

In [641]:
X_train = train_set.drop(columns=['Price']) 
y_train = train_set['Price'] 
X_test = test_set.drop(columns=['Price']) 
y_test = test_set['Price'] 

In [642]:
def prepare_data(df):
    #Data arrangement
    df['capacity_Engine'] = df['capacity_Engine'].replace('None', np.nan)
    df['capacity_Engine'] = df['capacity_Engine'].str.replace(',', '')
    df['Km'] = df['Km'].replace('None', np.nan)
    df['Km'] = df['Km'].str.replace(',', '')
    df['capacity_Engine'] = df['capacity_Engine'].fillna(0)
    df['Pic_num'] = df['Pic_num'].fillna(0)
    df = df.drop(columns=['Test','Supply_score','Cre_date','Repub_date','Area','Description'])
    
    def clean_model_column(row):
        manufactor = row['manufactor']
        model = row['model']
        model = re.sub(r'\b' + re.escape(manufactor) + r'\b', '', model, flags=re.IGNORECASE)
        model = re.sub(r'\(\d{4}\)', '', model)
        model = re.sub(r'\b\d{4}\b', '', model)
        model = re.sub(r',', '', model)
        model = model.strip()
    
        return model

    df.loc[:,'model']= df.apply(clean_model_column, axis=1)
    
    #arranging types
    df['Year'] = df['Year'].astype(int)
    df['Hand'] = df['Hand'].astype(int)
    df['Gear'] = df['Gear'].astype('category')
    df['capacity_Engine'] = df['capacity_Engine'].astype(int)
    df['Engine_type'] = df['Engine_type'].astype('category')
    df['Engine_type'] = df['Engine_type'].astype('category')
    df['Prev_ownership'] = df['Prev_ownership'].astype('category')
    df['Curr_ownership'] = df['Curr_ownership'].astype('category')
    df['City'] = df['City'].astype('string')
    df['Price'] = df['Price'].astype('float64')
    df['Pic_num'] = df['Pic_num'].astype(int)
    
    #Filling in missing values
    df['Engine_type'].fillna('בנזין', inplace=True)
    df['Gear'].fillna('אוטומטית', inplace=True)
    df['Prev_ownership'].fillna('פרטית', inplace=True)
    df['Curr_ownership'].fillna('פרטית', inplace=True)

    #Calculation of KM according to the annual average of a vehicle in Israel
    current_year = datetime.now().year
    df['difference'] = current_year - df['Year']
    df['Km'].fillna(df['difference'] * 15000, inplace=True)
    df.drop(columns=['difference'], inplace=True)
    
    #One-Hot Encoding
    columns_to_encode = ['Color', 'manufactor', 'model', 'Gear', 'Engine_type', 'Prev_ownership', 'Curr_ownership', 'City']

    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_features = encoder.fit_transform(df[columns_to_encode])
    encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(columns_to_encode))
    df = df.drop(columns_to_encode, axis=1).reset_index(drop=True)
    df = pd.concat([df, encoded_df], axis=1)
    # scaling 
    scaler = StandardScaler()  
    columns_to_scale = ['Year', 'Hand', 'capacity_Engine', 'Pic_num', 'Km']
    df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
    
    
    return df

In [645]:
model = ElasticNet(alpha=0.1, l1_ratio=0.9)

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Define the cross-validation procedure
cv = KFold(n_splits=10, random_state=42, shuffle=True)

# Define the scoring method
scoring = make_scorer(rmse, greater_is_better=False)

# Perform cross-validation
cv_scores = cross_val_score(model, X_test, y_test, cv=cv, scoring=scoring)

# Convert the negative RMSE scores to positive by taking the absolute value
cv_scores = np.abs(cv_scores)

In [646]:
model.fit(X_train, y_train)

# Get the coefficients and feature names
coefficients = model.coef_
feature_names = X_train.columns

# Create a DataFrame to hold feature names and their corresponding coefficients
coeff_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Aggregate the coefficients back to the original features
# Create a dictionary to map encoded feature names to their original feature names
encoded_to_original = {v: v.split('_')[0] for v in feature_names}

# Map the coefficients to the original feature names
coeff_df['Original_Feature'] = coeff_df['Feature'].map(encoded_to_original)

# Sum the coefficients for each original feature and determine positive or negative influence
coeff_df['Positive_Influence'] = coeff_df['Coefficient'] > 0
aggregated_coeff_df = coeff_df.groupby('Original_Feature').agg(
    Total_Coefficient=('Coefficient', 'sum'),
    Absolute_Coefficient=('Coefficient', lambda x: x.abs().sum())
).reset_index()

aggregated_coeff_df['Influence_Type'] = aggregated_coeff_df['Total_Coefficient'].apply(lambda x: 'Positive' if x > 0 else 'Negative')

top_5_features = aggregated_coeff_df.head(5)

top_5_features

Unnamed: 0,Original_Feature,Total_Coefficient,Absolute_Coefficient,Influence_Type
0,City,261.016057,327422.401536,Positive
1,Color,54.895487,67888.266631,Positive
2,Curr,8.99658,15539.532476,Positive
3,Engine,8.999369,25859.229043,Positive
4,Gear,17.942849,13236.318084,Positive


## Results

In [649]:
print("Cross-validation RMSE scores: ", cv_scores)
print("Mean RMSE: ", cv_scores.mean())

Cross-validation RMSE scores:  [12643.46607627 14054.42869987 14852.18500311 14045.58340554
 15301.03793817 19137.78180734 13133.8760071  18115.63989836
 13656.84596782 16072.58349375]
Mean RMSE:  15101.342829734003
