In [3]:
# !pip3 install openpyxl

In [4]:
# Loading packages

import pandas as pd

import plotly_express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

from joblib import dump, load

In [5]:
data_price = pd.read_csv('./src/get_around_pricing_project.csv', index_col=[0])
data_price.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [6]:
data_price.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4843 entries, 0 to 4842
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   model_key                  4843 non-null   object
 1   mileage                    4843 non-null   int64 
 2   engine_power               4843 non-null   int64 
 3   fuel                       4843 non-null   object
 4   paint_color                4843 non-null   object
 5   car_type                   4843 non-null   object
 6   private_parking_available  4843 non-null   bool  
 7   has_gps                    4843 non-null   bool  
 8   has_air_conditioning       4843 non-null   bool  
 9   automatic_car              4843 non-null   bool  
 10  has_getaround_connect      4843 non-null   bool  
 11  has_speed_regulator        4843 non-null   bool  
 12  winter_tires               4843 non-null   bool  
 13  rental_price_per_day       4843 non-null   int64 
dtypes: bool(7), i

No missing data. Some qualitative data will need to be encoded.

## Preprocessing

In [7]:
features_list = data_price.drop('rental_price_per_day', axis=1).columns.to_list() 
target_name = 'rental_price_per_day'

X = data_price[features_list]
y = data_price[target_name]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42) # No need to straify the target as it is continuous

Automatically detect numerical and categorical features

In [9]:
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns.tolist() 
categorical_columns = X.select_dtypes(include=['object', 'bool']).columns.tolist()

In [10]:
numerical_columns

['mileage', 'engine_power']

In [11]:
categorical_columns

['model_key',
 'fuel',
 'paint_color',
 'car_type',
 'private_parking_available',
 'has_gps',
 'has_air_conditioning',
 'automatic_car',
 'has_getaround_connect',
 'has_speed_regulator',
 'winter_tires']

In [12]:
# No need for a complete Pipeline because we juste need a scaler for numerical and an encoder for categorical

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

column_transformer = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns)]
    )

In [13]:
X_train = column_transformer.fit_transform(X_train)
X_test = column_transformer.transform(X_test)

## Training the model

In [14]:
lr = LinearRegression()

In [15]:
lr.fit(X_train, y_train)

In [16]:
print('Train score is : {:.2f}'.format(lr.score(X_train, y_train)))
print('Test score is : {:.2f}'.format(lr.score(X_test, y_test)))

Train score is : 0.71
Test score is : 0.69


The baseline has a moderate score but not much overfitting (only 0.02 of difference) !! This is encouraging.

We could perform a gradient boost regression in order to attempt improving predictions, but at the cost of risking overfitting. Not the objective here.

Quantifying the importance of each features in the prediciton

In [17]:
coef = lr.coef_.tolist()
coef_features = {}
coef_features = {k: v for k, v in zip(features_list, coef)}
coef_features

{'model_key': -13.081520725717947,
 'mileage': 14.172065945602755,
 'engine_power': 10.745210610250213,
 'fuel': 5.502099011267995,
 'paint_color': 3.508101062999904,
 'car_type': 14.437035291152892,
 'private_parking_available': -34.98427229667813,
 'has_gps': -14.277285918769932,
 'has_air_conditioning': -30.313636757499292,
 'automatic_car': 7.68118058073951,
 'has_getaround_connect': 1.5893456328257196,
 'has_speed_regulator': 9.546249589672069,
 'winter_tires': 29.38398842081913}

Now I am sorting those coef in ascending order (in absolute values as I am not interested in polarity, only intensity)

In [18]:
coef_df = pd.DataFrame(coef_features.items(), columns=['Features', 'Coeff'])
coef_df['Coeff'] = abs(coef_df['Coeff'])
coef_df = coef_df.sort_values(by='Coeff', ascending=True)
coef_df

Unnamed: 0,Features,Coeff
10,has_getaround_connect,1.589346
4,paint_color,3.508101
3,fuel,5.502099
9,automatic_car,7.681181
11,has_speed_regulator,9.54625
2,engine_power,10.745211
0,model_key,13.081521
1,mileage,14.172066
7,has_gps,14.277286
5,car_type,14.437035


In [19]:
px.bar(coef_df, 'Features', 'Coeff', title='The most important features')

## Saving the model and the transformers

In [20]:
# dump(lr, './model/model_reg.pkl')

['./model/model_reg.pkl']

In [21]:
# dump(column_transformer, './model/transformer.pkl')

['./model/transformer.pkl']

## Test prediction

In [30]:
predictor = load('./api/model/model_reg.pkl')
transformer = load('./api/model/transformer.pkl')

In [31]:
test_input = {'model_key':'Citroën', 
              'mileage':77334, 
              'engine_power':256, 
              'fuel':'diesel', 
              'paint_color':'black', 
              'car_type':'coupe', 
              'private_parking_available':True, 
               'has_gps':False, 
               'has_air_conditioning':True, 
               'automatic_car':False, 
               'has_getaround_connect':False, 
               'has_speed_regulator':True, 
               'winter_tires':False}

In [32]:
user_data = pd.DataFrame([test_input])
user_data

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
0,Citroën,77334,256,diesel,black,coupe,True,False,True,False,False,True,False


In [34]:
X = transformer.transform(user_data)

In [35]:
X

<1x55 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [36]:
X_pred = predictor.predict(X)
print(f'The predicted price is {X_pred[0]}')

The predicted price is 179.22685972150043
