#### Importing packages

In [1]:
import warnings    
warnings.simplefilter("ignore", UserWarning)
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt    
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer,\
    OneHotEncoder, LabelEncoder, PowerTransformer
from sklearn.neighbors import KNeighborsRegressor 
import seaborn as sns
from sklearn.impute import KNNImputer
import re
from sklearn.tree import DecisionTreeRegressor
from scipy import stats  
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import cross_val_score
import pickle 
from src.utils import previous_owners, convert_text_to_number,\
    passengers_number, get_model_scores, outlier_detector,\
    outlier_detector_zscore 

### Reading data

In [8]:
# Load the raw car listings from a CSV file (relative path).
# NOTE(review): no index_col is passed and the preview below shows an
# 'Unnamed: 0' column, so a saved row index appears to be loaded as a regular
# column — confirm whether it should be dropped before modelling.
data = pd.read_csv('../data/cars_data.csv')

In [9]:
# preview the first rows of the loaded frame
data.head()

Unnamed: 0,السعر,الموديل,موديل سنة,لون السيارة,قوة الماتور,عدد الركاب,عداد السيارة,أصحاب سابقون,أصل السيارة,رخصة السيارة,...,الزجاج,سعر التأمين,وسادة حماية هوائية,فرش جلد,جنطات مغنيسيوم,فتحة سقف,مسجل CD,إغلاق مركزي,مُكيّف,جهاز إنذار
0,100000.0,كيا اوبتيما,2014,أبيض عاجي,2000.0,4+1,75000,يد اولى,خصوصي,فلسطينية,...,الكتروني,3090,1,1,1,1,1,1,1,1
1,60000.0,كيا سورينتو,2007,سكني,2500.0,7+1,130000,2,خصوصي,فلسطينية,...,الكتروني,2740,1,1,1,1,1,1,1,1
2,43500.0,هونداي افانتي,2006,سكني,1600.0,,,,خصوصي,فلسطينية,...,الكتروني,2340,1,1,1,0,1,1,1,1
3,5500.0,فيات 127,1982,بيج,906.0,4+1,شغال,00,خصوصي,فلسطينية,...,يدوي,967,0,0,0,0,1,0,1,0
4,54000.0,بيجو 208,2014,فضي,1200.0,4+1,38000,,خصوصي,فلسطينية,...,الكتروني,2035,1,0,1,0,0,1,1,1


In [10]:
# (number of rows, number of columns) of the loaded frame
data.shape

(6864, 22)

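There are 6864 rows and 22 columns. The columns include the target `السعر` and the `Unnamed: 0` index column, so fewer than 22 of them are actual features.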

# Feature Engineering

- Encode categorical features
- Dealing with skewed features 

In [59]:
# names of the columns that are treated as numeric features
numeric_features = [
    'عدد الركاب',
    'قوة الماتور',
    'موديل سنة',
    'أصحاب سابقون',
    'عداد السيارة',
]

# names of the columns whose dtype is object ('O'); these are the ones
# handed to the categorical encoder
categorical_features = [
    name for name, dtype in data.dtypes.items() if dtype == 'O'
]

In [60]:
# OneHotEncoder for the categorical features: dense output, and categories
# that were not seen during fit are ignored instead of raising an error.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back to the old one on older releases.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # older scikit-learn releases do not accept `sparse_output`
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
# PowerTransformer (with its default settings) for the numeric features
power_transformer = PowerTransformer()

In [61]:
# column-wise preprocessing; each entry is (step name, transformer, columns)
column_steps = [
    # one-hot encode the categorical columns
    ('categorical', ohe, categorical_features),
    # apply the power transformer to the numeric columns
    ('numerical', power_transformer, numeric_features),
]
transformer = ColumnTransformer(
    column_steps,
    # columns not listed above are passed through without transformation
    remainder='passthrough',
    # do not prefix the output feature names with the step name
    verbose_feature_names_out=False,
)

In [62]:
# split the frame into the target column and the remaining predictor columns
target_column = 'السعر'
y = data[target_column]
X = data.drop(columns=[target_column])

In [63]:
# Fit the column transformer on the predictors and transform them in one step
# (the target variable is not included).
# NOTE(review): the transformer is fitted on all rows here and the result is
# what the cross-validation cells below receive, so every fold shares the same
# preprocessing statistics — confirm this is acceptable for model comparison.
transformed_X = transformer.fit_transform(X)

  out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


# Model Selection

Try KNN, a decision tree, linear regression (ordinary least squares and SGD), and degree-2 polynomial regression with Lasso, then select the best model for predicting car price.

- `KNN`

Search for an optimal value of K for KNN

In [64]:
# Search k = 1..30 for the KNN regressor with cross-validation.
k_range = range(1, 31)
# maps str(k) -> mean cross-validation score obtained with k neighbours
k_scores = {}
for k in k_range:
    # KNN regressor with k neighbours
    knn = KNeighborsRegressor(n_neighbors=k)
    # cross-validate it on the transformed features (default cv and scoring)
    scores = cross_val_score(knn, transformed_X, y)
    # Store the unrounded mean: rounding to two decimals before taking the
    # maximum creates artificial ties between neighbouring k values, and a tie
    # is then resolved by insertion order rather than by the actual score.
    k_scores[str(k)] = scores.mean()

In [65]:
# the optimal k is the key with the highest stored score
# (the first such key wins when several share the maximum)
optimal_k, _best_score = max(k_scores.items(), key=lambda item: item[1])
print("optimal value of k is ", optimal_k)

optimal value of k is  9


In [66]:
# Candidate models to evaluate. Each entry holds the estimator under "model",
# a human-readable "description", and optional hyperparameters; the scores are
# attached later by get_model_scores.
# NOTE(review): "degree" is presumably read by get_model_scores to build
# polynomial features — confirm against src.utils.
models = {
    "KNN": {
        "model": KNeighborsRegressor(n_neighbors=int(optimal_k)),
        "description": "KNN Regressor",
        "K": int(optimal_k),
    },
    "DecisionTreeRegressor": {
        "model": DecisionTreeRegressor(),
        "description": "Decision Tree Regressor",
    },
    "ols": {
        "model": LinearRegression(),
        "description": "ordinary least squares",
    },
    "sgd1": {
        "model": SGDRegressor(),
        "description": "gradient descent with degree 1",
    },
    "poly2_lasso": {
        "model": Lasso(),
        "degree": 2,
        # previously carried the copy-pasted OLS description
        "description": "Polynomial of degree 2 with Lasso regularization",
    },
    # Disabled candidates (the reason is not recorded in this notebook):
    # "poly2": {
    #     "model": LinearRegression(),
    #     "degree": 2,
    #     "description": "Polynomial of degree 2"
    #     },
    # "poly2_ridge": {
    #     "model": Ridge(), "degree": 2,
    #     "description": "Polynomial of degree 2 with Ridge regularization"
    #     },
}

In [181]:
# Score every candidate: get_model_scores (imported from src.utils) receives
# the models dictionary, the transformed features and the target, and its
# return value replaces `models`. The dictionary displayed further down shows
# 'scores_list' and 'score_mean' entries for each model after this call.
models = get_model_scores(models, transformed_X, y)

degree


In [182]:
# display the models dictionary, now including the scores
models

{'KNN': {'model': KNeighborsRegressor(n_neighbors=9),
  'description': 'KNN Regressor',
  'K': 9,
  'scores_list': array([0.57721361, 0.49016709, 0.47240443, 0.54135009, 0.55050326]),
  'score_mean': 0.5263276976574237},
 'DecisionTreeRegressor': {'model': DecisionTreeRegressor(),
  'description': 'Decision Tree Regressor',
  'scores_list': array([0.54804847, 0.67775327, 0.58553857, 0.68116249, 0.73948871]),
  'score_mean': 0.6463983011092546},
 'ols': {'model': LinearRegression(),
  'description': 'orinary least square',
  'scores_list': array([-1.41827845e+18, -1.89122681e+18, -5.35281508e+18, -8.76163913e+19,
         -1.25454480e+21]),
  'score_mean': -2.7016470315513577e+20},
 'sgd1': {'model': SGDRegressor(),
  'description': 'gradient descent with degree 1',
  'scores_list': array([0.59516241, 0.4470111 , 0.50048765, 0.5661522 , 0.50115749]),
  'score_mean': 0.5219941679816815},
 'poly2_lasso': {'model': Lasso(),
  'degree': 2,
  'description': 'orinary least square',
  'scores_

In [192]:
# print each model id followed by its 'score_mean' value and a separator line
separator = "--------------------------"
for model_id, model_info in models.items():
    print(model_id, model_info["score_mean"], separator, sep="\n")

KNN
0.5263276976574237
--------------------------
DecisionTreeRegressor
0.6463983011092546
--------------------------
ols
-2.7016470315513577e+20
--------------------------
sgd1
0.5219941679816815
--------------------------
poly2_lasso
0.6611242943659412
--------------------------


The best mean score (≈0.66) is from <b>Polynomial Regression with degree of 2 (Lasso)</b>, followed by the Decision Tree Regressor (≈0.65).

## Pickling

In [67]:
# Decision tree pipeline: the column transformer followed by a
# DecisionTreeRegressor with default settings
tree_steps = [
    ("transformer", transformer),
    ("DecisionTreeRegressor", DecisionTreeRegressor()),
]
tree_pip = Pipeline(tree_steps)

In [68]:
# fit the whole pipeline (column transformer + decision tree) on the
# untransformed predictors X and the target y
tree_pip.fit(X, y) 

  out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


In [89]:
# Export the fitted decision tree pipeline with pickle.
# NOTE(review): this path uses "../Models/" while the Lasso pipeline is written
# to "../models/"; on a case-sensitive filesystem these are different
# directories — confirm which one is intended.
tree_regressor_model_file_name = "../Models/Decision Tree Regressor.pkl"
# the context manager closes the file handle even if pickling fails
with open(tree_regressor_model_file_name, 'wb') as model_file:
    pickle.dump(tree_pip, model_file)

In [91]:
# Lasso pipeline: column transformer -> degree-2 polynomial features -> Lasso
poly2 = PolynomialFeatures(degree=2)
poly2_lasso_steps = [
    ("transformer", transformer),
    ("polynomial", poly2),
    ("polynomial with degree of 2 lasso", Lasso()),
]
poly2_lasso_pipline = Pipeline(poly2_lasso_steps)

In [92]:
# fit the whole pipeline (column transformer + polynomial features + Lasso)
# on the untransformed predictors X and the target y
poly2_lasso_pipline.fit(X, y) 

In [93]:
# Export the fitted degree-2 polynomial Lasso pipeline with pickle.
# NOTE(review): this path uses "../models/" while the decision tree pipeline
# is written to "../Models/"; on a case-sensitive filesystem these are
# different directories — confirm which one is intended.
poly2_lasso_model_file_name = "../models/polynomial degree 2 lasso.pkl"
# the context manager closes the file handle even if pickling fails
with open(poly2_lasso_model_file_name, 'wb') as model_file:
    pickle.dump(poly2_lasso_pipline, model_file)