In [73]:
# standard library imports
import re
import pickle

# third-party library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# custom imports
from pprint import pprint
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from category_encoders.one_hot import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer

In [51]:
dataset = pd.read_csv("../data/data_enriched.csv")

In [52]:
dataset.head()

Unnamed: 0,car_color,fuel_type,car_type,car_license,gear_type,windows,motor_power,speedometer,passengers_number,payment_method,car_state,ex_owners,additions,price
0,أبيض عاجي,بنزين,خصوصي,فلسطينية,اوتوماتيك,الكتروني,2000,75000.0,5.0,نقدا فقط,للبيع فقط,1.0,\n\nمُكيّف\nإغلاق مركزي\nجهاز إنذار\nمسجل CD\n...,100000
1,سكني,ديزل,خصوصي,فلسطينية,نصف اوتوماتيك,الكتروني,2500,130000.0,8.0,إمكانية التقسيط,للبيع أو التبديل,2.0,\n\nمُكيّف\nإغلاق مركزي\nجهاز إنذار\nمسجل CD\n...,60000
2,سكني,بنزين,خصوصي,فلسطينية,اوتوماتيك,الكتروني,1600,,,نقدا فقط,للبيع فقط,,\n\nمُكيّف\nإغلاق مركزي\nجهاز إنذار\nمسجل CD\n...,43500
3,بيج,بنزين,خصوصي,فلسطينية,عادي,يدوي,906,,5.0,إمكانية التقسيط,للبيع فقط,0.0,\n\nمُكيّف\nمسجل CD\n\n,5500
4,فضي,بنزين,خصوصي,فلسطينية,اوتوماتيك,الكتروني,1200,38000.0,5.0,نقدا فقط,للبيع فقط,,\n\nمُكيّف\nإغلاق مركزي\nجهاز إنذار\nجنطات مغن...,54000


In [53]:
dataset.shape

(6864, 14)

## Feature Engineering

### Handle outliers

In [54]:
def handle_outliers(data, feature):
    upper_lim = data[feature].quantile(.999)
    lower_lim = data[feature].quantile(.0000001)

    return data[(data[feature] < upper_lim) & (data[feature] > lower_lim)]
dataset = handle_outliers(dataset, 'speedometer')

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [None]:
dataset.shape

(6495, 14)

### Handle missing data

In [55]:
def imputer_transform(data_columns):
    return transform


def knn_imputer(data_columns):
    KNNImputer(n_neighbors=3).fit(data_columns)

In [56]:
def handle_missing_data(data):
    nan_columns = data.columns[data.isna().any()].tolist()
    if nan_columns:
        model = KNNImputer(n_neighbors=3)
        after_imputation = model.fit_transform(data[nan_columns])
        for indx, feature in enumerate(nan_columns):
            data[feature] = after_imputation[:,indx].round().astype(int)


In [57]:
handle_missing_data(dataset)

#### Feature Selection

In [58]:
dataset.drop(['additions'], axis=1, inplace=True) 

#### Split data into features and target variable

In [59]:
X = dataset.drop(columns = 'price')
y = dataset['price'] 

In [60]:
categorical_features = dataset.select_dtypes(['object']).columns.to_list()

In [61]:
numeric_features = X.select_dtypes('number').columns.to_list()

#### Feature Encoding

In [62]:
# encode categorical features 
ohe = OneHotEncoder()

#### Feature Scaling

In [63]:
# feature scaling using log transformation
power_transformer  = PowerTransformer()

In [67]:
# Apply different preprocessing steps to different columns of the data
transformer = ColumnTransformer( 
    transformers = [  
        # encode categorical features
        ('categorical', ohe, categorical_features), # ('categorical', OneHotEncoder(), categorical_features),
        # feature scaling using log transformation
        ('numerical', PowerTransformer(), numeric_features), # ('numerical', PowerTransformer(), numeric_features),
    ],
    remainder='passthrough'
    )

#### Feature Transformation
###### transform features applying ColumnTransformer() pipeline

In [68]:
# transform the data (without the target variable) 
X_transformed = transformer.fit_transform(X)

  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():


In [69]:
pd.DataFrame(X_transformed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.525931,0.037392,-0.193832,-0.495750
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.312045,0.037899,2.482188,0.304865
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,-0.175659,-2.319713,-0.193832,0.304865
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,-1.641241,0.037069,-0.193832,-1.952893
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,-0.970011,0.037064,-0.193832,-0.495750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,-1.416348,0.037899,-0.193832,0.304865
6860,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.161471,0.036756,2.482188,1.590857
6861,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,-0.175659,0.037168,-0.193832,-1.952893
6862,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.850793,0.036756,1.703235,0.304865


Split the data

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=.25, random_state=42)

In [71]:
y_train.shape

(5148,)

In [72]:
y_test.shape

(1716,)

In [85]:
models = {'ols': {'model': LinearRegression(), 
                  'description': 'orinary least square'
                  },
        'sgd1': {'model': SGDRegressor(),
                  'description': 'gradient descent with 1000 iteration'
                  },
        'sgd2': {'model': SGDRegressor(max_iter=10),
                  'description': 'gradient descent with 10 iteration'
                  },
        'ply2': {'model': LinearRegression(),
                  'degree': 2,
                  'description': 'Polynomial with 2 degree'
                  },
        'ply2_ridge': {'model': Ridge(),
                  'degree': 2,
                  'description': 'orinary least square'
                  },
        'ply2_lasso': {'model': Lasso(),
                  'degree': 2,
                  'description': 'orinary least square'
                  },
        '3nn': {'model': KNeighborsClassifier(n_neighbors=3),
                  'description': 'KNN with k=3'
                  },
        'DT': {'model': DecisionTreeRegressor(),
                  'description': 'Decision Tree Regressor'
                  },
        "rfr": {"model": RandomForestRegressor(n_estimators = 15),
                  "description":"Random Forest Regressor"
                  },
        'GBR': {'model': GradientBoostingRegressor(),
                  'description': 'Gradient Boosting Regressor'
                  }
  }

Train the data

In [75]:
def train_plynomial_model(model, degree, X_train, y_train):
    """
    returns polynomial model with the given degree
    """
    poly = PolynomialFeatures(degree)
    poly_x_train = poly.fit_transform(X_train)
    model.fit(poly_x_train, y_train)

In [76]:
def train_model(models, X_train, y_train):
    """
    models: dictionary
    {"model_id":{"model":which model, "description":""}}
    """
    
    # go over all models
    for model_id in models:
        # train if polynomial
        print(models[model_id])
        if models[model_id].get("degree"):
            train_plynomial_model(models[model_id]['model'],\
                                  models[model_id]['degree'],\
                                  X_train,\
                                  y_train)
            
        else:  
            models[model_id]['model'].fit(X_train, y_train)
        #print(models[model_id]['model'].coef_)
        #print(models[model_id]['model'].intercept_)
        
    # return the new dictionary
    return models

In [77]:
def predict_plynomial_model(model, degree, X_test):
    poly = PolynomialFeatures(degree)
    poly_x_test = poly.fit_transform(X_test)   
    return model.predict(poly_x_test)

In [86]:
def eval_models(models, X_test, y_test, train_test):
    """
    models: dictionary
    {"model_id":{"model":which model, "description":"", "RMSE":<value>, "R2_score":<value>}}
    """
    # go over all models
    for model_id in models:
        # predict them
        if models[model_id].get("degree"): #if not (models[model_id].get("degree") is None):
            y_predict= predict_plynomial_model(models[model_id]['model'],\
                                  models[model_id]['degree'],\
                                  X_test)
            
        else: 
            y_predict = models[model_id]['model'].predict(X_test)

        # evaluate models using 'RMSE' evaluation metric
        rmse = mean_squared_error(y_test, y_predict, squared=False) # np.sqrt(mean_squared_error())

        if train_test == 'train':
            models[model_id]["train_RMSE"] = rmse
            
            # evaluate models using 'R square' evaluation metric
            models[model_id]["train_R2_score"] = r2_score(y_test, y_predict)
        
        else:
            models[model_id]["test_RMSE"] = rmse
            
            # evaluate models using 'R square' evaluation metric
            models[model_id]["test_R2_score"] = r2_score(y_test, y_predict)

    # return models dictionary adjusted attached with evaluation metric
    return models

In [87]:
models

{'ols': {'model': LinearRegression(), 'description': 'orinary least square'},
 'sgd1': {'model': SGDRegressor(),
  'description': 'gradient descent with 1000 iteration'},
 'sgd2': {'model': SGDRegressor(max_iter=10),
  'description': 'gradient descent with 10 iteration'},
 'ply2': {'model': LinearRegression(),
  'degree': 2,
  'description': 'Polynomial with 2 degree'},
 'ply2_ridge': {'model': Ridge(),
  'degree': 2,
  'description': 'orinary least square'},
 'ply2_lasso': {'model': Lasso(),
  'degree': 2,
  'description': 'orinary least square'},
 '3nn': {'model': KNeighborsClassifier(n_neighbors=3),
  'description': 'KNN with k=3'},
 'DT': {'model': DecisionTreeRegressor(),
  'description': 'Decision Tree Regressor'},
 'rfr': {'model': RandomForestRegressor(n_estimators=15),
  'description': 'Random Forest Regressor'},
 'GBR': {'model': GradientBoostingRegressor(),
  'description': 'Gradient Boosting Regressor'}}

In [88]:
train_models_res = train_model(models, X_train, y_train)

{'model': LinearRegression(), 'description': 'orinary least square'}
{'model': SGDRegressor(), 'description': 'gradient descent with 1000 iteration'}
{'model': SGDRegressor(max_iter=10), 'description': 'gradient descent with 10 iteration'}
{'model': LinearRegression(), 'degree': 2, 'description': 'Polynomial with 2 degree'}




{'model': Ridge(), 'degree': 2, 'description': 'orinary least square'}
{'model': Lasso(), 'degree': 2, 'description': 'orinary least square'}


  model = cd_fast.enet_coordinate_descent(


{'model': KNeighborsClassifier(n_neighbors=3), 'description': 'KNN with k=3'}
{'model': DecisionTreeRegressor(), 'description': 'Decision Tree Regressor'}
{'model': RandomForestRegressor(n_estimators=15), 'description': 'Random Forest Regressor'}
{'model': GradientBoostingRegressor(), 'description': 'Gradient Boosting Regressor'}


In [91]:
eval_models_test = eval_models(models, X_test, y_test, 'test')

In [92]:
eval_models_test

{'ols': {'model': LinearRegression(),
  'description': 'orinary least square',
  'test_RMSE': 49041.43714106187,
  'test_R2_score': 0.25969942320951633},
 'sgd1': {'model': SGDRegressor(),
  'description': 'gradient descent with 1000 iteration',
  'test_RMSE': 48963.98982776893,
  'test_R2_score': 0.2620357747626767},
 'sgd2': {'model': SGDRegressor(max_iter=10),
  'description': 'gradient descent with 10 iteration',
  'test_RMSE': 48933.00177902035,
  'test_R2_score': 0.26296955630947916},
 'ply2': {'model': LinearRegression(),
  'degree': 2,
  'description': 'Polynomial with 2 degree',
  'test_RMSE': 9348140002781.82,
  'test_R2_score': -2.6898751741365708e+16},
 'ply2_ridge': {'model': Ridge(),
  'degree': 2,
  'description': 'orinary least square',
  'test_RMSE': 49010.41287945031,
  'test_R2_score': 0.2606357748093837},
 'ply2_lasso': {'model': Lasso(),
  'degree': 2,
  'description': 'orinary least square',
  'test_RMSE': 49708.58445524091,
  'test_R2_score': 0.23942069792260268}

In [93]:
eval_models_test['DT']

{'model': DecisionTreeRegressor(),
 'description': 'Decision Tree Regressor',
 'test_RMSE': 68992.99687221769,
 'test_R2_score': -0.46518252610868105}

In [94]:
eval_models_tain = eval_models(models, X_train, y_train, 'train')

In [95]:
eval_models_tain

{'ols': {'model': LinearRegression(),
  'description': 'orinary least square',
  'test_RMSE': 49041.43714106187,
  'test_R2_score': 0.25969942320951633,
  'train_RMSE': 49003.404843488745,
  'train_R2_score': 0.261542333398054},
 'sgd1': {'model': SGDRegressor(),
  'description': 'gradient descent with 1000 iteration',
  'test_RMSE': 48963.98982776893,
  'test_R2_score': 0.2620357747626767,
  'train_RMSE': 49055.15719035543,
  'train_R2_score': 0.25998174397307183},
 'sgd2': {'model': SGDRegressor(max_iter=10),
  'description': 'gradient descent with 10 iteration',
  'test_RMSE': 48933.00177902035,
  'test_R2_score': 0.26296955630947916,
  'train_RMSE': 49212.81419972806,
  'train_R2_score': 0.25521745203901636},
 'ply2': {'model': LinearRegression(),
  'degree': 2,
  'description': 'Polynomial with 2 degree',
  'test_RMSE': 9348140002781.82,
  'test_R2_score': -2.6898751741365708e+16,
  'train_RMSE': 45166.41270654537,
  'train_R2_score': 0.37265811067963694},
 'ply2_ridge': {'model':

In [119]:
models

{'ols': {'model': LinearRegression(),
  'description': 'orinary least square',
  'RMSE': 47321.28154097956,
  'R2_score': 0.31136976724865373},
 'sgd1': {'model': SGDRegressor(),
  'description': 'gradient descent with 1000 iteration',
  'RMSE': 239694649.06592083,
  'R2_score': -17668095.732223246},
 'sgd2': {'model': SGDRegressor(max_iter=10),
  'description': 'gradient descent with 10 iteration',
  'RMSE': 201568952.2127231,
  'R2_score': -12494541.739883708},
 'ply2': {'model': LinearRegression(),
  'degree': 2,
  'description': 'Polynomial with 2 degree',
  'RMSE': 36779.111176999446,
  'R2_score': 0.5840170315721622},
 'ply3_ridge': {'model': Ridge(),
  'degree': 2,
  'description': 'orinary least square',
  'RMSE': 36375.784602337575,
  'R2_score': 0.5930904998326155},
 'ply3_lasso': {'model': Lasso(),
  'degree': 2,
  'description': 'orinary least square',
  'RMSE': 36378.30230469637,
  'R2_score': 0.5930341704655102},
 '3nn': {'model': KNeighborsClassifier(n_neighbors=3),
  'd

In [96]:
print(models['DT'])
print(eval_models_tain['DT'])
print(eval_models_test['DT'])

{'model': DecisionTreeRegressor(), 'description': 'Decision Tree Regressor', 'test_RMSE': 68992.99687221769, 'test_R2_score': -0.46518252610868105, 'train_RMSE': 5544.078354896493, 'train_R2_score': 0.990547823784261}
{'model': DecisionTreeRegressor(), 'description': 'Decision Tree Regressor', 'test_RMSE': 68992.99687221769, 'test_R2_score': -0.46518252610868105, 'train_RMSE': 5544.078354896493, 'train_R2_score': 0.990547823784261}
{'model': DecisionTreeRegressor(), 'description': 'Decision Tree Regressor', 'test_RMSE': 68992.99687221769, 'test_R2_score': -0.46518252610868105, 'train_RMSE': 5544.078354896493, 'train_R2_score': 0.990547823784261}


In [143]:
models_evaluation_test_train()

{'model': LinearRegression(), 'description': 'orinary least square', 'RMSE': 47321.28154097956, 'R2_score': 0.31136976724865373}
{'model': LinearRegression(), 'description': 'orinary least square', 'RMSE': 47321.28154097956, 'R2_score': 0.31136976724865373}
{'model': LinearRegression(), 'description': 'orinary least square', 'RMSE': 47321.28154097956, 'R2_score': 0.31136976724865373} 


{'model': SGDRegressor(), 'description': 'gradient descent with 1000 iteration', 'RMSE': 239694649.06592083, 'R2_score': -17668095.732223246}
{'model': SGDRegressor(), 'description': 'gradient descent with 1000 iteration', 'RMSE': 239694649.06592083, 'R2_score': -17668095.732223246}
{'model': SGDRegressor(), 'description': 'gradient descent with 1000 iteration', 'RMSE': 239694649.06592083, 'R2_score': -17668095.732223246} 


{'model': SGDRegressor(max_iter=10), 'description': 'gradient descent with 10 iteration', 'RMSE': 201568952.2127231, 'R2_score': -12494541.739883708}
{'model': SGDRegressor(max_iter

In [99]:
pd.DataFrame.from_dict(models,orient='index') # orient='index'

Unnamed: 0,model,description,test_RMSE,test_R2_score,train_RMSE,train_R2_score,degree
ols,LinearRegression(),orinary least square,49041.44,0.2596994,49003.404843,0.261542,
sgd1,SGDRegressor(),gradient descent with 1000 iteration,48963.99,0.2620358,49055.15719,0.259982,
sgd2,SGDRegressor(max_iter=10),gradient descent with 10 iteration,48933.0,0.2629696,49212.8142,0.255217,
ply2,LinearRegression(),Polynomial with 2 degree,9348140000000.0,-2.689875e+16,45166.412707,0.372658,2.0
ply2_ridge,Ridge(),orinary least square,49010.41,0.2606358,45325.00557,0.368245,2.0
ply2_lasso,Lasso(),orinary least square,49708.58,0.2394207,45191.688533,0.371956,2.0
3nn,KNeighborsClassifier(n_neighbors=3),KNN with k=3,55857.59,0.03961339,51336.139662,0.189563,
DT,DecisionTreeRegressor(),Decision Tree Regressor,68993.0,-0.4651825,5544.078355,0.990548,
rfr,"(DecisionTreeRegressor(max_features=1.0, rando...",Random Forest Regressor,49831.91,0.2356421,21599.493423,0.85653,
GBR,([DecisionTreeRegressor(criterion='friedman_ms...,Gradient Boosting Regressor,47091.93,0.3173868,42773.852571,0.437361,


In [122]:
def models_evaluation_test_train():
    for model in models:
        # print((eval_models_tain[model[0]], '\n', eval_models_test[model[1]], '\n\n'))
        print(model)
        print(f"train_R2_score = {models[model]['train_R2_score']}")
        print(f"test_R2_score = {models[model]['test_R2_score']}")
        print('-----------------------------------------------')
        

In [123]:
models_evaluation_test_train()

ols
train_R2_score = 0.261542333398054
test_R2_score = 0.25969942320951633
-----------------------------------------------
sgd1
train_R2_score = 0.25998174397307183
test_R2_score = 0.2620357747626767
-----------------------------------------------
sgd2
train_R2_score = 0.25521745203901636
test_R2_score = 0.26296955630947916
-----------------------------------------------
ply2
train_R2_score = 0.37265811067963694
test_R2_score = -2.6898751741365708e+16
-----------------------------------------------
ply2_ridge
train_R2_score = 0.3682448038262921
test_R2_score = 0.2606357748093837
-----------------------------------------------
ply2_lasso
train_R2_score = 0.3719557736839182
test_R2_score = 0.23942069792260268
-----------------------------------------------
3nn
train_R2_score = 0.1895625413876112
test_R2_score = 0.03961338651088231
-----------------------------------------------
DT
train_R2_score = 0.990547823784261
test_R2_score = -0.46518252610868105
------------------------------------

* GradientBoostingRegressor() scored the highest, pickle that model pipeline

In [133]:
pipeline = Pipeline( 
    steps = [
        ("transformer", transformer),
        ("model", GradientBoostingRegressor())
    ]
)

In [134]:
# fit the pipeline on the all the dataset
pipeline.fit(X, y) 

  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():


In [135]:
pickle.dump(pipeline,
            open('../pkls/gbr_model_pipeline.pkl',
            'wb'))