In [15]:
import warnings    
warnings.simplefilter("ignore", UserWarning)
import pandas as pd
import numpy as np   
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
import pickle  
from sklearn.ensemble import GradientBoostingRegressor 
from pprint import pprint
# from src.utils import previous_owners, convert_text_to_number,\
#     passengers_number, get_model_scores, outlier_detector,\
#     outlier_detector_zscore 

Reading Data

In [69]:
data = pd.read_csv('../data/clean_data.csv')

In [70]:
data.head()

Unnamed: 0,السعر,الموديل,موديل سنة,لون السيارة,قوة الماتور,عدد الركاب,عداد السيارة,أصحاب سابقون,أصل السيارة,رخصة السيارة,...,نوع الجير,الزجاج,وسادة حماية هوائية,فرش جلد,جنطات مغنيسيوم,فتحة سقف,مسجل CD,إغلاق مركزي,مُكيّف,جهاز إنذار
0,100000.0,كيا اوبتيما,2014,أبيض عاجي,2000.0,5.0,75000.0,1.0,خصوصي,فلسطينية,...,اوتوماتيك,الكتروني,1,1,1,1,1,1,1,1
1,60000.0,كيا سورينتو,2007,سكني,2500.0,8.0,130000.0,2.0,خصوصي,فلسطينية,...,نصف اوتوماتيك,الكتروني,1,1,1,1,1,1,1,1
2,43500.0,هونداي افانتي,2006,سكني,1600.0,5.0,65000.0,2.0,خصوصي,فلسطينية,...,اوتوماتيك,الكتروني,1,1,1,0,1,1,1,1
3,5500.0,فيات 127,1982,بيج,906.0,5.0,65000.0,0.0,خصوصي,فلسطينية,...,عادي,يدوي,0,0,0,0,1,0,1,0
4,54000.0,بيجو 208,2014,فضي,1200.0,5.0,38000.0,2.0,خصوصي,فلسطينية,...,اوتوماتيك,الكتروني,1,0,1,0,0,1,1,1


In [71]:
gbt = GradientBoostingRegressor() 

In [72]:
#Number of features to consider when looking for the best split
max_features = ['sqrt', 'log2', None] 
#Number of trees 
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 5)]  
#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 74, num = 10)]  
max_depth.append(None)
#Minimum number of samples required to split a node
min_samples_split = [5, 10, 16] #100, 200, 300
#Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4, 7]#10, 15, 20  
#Method of selecting samples for training each tree
learning_rate = [0.01, 0.1] 
#Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'learning_rate': learning_rate,
    'max_features': max_features
    } 

In [73]:
# Use the random grid to search for best hyperparameters
# Random search of parameters, using 4 fold cross validation, 
# search across 20 different combinations, and use all available cores
#define gbt_random a RandomizedSearchCV obj to 
gbt_random = RandomizedSearchCV(estimator = gbt, # the base model
                               param_distributions = random_grid, # grid of hyper parameters
                               n_iter = 20, # number of combinations of random hyper parameters to choose
                               cv = 5,#5-fold 
                               verbose=4,  # print some logs while evaluating the different combinations
                               n_jobs = 1) # use all cores in the machine to train (parallel)

In [74]:
#get categorical features name
categorical_features = [col for col in data.columns if data[col].dtypes == 'O']

#get numeric features name
numeric_features = [
                'عدد الركاب', 
                'قوة الماتور', 
                'موديل سنة', 
                'أصحاب سابقون', 
                'عداد السيارة'
                ]

In [75]:
ohe = OneHotEncoder(sparse = False, handle_unknown = 'ignore')
standard_scaler = StandardScaler()

In [80]:
#define transformer to selectively apply data preparation transforms
transformer = ColumnTransformer( 
    transformers = [  
        #apply onehot encoding to categorical features
        ('categorical', ohe, categorical_features),  
    ],
    #unspecified columns will be passed through without transformation  
    remainder='passthrough',
    #prevent adding prefix to columns names
    verbose_feature_names_out = False 
)

In [81]:
#slice the target variable and the other features
X = data.drop(columns = 'السعر')
y = data['السعر'] 
#transform the data (without the target variable) 
transformed_X = transformer.fit_transform(X)

In [82]:
#Fit the random search model  
gbt_random.fit(transformed_X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END learning_rate=0.1, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.729 total time=  52.9s
[CV 2/5] END learning_rate=0.1, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.740 total time= 1.1min
[CV 3/5] END learning_rate=0.1, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.711 total time=  52.1s
[CV 4/5] END learning_rate=0.1, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.733 total time=  59.1s
[CV 5/5] END learning_rate=0.1, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.786 total time=  50.6s
[CV 1/5] END learning_rate=0.01, max_depth=5, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=300;, score=0.420 total time

In [83]:
best_gbt_model = gbt_random.best_estimator_

In [84]:
gbt_random.best_params_

{'n_estimators': 500,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 51,
 'learning_rate': 0.01}

In [85]:
print(f"Best parameters:\n {gbt_random.best_params_}\n\n Best score\n {gbt_random.best_score_}")

Best parameters:
 {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 51, 'learning_rate': 0.01}

 Best score
 0.7791771230827625


In [86]:
gbt_random.best_estimator_ 

In [93]:
gbt_random

# Pickling

In [87]:
#GBTRegressor Pipeline
gbt_pipline = Pipeline( 
    steps = [
        ("transformer", transformer),
        ("GBTRegressor", best_gbt_model)
    ] 
)
#fit data
gbt_pipline.fit(X, y) 

In [89]:
#export gbt regressor pipline 
gbt_regressor_pipline_file_name = "../models/gbt pipline.pkl"
pickle.dump(gbt_pipline, open(gbt_regressor_pipline_file_name, 'wb'))