In [94]:
import warnings    
warnings.simplefilter("ignore", UserWarning)
import pandas as pd
import numpy as np   
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import RandomizedSearchCV
import pickle  
from sklearn.ensemble import GradientBoostingRegressor 
from pprint import pprint 

## Reading Data

In [95]:
# Load the cleaned dataset.
# The file's first column is an unnamed, serialized row index (it shows up as
# "Unnamed: 0" when read naively). Consume it as the index so it is not
# passed through to the model as a meaningless numeric feature.
data = pd.read_csv('../data/clean_data.csv', index_col=0)

In [99]:
# Preview the first five rows to sanity-check the load
data.head(n=5)

Unnamed: 0,السعر,الموديل,موديل سنة,لون السيارة,قوة الماتور,عدد الركاب,عداد السيارة,أصحاب سابقون,أصل السيارة,رخصة السيارة,...,نوع الجير,الزجاج,وسادة حماية هوائية,فرش جلد,جنطات مغنيسيوم,فتحة سقف,مسجل CD,إغلاق مركزي,مُكيّف,جهاز إنذار
0,100000.0,كيا اوبتيما,2014,أبيض عاجي,2000.0,5.0,75000.0,1.0,خصوصي,فلسطينية,...,اوتوماتيك,الكتروني,1,1,1,1,1,1,1,1
1,60000.0,كيا سورينتو,2007,سكني,2500.0,8.0,130000.0,2.0,خصوصي,فلسطينية,...,نصف اوتوماتيك,الكتروني,1,1,1,1,1,1,1,1
2,43500.0,هونداي افانتي,2006,سكني,1600.0,5.0,65000.0,2.0,خصوصي,فلسطينية,...,اوتوماتيك,الكتروني,1,1,1,0,1,1,1,1
3,5500.0,فيات 127,1982,بيج,906.0,5.0,65000.0,0.0,خصوصي,فلسطينية,...,عادي,يدوي,0,0,0,0,1,0,1,0
4,54000.0,بيجو 208,2014,فضي,1200.0,5.0,38000.0,2.0,خصوصي,فلسطينية,...,اوتوماتيك,الكتروني,1,0,1,0,0,1,1,1


In [None]:
# Names of the object-dtype columns; these are the ones that get one-hot encoded
categorical_features = [
    name for name, dtype in data.dtypes.items() if dtype == 'O'
]

In [None]:
# Dense one-hot encoder; categories not seen during fit are encoded as all
# zeros instead of raising at transform time.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older releases.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
# Column-wise preprocessing: one-hot encode the categorical columns and leave
# every other column untouched.
transformer = ColumnTransformer(
    [('categorical', ohe, categorical_features)],
    remainder='passthrough',          # forward the unlisted columns as-is
    verbose_feature_names_out=False,  # do not prefix output column names
)

In [None]:
# Separate the target column from the predictor columns
y = data['السعر']
X = data.drop('السعر', axis=1)
# Fit the column transformer on the predictors and encode them in one pass
transformed_X = transformer.fit_transform(X)

In [72]:
# Candidate values for the randomized hyperparameter search.
# NOTE(review): the best_params_ output recorded further down lists
# min_samples_split=10 and a min_samples_leaf entry, neither of which this
# grid can produce -- that output looks stale; re-run the search to refresh it.

# Feature-subset strategies tried at each split (None = all features)
max_features = ['sqrt', 'log2', None]
# Number of boosting stages: 100, 200, ..., 500
n_estimators = np.linspace(start=100, stop=500, num=5).astype(int).tolist()
# Tree depth limits: ten evenly spaced values in [5, 74], plus "unlimited"
max_depth = np.linspace(5, 74, num=10).astype(int).tolist() + [None]
# Minimum samples a node needs before it may be split: ten values in [100, 300]
min_samples_split = np.linspace(100, 300, num=10).astype(int).tolist()
# Shrinkage applied to each tree's contribution
learning_rate = [0.01, 0.05, 0.1]

# Search space handed to RandomizedSearchCV
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    learning_rate=learning_rate,
    max_features=max_features,
)

In [None]:
# Base estimator for the search. Gradient boosting is stochastic here (the
# grid samples max_features subsets), so pin the seed to make fits repeatable.
gbt = GradientBoostingRegressor(random_state=42)

In [73]:
# Randomized search over `random_grid` for the gradient boosting regressor:
# 30 sampled parameter combinations, each scored with 5-fold cross-validation,
# run on 4 parallel workers with per-fit progress logging.
# random_state fixes which 30 combinations are drawn, so a Restart & Run All
# evaluates the same candidates instead of a different sample every time.
gbt_random = RandomizedSearchCV(estimator = gbt,
                               param_distributions = random_grid,
                               n_iter = 30,
                               cv = 5,
                               verbose = 4,
                               n_jobs = 4,
                               random_state = 42)

In [None]:
# Run the randomized hyperparameter search on the encoded features
gbt_random.fit(X=transformed_X, y=y)

In [83]:
# Keep the search's best estimator (the search leaves `refit` at its default,
# so this is the model refit with the best-scoring parameter combination)
best_gbt_model = gbt_random.best_estimator_

In [85]:
# Report the winning parameter combination and its mean cross-validated score
print(
    "Best parameters:\n {}\n\n Best score\n {}".format(
        gbt_random.best_params_, gbt_random.best_score_
    )
)

Best parameters:
 {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 51, 'learning_rate': 0.01}

 Best score
 0.7791771230827625


# Pickling

In [87]:
# Bundle the preprocessing step and the tuned regressor into one pipeline so
# untransformed rows can be scored directly.
gbt_pipline = Pipeline([
    ("transformer", transformer),
    ("gradient boosting tree regressor", best_gbt_model),
])
# Fit both steps on the raw (un-encoded) features; as the cell's last
# expression, the fitted pipeline is also displayed.
gbt_pipline.fit(X, y)

In [89]:
# Export the fitted GBT regressor pipeline to disk
gbt_regressor_pipline_file_name = "../models/gbt pipline.pkl"
# Use a context manager so the file handle is flushed and closed even if
# pickling raises; a bare open() leaves the handle to the garbage collector.
with open(gbt_regressor_pipline_file_name, 'wb') as model_file:
    pickle.dump(gbt_pipline, model_file)