In [1]:
import warnings    
warnings.simplefilter("ignore", UserWarning)
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt    
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer,\
    OneHotEncoder, LabelEncoder, PowerTransformer, StandardScaler
from sklearn.neighbors import KNeighborsRegressor 
import seaborn as sns
from sklearn.impute import KNNImputer
import re
from sklearn.tree import DecisionTreeRegressor
from scipy import stats  
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
import pickle  
from sklearn.ensemble import GradientBoostingRegressor 
from pprint import pprint
from src.utils import previous_owners, convert_text_to_number,\
    passengers_number, get_model_scores, outlier_detector,\
    outlier_detector_zscore 

Reading Data

In [2]:
# Load the cleaned dataset (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='../data/clean_data.csv')

In [3]:
# Base gradient-boosting regressor, created with the library defaults.
gbt = GradientBoostingRegressor()
# Print the default hyper-parameters so the tunable names are visible
# before the search space is defined.
gbt_default_params = gbt.get_params()
pprint(gbt_default_params)

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}


In [4]:
# Candidate values for each gradient-boosting hyper-parameter.
# Number of boosting stages (trees): 100, 200, ..., 500
n_estimators = np.linspace(start=100, stop=500, num=5).astype(int).tolist()
# Maximum depth of each tree: ten evenly spaced values between 5 and 74
# (truncated to integers), followed by None (no explicit depth limit)
max_depth = np.linspace(5, 74, num=10).astype(int).tolist() + [None]
# Minimum number of samples required to split a node
min_samples_split = [5, 10, 16]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4, 7]
# Learning rate (shrinkage applied to the contribution of each tree)
learning_rate = [0.01, 0.1]
# Search space handed to the randomized search, keyed by parameter name
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    learning_rate=learning_rate,
)

In [5]:
# Randomized hyper-parameter search over `random_grid`:
# 20 sampled combinations, each evaluated with 5-fold cross validation
# (100 fits in total), run in a single process.
gbt_random = RandomizedSearchCV(
    estimator=gbt,                    # the base model
    param_distributions=random_grid,  # candidate values per hyper-parameter
    n_iter=20,                        # number of random combinations to try
    cv=5,                             # 5-fold cross validation
    verbose=4,                        # log every fit while searching
    random_state=42,                  # fixed seed: the same 20 candidates are drawn on every run
    n_jobs=1,                         # one worker; set to -1 to use all available cores
)

In [6]:
# Names of the numeric feature columns (the dataset uses Arabic column names).
numeric_features = [
    'عدد الركاب',    # number of passengers
    'قوة الماتور',   # engine power
    'موديل سنة',     # model year
    'أصحاب سابقون',  # previous owners
    'عداد السيارة',  # car odometer
]

In [7]:
# One-hot encoder applied to every categorical column
ohe = OneHotEncoder()
# Collect the names of the object-dtype (categorical) columns, in column order.
# NOTE(review): this scans every column of `data`, so it assumes the target
# column is not object-typed -- confirm.
categorical_features = []
for column_name, column_dtype in data.dtypes.items():
    if column_dtype == 'O':
        categorical_features.append(column_name)
# Per-column preprocessing steps as (name, transformer, columns) triples
preprocessing_steps = [
    # one-hot encode the categorical features
    ('categorical', ohe, categorical_features),
    # standardize the numeric features with StandardScaler
    ('numerical', StandardScaler(), numeric_features),
]
# Transformer that applies each step only to its own columns
transformer = ColumnTransformer(
    transformers=preprocessing_steps,
    # columns not listed in any step are passed through unchanged
    remainder='passthrough',
    # do not prefix output column names with the step name
    verbose_feature_names_out=False,
)

In [8]:
# Target column ("the price"); every other column is used as a feature.
target_column = 'السعر'
y = data[target_column]
# Encode/scale the feature columns; X holds the transformed feature matrix.
# NOTE(review): the transformer is fitted on all rows here, before the
# cross-validated search is run on X -- consider moving it into a Pipeline.
X = transformer.fit_transform(data.drop(columns=target_column))

In [9]:
# Run the randomized search on the transformed features and the target.
gbt_random.fit(X=X, y=y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END learning_rate=0.1, max_depth=28, min_samples_leaf=7, min_samples_split=10, n_estimators=200; total time=  35.1s
[CV] END learning_rate=0.1, max_depth=28, min_samples_leaf=7, min_samples_split=10, n_estimators=200; total time=  32.9s
[CV] END learning_rate=0.1, max_depth=28, min_samples_leaf=7, min_samples_split=10, n_estimators=200; total time=  36.0s
[CV] END learning_rate=0.1, max_depth=28, min_samples_leaf=7, min_samples_split=10, n_estimators=200; total time=  34.8s
[CV] END learning_rate=0.1, max_depth=28, min_samples_leaf=7, min_samples_split=10, n_estimators=200; total time=  34.6s
[CV] END learning_rate=0.1, max_depth=43, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  36.0s
[CV] END learning_rate=0.1, max_depth=43, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  37.1s
[CV] END learning_rate=0.1, max_depth=43, min_samples_leaf=2, min_samples_split=5, n_estimato

In [10]:
# Keep a handle on the estimator selected by the randomized search.
best_gbt_model = gbt_random.best_estimator_
# Bare last expression so the notebook displays the selected estimator.
best_gbt_model

In [11]:
# Display the hyper-parameter combination selected by the randomized search.
gbt_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_depth': 5,
 'learning_rate': 0.1}

In [12]:
# Pretty-print the selected hyper-parameters (pprint shows the keys sorted).
pprint(gbt_random.best_params_)

{'learning_rate': 0.1,
 'max_depth': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 200}


In [13]:
# Report the selected hyper-parameters together with their mean
# cross-validation score.
search_summary = (
    f"Best parameters:\n {gbt_random.best_params_}\n\n"
    f" Best score\n {gbt_random.best_score_}"
)
print(search_summary)

Best parameters:
 {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 5, 'learning_rate': 0.1}

 Best score
 0.7992584243500028


In [72]:
# NOTE(review): this repeats the fit already performed in an earlier cell, and
# the execution count (72) is out of sequence. The log recorded below lists
# min_samples_split=32, which is not in the current search grid, so that output
# appears to come from an older grid -- consider removing this cell.
gbt_random.fit(X=X, y=y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END learning_rate=0.1, max_depth=5, min_samples_leaf=2, min_samples_split=32, n_estimators=400; total time=   7.4s
[CV] END learning_rate=0.1, max_depth=5, min_samples_leaf=2, min_samples_split=32, n_estimators=400; total time=   7.2s
[CV] END learning_rate=0.1, max_depth=5, min_samples_leaf=2, min_samples_split=32, n_estimators=400; total time=   7.1s
[CV] END learning_rate=0.1, max_depth=5, min_samples_leaf=2, min_samples_split=32, n_estimators=400; total time=   8.4s
[CV] END learning_rate=0.1, max_depth=5, min_samples_leaf=2, min_samples_split=32, n_estimators=400; total time=   8.5s
[CV] END learning_rate=0.1, max_depth=5, min_samples_leaf=4, min_samples_split=16, n_estimators=400; total time=   8.4s
[CV] END learning_rate=0.1, max_depth=5, min_samples_leaf=4, min_samples_split=16, n_estimators=400; total time=   8.4s
[CV] END learning_rate=0.1, max_depth=5, min_samples_leaf=4, min_samples_split=16, n_estimators=40

In [14]:
# Report the selected hyper-parameters and their mean cross-validation score
# after the most recent fit.
search_summary = (
    f"Best parameters:\n {gbt_random.best_params_}\n\n"
    f" Best score\n {gbt_random.best_score_}"
)
print(search_summary)

Best parameters:
 {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 5, 'learning_rate': 0.1}

 Best score
 0.7992584243500028
