In [1]:
from dataclasses import dataclass

import numpy as np 
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import os
from sklearn.model_selection import train_test_split

In [2]:
@dataclass
class DataTransformationConfig:
    preprocessor_obj_file_path = os.path.join("artifacts", "preprocessor.pkl")

In [3]:
TARGET_COLUMN_NAME = "price"

In [4]:
data_transformation_config = DataTransformationConfig()

In [5]:
df = pd.read_csv(r"Dataset\Clean_Dataset.csv")

In [6]:
df.drop(['Unnamed: 0', 'flight'], axis = 1, inplace = True)

In [7]:
df_ec = df[df['class'] == 'Economy'].loc[:5000,:]
df_bs = df.loc[df['class'] == 'Business'][:5000]

df = pd.concat([df_ec,df_bs], axis=0, ignore_index= True)

In [8]:
df.drop_duplicates(keep = 'first', inplace = True, ignore_index = True)

In [9]:
df['class'].value_counts()

class
Business    4998
Economy     4856
Name: count, dtype: int64

In [10]:
df.shape

(9854, 10)

In [11]:
categorical_columns = list(df.loc[:,df.dtypes == 'object'].columns)

In [12]:
df[categorical_columns].head(2)

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy


In [13]:
numerical_columns = list(df.loc[:,df.dtypes != 'object'].columns)

In [14]:
numerical_columns.remove('price')

In [15]:
numerical_columns

['duration', 'days_left']

In [17]:
num_pipeline = Pipeline(
                steps = [
                ("imputer" , SimpleImputer(strategy = "median")),
                ("scaler" , StandardScaler())
                ]
            )

In [18]:
type(num_pipeline)

sklearn.pipeline.Pipeline

In [19]:
num_pipeline

In [20]:
cat_pipeline = Pipeline(
                steps= [
                ("imputer" , SimpleImputer(strategy= "most_frequent")),
                ("one_hot_encoder" , OneHotEncoder()),
                ("scaler", StandardScaler(with_mean=False))
                ] 
            )

In [21]:
cat_pipeline

In [22]:
preprocessor = ColumnTransformer(
                [
                ("num_pipeline", num_pipeline, numerical_columns),
                ("cat_pipeline", cat_pipeline, categorical_columns)
                ],
                sparse_threshold = 0
            )

In [None]:
type(preprocessor)

In [23]:
preprocessor

In [25]:
train_df, test_df = train_test_split(df, test_size = 0.25, random_state=12)

In [26]:
print(train_df.shape)
print(test_df.shape)

(7390, 10)
(2464, 10)


In [27]:
train_df_independent_features = train_df.drop([TARGET_COLUMN_NAME], axis = 1)
train_df_target_feature = train_df[TARGET_COLUMN_NAME]

In [28]:
test_df_independent_features = test_df.drop([TARGET_COLUMN_NAME], axis = 1)
test_df_target_feature = test_df[TARGET_COLUMN_NAME]

In [29]:
input_feature_train_arr = preprocessor.fit_transform(train_df_independent_features)

In [30]:
len(input_feature_train_arr)

7390

In [31]:
type(input_feature_train_arr)

numpy.ndarray

In [32]:
input_feature_test_arr = preprocessor.transform(test_df_independent_features)

In [33]:
len(input_feature_test_arr)

2464

In [34]:
type(input_feature_test_arr)

numpy.ndarray

In [36]:
train_arr = np.c_[input_feature_train_arr, train_df_target_feature]
test_arr = np.c_[input_feature_test_arr, test_df_target_feature]

In [37]:
X_train,X_test, y_train,  y_test = (
                    train_arr[:,:-1],
                    train_arr[:,-1],
                    test_arr[:,:-1],
                    test_arr[:,-1]
                )

In [38]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(7390, 27)
(2464, 27)
(7390,)
(2464,)


In [39]:
from catboost import CatBoostRegressor
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor
)

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

from sklearn.metrics import (
    mean_absolute_error, 
    mean_squared_error,
    r2_score
)

In [41]:
models = {
                    "Random Forest" : RandomForestRegressor(),
                    "Decision Tree" : DecisionTreeRegressor(),
                    "Gradient Boosting" : GradientBoostingRegressor(),
                    "Linear Regression" : LinearRegression(),
                    "XGBRegressor" : XGBRegressor(),
                    "Catboost Regressor" : CatBoostRegressor(verbose = False),
                    "Adaboost Regressor" : AdaBoostRegressor(),
                    "Support Vector Regressor" : SVR()
                }

In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [43]:
svr = SVR()

In [45]:
param_dist = {
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "degree" : [2,3,4,5],
    "gamma" : ["scale", "auto"],
    "C" : [1,1.5,2,2.5,3,4],
    "epsilon" : [0.1,0.2,0.3]
}

In [None]:
randomCV = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=samples,cv=3)

In [46]:
svr_hp = RandomizedSearchCV(svr, param_distributions = param_dist, n_iter=5,cv=3)

In [50]:
svr_hp_fit = svr_hp.fit(X_train, X_test)
print(svr_hp_fit.best_params_)

{'kernel': 'sigmoid', 'gamma': 'auto', 'epsilon': 0.1, 'degree': 3, 'C': 4}


In [52]:
svr_model_basic = svr.fit(X_train, X_test)
svr_predictions_base = svr_model_basic.predict(y_train)

In [53]:
svr_tuned = SVR(kernel = 'sigmoid', gamma= 'auto', epsilon = 0.1, degree = 3, C = 4)
svr_model_tuned = svr_tuned.fit(X_train, X_test)
svr_predictions_tuned = svr_model_tuned.predict(y_train)

In [55]:
svr_base_mae = mean_absolute_error(y_test, svr_predictions_base)
svr_base_mse = mean_squared_error(y_test, svr_predictions_base)
svr_base_r2 = r2_score(y_test, svr_predictions_base)

svr_tuned_mae = mean_absolute_error(y_test, svr_predictions_tuned)
svr_tuned_mse = mean_squared_error(y_test, svr_predictions_tuned)
svr_tuned_r2 = r2_score(y_test, svr_predictions_tuned)

In [58]:
print(svr_base_mae)
print(svr_base_mse)
print(svr_base_r2)

print("*"*75)

print(svr_tuned_mae)
print(svr_tuned_mse)
print(svr_tuned_r2)

17901.10108582995
416930156.06737894
-0.009381091298470423
***************************************************************************
16231.785716460123
370872448.45654607
0.10212386581337562


In [59]:
y_test

array([ 5227., 37900.,  7838., ..., 14670.,  2410.,  3959.])

In [60]:
svr_predictions_tuned

array([19739.73171107, 23095.75221779, 19308.31918444, ...,
       18081.36368043, 16699.26206254, 18224.0065457 ])

# XGB

In [65]:
params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
    
}

In [66]:
xg = XGBRegressor()

In [67]:
xg_hp = RandomizedSearchCV(xg, param_distributions = params, n_iter=5,cv=3)

In [68]:
xg_hp_fit = xg_hp.fit(X_train, X_test)
print(xg_hp_fit.best_params_)

{'min_child_weight': 1, 'max_depth': 12, 'learning_rate': 0.2, 'gamma': 0.0, 'colsample_bytree': 0.5}


In [69]:
xg_model_basic = xg.fit(X_train, X_test)
xg_predictions_base = xg_model_basic.predict(y_train)

In [71]:
xg_tuned = XGBRegressor(min_child_weight = 1, max_depth= 12, learning_rate = 0.2, gamma = 0.0, colsample_bytree = 0.5)
xg_model_tuned = xg_tuned.fit(X_train, X_test)
xg_predictions_tuned = xg_model_tuned.predict(y_train)

In [72]:
xg_base_mae = mean_absolute_error(y_test, xg_predictions_base)
xg_base_mse = mean_squared_error(y_test, xg_predictions_base)
xg_base_r2 = r2_score(y_test, xg_predictions_base)

xg_tuned_mae = mean_absolute_error(y_test, xg_predictions_tuned)
xg_tuned_mse = mean_squared_error(y_test, xg_predictions_tuned)
xg_tuned_r2 = r2_score(y_test, xg_predictions_tuned)

In [73]:
print(xg_base_mae)
print(xg_base_mse)
print(xg_base_r2)

print("*"*75)

print(xg_tuned_mae)
print(xg_tuned_mse)
print(xg_tuned_r2)

1672.158820181698
10272339.605075836
0.9751308337622758
***************************************************************************
1763.335069780226
11405685.17097644
0.9723870226767035


In [74]:
y_test

array([ 5227., 37900.,  7838., ..., 14670.,  2410.,  3959.])

In [75]:
xg_predictions_tuned

array([ 6743.5996, 38567.04  ,  8425.729 , ...,  9241.555 ,  2617.2944,
        3075.1428], dtype=float32)

In [76]:
xg_predictions_base

array([ 4693.83  , 47258.062 ,  7751.873 , ..., 10442.13  ,  2458.941 ,
        3488.8477], dtype=float32)

# AdaBoostRegressor

In [77]:
ABR = AdaBoostRegressor()

In [89]:
params_ABR ={
 "n_estimators"    : [50,100,150,200,300,400,500 ] ,
 "learning_rate"        : [ 1, 1.5, 2, 3],
 "loss" : ["linear", "square", "exponential" ]
}

In [90]:
ABR_hp = RandomizedSearchCV(ABR, param_distributions = params_ABR, n_iter=5,cv=3)

In [91]:
ABR_hp_fit = ABR_hp.fit(X_train, X_test)
print(ABR_hp_fit.best_params_)

{'n_estimators': 50, 'loss': 'linear', 'learning_rate': 1}


In [92]:
ABR_model_basic = ABR.fit(X_train, X_test)
ABR_predictions_base = ABR_model_basic.predict(y_train)

In [93]:
ABR_tuned = AdaBoostRegressor(n_estimators = 50, loss= 'linear', learning_rate = 1)
ABR_model_tuned = ABR_tuned.fit(X_train, X_test)
ABR_predictions_tuned = ABR_model_tuned.predict(y_train)

In [94]:
ABR_base_mae = mean_absolute_error(y_test, ABR_predictions_base)
ABR_base_mse = mean_squared_error(y_test, ABR_predictions_base)
ABR_base_r2 = r2_score(y_test, ABR_predictions_base)

ABR_tuned_mae = mean_absolute_error(y_test, ABR_predictions_tuned)
ABR_tuned_mse = mean_squared_error(y_test, ABR_predictions_tuned)
ABR_tuned_r2 = r2_score(y_test, ABR_predictions_tuned)

In [95]:
print(ABR_base_mae)
print(ABR_base_mse)
print(ABR_base_r2)

print("*"*75)

print(ABR_tuned_mae)
print(ABR_tuned_mse)
print(ABR_tuned_r2)

4437.791230115997
39001000.479455344
0.9055792154805832
***************************************************************************
4541.2438031159345
39302536.47704471
0.9048492017603396


In [96]:
y_test

array([ 5227., 37900.,  7838., ..., 14670.,  2410.,  3959.])

In [100]:
ABR_predictions_base

array([ 5820.26258005, 44663.61455526,  5820.26258005, ...,
        7241.2008547 ,  7241.2008547 ,  5626.95940671])

In [99]:
ABR_predictions_tuned

array([ 6272.05394191, 44140.79404968,  6794.28802589, ...,
        7296.96521739,  6578.98765432,  6272.05394191])

# Cat Boost Regressor

In [44]:
cbr = CatBoostRegressor(verbose = False, loss_function = 'RMSE')

In [46]:
cbr_model_basic = cbr.fit(X_train, X_test)
cbr_predictions_base = cbr_model_basic.predict(y_train)

In [45]:
grid = {'iterations': [100, 150, 200],
        'learning_rate': [0.03, 0.1],
        'depth': [2, 4, 6, 8],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}
#cbr.grid_search(grid, train_dataset)

In [47]:
cbr_hp = RandomizedSearchCV(cbr, param_distributions = grid, n_iter=5,cv=3)

In [48]:
cbr_hp_fit = cbr_hp.fit(X_train, X_test)
print(cbr_hp_fit.best_params_)

{'learning_rate': 0.03, 'l2_leaf_reg': 0.2, 'iterations': 200, 'depth': 4}


In [49]:
cbr_tuned = CatBoostRegressor(verbose = False, learning_rate = 0.03, l2_leaf_reg = 0.2, iterations= 200, depth = 4)
cbr_model_tuned = cbr_tuned.fit(X_train, X_test)
cbr_predictions_tuned = cbr_model_tuned.predict(y_train)

In [50]:
cbr_base_mae = mean_absolute_error(y_test, cbr_predictions_base)
cbr_base_mse = mean_squared_error(y_test, cbr_predictions_base)
cbr_base_r2 = r2_score(y_test, cbr_predictions_base)

cbr_tuned_mae = mean_absolute_error(y_test, cbr_predictions_tuned)
cbr_tuned_mse = mean_squared_error(y_test, cbr_predictions_tuned)
cbr_tuned_r2 = r2_score(y_test, cbr_predictions_tuned)

In [51]:
print(cbr_base_mae)
print(cbr_base_mse)
print(cbr_base_r2)

print("*"*75)

print(cbr_tuned_mae)
print(cbr_tuned_mse)
print(cbr_tuned_r2)

1752.4135920836866
9653565.958629964
0.976628874643769
***************************************************************************
3543.7183289749323
28035586.113343667
0.9321263043834473


# RandomForestRegressor

In [53]:
rfr = RandomForestRegressor(random_state = 40)

In [54]:
from pprint import pprint

In [56]:
pprint(rfr.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 40,
 'verbose': 0,
 'warm_start': False}


In [62]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
#max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [64]:
rf_random = RandomizedSearchCV(estimator = rfr, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rfr_hp_fit = rf_random.fit(X_train, X_test)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  warn(


In [66]:
print(rfr_hp_fit.best_params_)

{'n_estimators': 1800, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 40, 'bootstrap': True}


In [68]:
rfr_tuned = RandomForestRegressor(random_state = 40, n_estimators= 1800, min_samples_split= 10, 
                                  min_samples_leaf= 1, max_features= 'auto', max_depth= 40, bootstrap= True)
rfr_model_tuned = rfr_tuned.fit(X_train, X_test)
rfr_predictions_tuned = rfr_model_tuned.predict(y_train)

  warn(


In [65]:
rfr_model_basic = rfr.fit(X_train, X_test)
rfr_predictions_base = rfr_model_basic.predict(y_train)

In [69]:
rfr_base_mae = mean_absolute_error(y_test, rfr_predictions_base)
rfr_base_mse = mean_squared_error(y_test, rfr_predictions_base)
rfr_base_r2 = r2_score(y_test, rfr_predictions_base)

rfr_tuned_mae = mean_absolute_error(y_test, rfr_predictions_tuned)
rfr_tuned_mse = mean_squared_error(y_test, rfr_predictions_tuned)
rfr_tuned_r2 = r2_score(y_test, rfr_predictions_tuned)

In [70]:
print(rfr_base_mae)
print(rfr_base_mse)
print(rfr_base_r2)

print("*"*75)

print(rfr_tuned_mae)
print(rfr_tuned_mse)
print(rfr_tuned_r2)

1508.0897803368507
11505544.622938115
0.9721452645761424
***************************************************************************
1537.782885948546
10823634.428093107
0.9737961580090677


# GradientBoostingRegressor

In [72]:
gbr = GradientBoostingRegressor(loss = 'absolute_error')

In [73]:
gbr_model_basic = gbr.fit(X_train, X_test)
gbr_predictions_base = gbr_model_basic.predict(y_train)

In [74]:
gbr_base_mae = mean_absolute_error(y_test, gbr_predictions_base)
gbr_base_mse = mean_squared_error(y_test, gbr_predictions_base)
gbr_base_r2 = r2_score(y_test, gbr_predictions_base)

In [75]:
print(gbr_base_mae)
print(gbr_base_mse)
print(gbr_base_r2)

3243.8500834663455
32321576.096129876
0.9217499927083094
