In [1]:
import numpy as np
import pandas as pd

In [2]:
import datetime
from scipy.stats import zscore
from sklearn.impute import SimpleImputer
import math

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from time import time

In [4]:
import statsmodels.formula.api as smf
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [14]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
from warnings import filterwarnings
filterwarnings('ignore')

In [8]:
df = pd.read_csv("clean_train_data.csv")
df.head(10)

Unnamed: 0,airline,source,destination,route,duration,total_stops,add_info,price,day,month,day_part
0,3,0,2,18,170,4,8,3897,24,3,0
1,1,3,0,84,445,1,8,7662,5,1,1
2,4,2,1,118,1140,1,8,13882,6,9,1
3,3,3,0,91,325,0,8,6218,5,12,3
4,3,0,2,29,285,0,8,13302,3,1,2
5,8,3,0,64,145,4,8,3873,24,6,1
6,4,0,2,5,930,0,5,11087,3,12,3
7,4,0,2,5,1265,0,8,22270,3,1,1
8,4,0,2,5,1530,0,5,11087,3,12,1
9,6,2,1,104,470,0,8,8625,27,5,1


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10463 entries, 0 to 10462
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   airline      10463 non-null  int64
 1   source       10463 non-null  int64
 2   destination  10463 non-null  int64
 3   route        10463 non-null  int64
 4   duration     10463 non-null  int64
 5   total_stops  10463 non-null  int64
 6   add_info     10463 non-null  int64
 7   price        10463 non-null  int64
 8   day          10463 non-null  int64
 9   month        10463 non-null  int64
 10  day_part     10463 non-null  int64
dtypes: int64(11)
memory usage: 899.3 KB


### Modelling Prerequisites

In [10]:
# dependent and independent variables

x = df.drop("price", axis=1)
y = df["price"]

In [11]:
# train and test splits
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# checking the train and test record counts
print("No.of.records in train data: {}".format(x_train.shape[0]))
print("No.of.records in test data: {}".format(x_test.shape[0]))

No.of.records in train data: 7324
No.of.records in test data: 3139


In [12]:
# let's scale the data for ANN Regressor

x_train_sc = x_train.apply(zscore)
x_test_sc = x_test.apply(zscore)

In [13]:
# constructing different models

annm = MLPRegressor(hidden_layer_sizes=(500), max_iter=5000, random_state=123)
rfrm = RandomForestRegressor(random_state=123)
dtrm = DecisionTreeRegressor(random_state=123)
lirm = LinearRegression()

In [29]:
# decision tree regressor-------------------------------------(raw model)

model = DecisionTreeRegressor(random_state=1).fit(x_train,y_train)

# evaluation metrics-----------------------------------------------------

train_score = model.score(x_train,y_train)
test_score  = model.score(x_test,y_test)

train_predictions = model.predict(x_train)
test_predictions  = model.predict(x_test)

train_rmse = np.sqrt(mse(y_train,train_predictions))
test_rmse  = np.sqrt(mse(y_test,test_predictions))

train_mae = mae(y_train,train_predictions)
test_mae  = mae(y_test,test_predictions)

print("Training Data------------------------")
print("")
print("Model Score: {}".format(round(train_score,2)))
print("RMSE Value : {}".format(round(train_rmse,2)))
print("MAPE Value : {}".format(round(train_mae,2)))
print("\n")
print("Testing Data-------------------------")
print("")
print("Model Score: {}".format(round(test_score,2)))
print("RMSE Value : {}".format(round(test_rmse,2)))
print("MAPE Value : {}".format(round(test_mae,2)))

Training Data------------------------

Model Score: 0.99
RMSE Value : 360.98
MAPE Value : 71.47


Testing Data-------------------------

Model Score: 0.78
RMSE Value : 2081.09
MAPE Value : 831.54


In [34]:
# random forest regressor-------------------------------------(raw model)

model = RandomForestRegressor(random_state=1).fit(x_train,y_train)

# evaluation metrics-----------------------------------------------------

train_score = model.score(x_train,y_train)
test_score  = model.score(x_test,y_test)

train_predictions = model.predict(x_train)
test_predictions  = model.predict(x_test)

train_rmse = np.sqrt(mse(y_train,train_predictions))
test_rmse  = np.sqrt(mse(y_test,test_predictions))

train_mae = mae(y_train,train_predictions)
test_mae  = mae(y_test,test_predictions)

print("Training Data------------------------")
print("")
print("Model Score: {}".format(round(train_score,2)))
print("RMSE Value : {}".format(round(train_rmse,2)))
print("MAPE Value : {}".format(round(train_mae,2)))
print("\n")
print("Testing Data-------------------------")
print("")
print("Model Score: {}".format(round(test_score,2)))
print("RMSE Value : {}".format(round(test_rmse,2)))
print("MAPE Value : {}".format(round(test_mae,2)))

Training Data------------------------

Model Score: 0.98
RMSE Value : 721.51
MAPE Value : 307.37


Testing Data-------------------------

Model Score: 0.86
RMSE Value : 1700.7
MAPE Value : 748.57


In [37]:
# linear regressor--------------------------------------------(raw model)

model = LinearRegression().fit(x_train,y_train)

# evaluation metrics-----------------------------------------------------

train_score = model.score(x_train,y_train)
test_score  = model.score(x_test,y_test)

train_predictions = model.predict(x_train)
test_predictions  = model.predict(x_test)

train_rmse = np.sqrt(mse(y_train,train_predictions))
test_rmse  = np.sqrt(mse(y_test,test_predictions))

train_mae = mae(y_train,train_predictions)
test_mae  = mae(y_test,test_predictions)

print("Training Data------------------------")
print("")
print("Model Score: {}".format(round(train_score,2)))
print("RMSE Value : {}".format(round(train_rmse,2)))
print("MAPE Value : {}".format(round(train_mae,2)))
print("\n")
print("Testing Data-------------------------")
print("")
print("Model Score: {}".format(round(test_score,2)))
print("RMSE Value : {}".format(round(test_rmse,2)))
print("MAPE Value : {}".format(round(test_mae,2)))

Training Data------------------------

Model Score: 0.4
RMSE Value : 3641.48
MAPE Value : 2546.54


Testing Data-------------------------

Model Score: 0.44
RMSE Value : 3350.73
MAPE Value : 2461.23


In [38]:
# ann regressor-----------------------------------------------(raw model)

model = MLPRegressor(hidden_layer_sizes=(500), random_state=1).fit(x_train_sc,y_train)

# evaluation metrics-----------------------------------------------------

train_score = model.score(x_train_sc,y_train)
test_score  = model.score(x_test_sc,y_test)

train_predictions = model.predict(x_train_sc)
test_predictions  = model.predict(x_test_sc)

train_rmse = np.sqrt(mse(y_train,train_predictions))
test_rmse  = np.sqrt(mse(y_test,test_predictions))

train_mae = mae(y_train,train_predictions)
test_mae  = mae(y_test,test_predictions)

print("Training Data------------------------")
print("")
print("Model Score: {}".format(round(train_score,2)))
print("RMSE Value : {}".format(round(train_rmse,2)))
print("MAPE Value : {}".format(round(train_mae,2)))
print("\n")
print("Testing Data-------------------------")
print("")
print("Model Score: {}".format(round(test_score,2)))
print("RMSE Value : {}".format(round(test_rmse,2)))
print("MAPE Value : {}".format(round(test_mae,2)))

Training Data------------------------

Model Score: 0.41
RMSE Value : 3586.45
MAPE Value : 2550.03


Testing Data-------------------------

Model Score: 0.45
RMSE Value : 3317.62
MAPE Value : 2469.23


In [45]:
# tweaking random forest model

model = model = RandomForestRegressor(random_state=1)
parameters = {"max_depth":[10,15], "max_features":[5,6], "n_estimators":[100,200]}

grid = GridSearchCV(estimator=model, param_grid=parameters, cv=5).fit(x_train,y_train)

In [46]:
grid.best_params_

{'max_depth': 15, 'max_features': 5, 'n_estimators': 100}

In [47]:
# random forest regressor----------------------------------(tweaked model)

model = grid.best_estimator_.fit(x_train,y_train)

# evaluation metrics-----------------------------------------------------

train_score = model.score(x_train,y_train)
test_score  = model.score(x_test,y_test)

train_predictions = model.predict(x_train)
test_predictions  = model.predict(x_test)

train_rmse = np.sqrt(mse(y_train,train_predictions))
test_rmse  = np.sqrt(mse(y_test,test_predictions))

train_mae = mae(y_train,train_predictions)
test_mae  = mae(y_test,test_predictions)

print("Training Data------------------------")
print("")
print("Model Score: {}".format(round(train_score,2)))
print("RMSE Value : {}".format(round(train_rmse,2)))
print("MAPE Value : {}".format(round(train_mae,2)))
print("\n")
print("Testing Data-------------------------")
print("")
print("Model Score: {}".format(round(test_score,2)))
print("RMSE Value : {}".format(round(test_rmse,2)))
print("MAPE Value : {}".format(round(test_mae,2)))

Training Data------------------------

Model Score: 0.96
RMSE Value : 909.82
MAPE Value : 482.54


Testing Data-------------------------

Model Score: 0.88
RMSE Value : 1538.68
MAPE Value : 798.55
