In [1]:
# Importing all the Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
import os

In [2]:
# Lets see what is in the Data
df=pd.read_csv('data/Clean_Dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [3]:
# Droping the useless column 'Unnamed: 0'
df=df.drop('Unnamed: 0',axis=1)
df=df.drop('flight',axis=1)

In [4]:
obcol=[]
for col in df.columns:
    if df[col].dtype=='object':
        print(f"{col}:{df[col].unique()}")
        obcol.append(col)


airline:['SpiceJet' 'AirAsia' 'Vistara' 'GO_FIRST' 'Indigo' 'Air_India']
source_city:['Delhi' 'Mumbai' 'Bangalore' 'Kolkata' 'Hyderabad' 'Chennai']
departure_time:['Evening' 'Early_Morning' 'Morning' 'Afternoon' 'Night' 'Late_Night']
stops:['zero' 'one' 'two_or_more']
arrival_time:['Night' 'Morning' 'Early_Morning' 'Afternoon' 'Evening' 'Late_Night']
destination_city:['Mumbai' 'Bangalore' 'Kolkata' 'Hyderabad' 'Chennai' 'Delhi']
class:['Economy' 'Business']


In [5]:
# A Quick Information about the Data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   source_city       300153 non-null  object 
 2   departure_time    300153 non-null  object 
 3   stops             300153 non-null  object 
 4   arrival_time      300153 non-null  object 
 5   destination_city  300153 non-null  object 
 6   class             300153 non-null  object 
 7   duration          300153 non-null  float64
 8   days_left         300153 non-null  int64  
 9   price             300153 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 22.9+ MB


In [6]:
# Stastical Description of Data
df.describe()

Unnamed: 0,duration,days_left,price
count,300153.0,300153.0,300153.0
mean,12.221021,26.004751,20889.660523
std,7.191997,13.561004,22697.767366
min,0.83,1.0,1105.0
25%,6.83,15.0,4783.0
50%,11.25,26.0,7425.0
75%,16.17,38.0,42521.0
max,49.83,49.0,123071.0


In [7]:
df.corr(numeric_only=True).T

Unnamed: 0,duration,days_left,price
duration,1.0,-0.039157,0.204222
days_left,-0.039157,1.0,-0.091949
price,0.204222,-0.091949,1.0


In [8]:
# Size of the data 
df.shape

(300153, 10)

# Model Building

In [9]:
# Creating a Back up File
df_bk=df.copy()

In [10]:
# Coverting the labels into a numeric form using Label Encoder
"""from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for col in df.columns:
    if df[col].dtype=='object':
        df[col]=le.fit_transform(df[col])"""

"from sklearn.preprocessing import LabelEncoder\nle=LabelEncoder()\nfor col in df.columns:\n    if df[col].dtype=='object':\n        df[col]=le.fit_transform(df[col])"

In [11]:
def encoding(df):
    mapping_airline = {'SpiceJet':0, 'AirAsia':1, 'Vistara':2, 'GO_FIRST':3, 'Indigo':4, 'Air_India':5}
    mapping_city = {'Delhi':0, 'Mumbai':1, 'Bangalore':2, 'Kolkata':3, 'Hyderabad':4, 'Chennai':5}
    mapping_time = {'Evening':0, 'Early_Morning':1, 'Morning':2, 'Afternoon':3, 'Night':4, 'Late_Night':5}
    mapping_stops = {'zero':0, 'one':1, 'two_or_more':2}
    mapping_class = {'Economy':0, 'Business':1}

    df['airline'] = df['airline'].map(mapping_airline)
    df['source_city'] = df['source_city'].map(mapping_city)
    df['destination_city'] = df['destination_city'].map(mapping_city)
    df['departure_time'] = df['departure_time'].map(mapping_time)
    df['arrival_time'] = df['arrival_time'].map(mapping_time)
    df['stops'] = df['stops'].map(mapping_stops)
    df['class'] = df['class'].map(mapping_class)

encoding(df)

In [12]:
# storing the Dependent Variables in X and Independent Variable in Y
x=df.drop(['price'],axis=1)
y=df['price']

In [13]:
# Splitting the Data into Training set and Testing Set
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=42)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((210107, 9), (90046, 9), (210107,), (90046,))

In [14]:
# Scaling the values to convert the int values to Machine Languages
from sklearn.preprocessing import MinMaxScaler
mmscaler=MinMaxScaler(feature_range=(0,1))
x_train=mmscaler.fit_transform(x_train)
x_test=mmscaler.fit_transform(x_test)
x_train=pd.DataFrame(x_train)
x_test=pd.DataFrame(x_test)  

In [15]:
# Build the Regression / Regressor models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor

# Create objects of Regression / Regressor models with default hyper-parameters

modelmlg = LinearRegression()
modeldcr = DecisionTreeRegressor()
modelKNN = KNeighborsRegressor(n_neighbors=5)
modelbag = BaggingRegressor()
modelRE=Ridge()
modelLO=linear_model.Lasso(alpha=0.1)

# Evalution matrix for all the algorithms

MM = [modelmlg, modeldcr, modelKNN, modelbag, modelRE, modelLO]

for models in MM:
    
    # Fit the model with train data
    
    models.fit(x_train, y_train)
    
    # Predict the model with test data

    y_pred = models.predict(x_test)
    
    # Print the model name
    
    print('Model Name: ', models)
    
    # Evaluation metrics for Regression analysis

    from sklearn import metrics

    print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
    print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
    print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
    print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
    print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
    
    # Define the function to calculate the MAPE - Mean Absolute Percentage Error

    def MAPE (y_test, y_pred):
        y_test, y_pred = np.array(y_test), np.array(y_pred)
        return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Evaluation of MAPE 

    result = MAPE(y_test, y_pred)
    print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
    
    # Calculate Adjusted R squared values 

    r_squared = round(metrics.r2_score(y_test, y_pred),6)
    adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
    print('Adj R Square: ', adjusted_r_squared)
    print('------------------------------------------------------------------------------------------------------------')
    #-------------------------------------------------------------------------------------------
    new_row = {'Model Name' : models,
               'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_test, y_pred),
               'Adj_R_Square' : adjusted_r_squared,
               'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
               'Mean_Absolute_Percentage_Error_MAPE' : result,
               'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_test, y_pred),
               'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
               'R2_score' : metrics.r2_score(y_test, y_pred)}
    #------------------------------------------------------------

Model Name:  LinearRegression()
Mean Absolute Error (MAE): 4588.927
Mean Squared Error (MSE): 49792119.191
Root Mean Squared Error (RMSE): 7056.353
R2_score: 0.903254
Root Mean Squared Log Error (RMSLE): 8.862
Mean Absolute Percentage Error (MAPE): 44.19 %
Adj R Square:  0.903251
------------------------------------------------------------------------------------------------------------
Model Name:  DecisionTreeRegressor()
Mean Absolute Error (MAE): 2237.771
Mean Squared Error (MSE): 26737530.118
Root Mean Squared Error (RMSE): 5170.835
R2_score: 0.948049
Root Mean Squared Log Error (RMSLE): 8.551
Mean Absolute Percentage Error (MAPE): 13.78 %
Adj R Square:  0.948047
------------------------------------------------------------------------------------------------------------
Model Name:  KNeighborsRegressor()
Mean Absolute Error (MAE): 1953.486
Mean Squared Error (MSE): 15732299.663
Root Mean Squared Error (RMSE): 3966.396
R2_score: 0.969432
Root Mean Squared Log Error (RMSLE): 8.286
Me

<p> From the Above Results, The Top 3 Models by comparing Errors , Adj_R_Square and R2_Score values are<br>
    
1. <b>Bagging Regressor</b>
2. <b>KNN Regressor</b>
3. <b>DecisionTreeRegressor</b><br>

<p>Training the Data with <b>Bagging Regressor</b></p>
</p>

In [16]:
#Trainig the model with
modelbag.fit(x_train, y_train)
    
# Predict the model with test data

y_pred = modelbag.predict(x_test)

In [17]:
out=pd.DataFrame({'Price_actual':y_test,'Price_pred':y_pred})
result=df_bk.merge(out,left_index=True,right_index=True)

In [18]:
result.sample(10)

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price,Price_actual,Price_pred
244968,Air_India,Bangalore,Early_Morning,one,Early_Morning,Delhi,Business,21.92,4,36610,36610,37904.7
78973,Air_India,Mumbai,Morning,one,Night,Chennai,Economy,13.58,14,9201,9201,9201.0
59362,Indigo,Mumbai,Late_Night,zero,Late_Night,Bangalore,Economy,1.58,38,2124,2124,2124.0
258650,Vistara,Bangalore,Early_Morning,one,Evening,Hyderabad,Business,11.92,44,56588,56588,49146.8
67223,GO_FIRST,Mumbai,Afternoon,one,Night,Kolkata,Economy,9.83,33,4961,4961,5657.8
259937,Vistara,Bangalore,Evening,one,Evening,Chennai,Business,25.5,24,44280,44280,45402.2
225273,Vistara,Mumbai,Morning,zero,Afternoon,Delhi,Business,2.17,5,34460,34460,42344.8
157125,Air_India,Hyderabad,Night,zero,Night,Delhi,Economy,2.17,39,2276,2276,2318.0
13042,AirAsia,Delhi,Afternoon,one,Night,Bangalore,Economy,5.42,17,4917,4917,4927.3
206613,Indigo,Chennai,Evening,zero,Evening,Hyderabad,Economy,1.33,49,1551,1551,1484.1


In [19]:
import pickle

# Save the model to disk
with open('model.pkl', 'wb') as f:
    pickle.dump(modelbag, f)

with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(mmscaler, f)
