# Flight Fare Prediction

### Step 1: Importing the Relevant Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

from math import sqrt
from sklearn.linear_model import LinearRegression,Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

**Reading the Datasets**

In [2]:
os.chdir(r"D:\2.Praxis( all Stuff)\3. subject wise records\3.Term 3\3.Data Engineering And Model Deployment (DEMD)\project\Flight-Fare-Prediction-Deployment")
os.getcwd()

airline = pd.read_excel('Data_Train.xlsx')
airline.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


## EDA

In [3]:
airline.dropna(axis = 0, inplace = True) # droping the na (just in case only)

**1.Droping the Duplicate Rows:**

In [4]:
airline = airline.drop_duplicates()
airline.shape

(10462, 11)

**2.Converting the Date_of_journey into date, month and year.**

In [5]:
airline.sort_values('Date_of_Journey', inplace = True)

airline['year'] = pd.DatetimeIndex(airline['Date_of_Journey']).year
airline['month'] = pd.DatetimeIndex(airline['Date_of_Journey']).month
airline['Day'] = pd.DatetimeIndex(airline['Date_of_Journey']).day

In [6]:
## 'No info' is same as 'No Info'. So replacing them with single common label.
airline['Additional_Info'].replace('No Info', 'No info', inplace = True)

## Converting the sparses
airline['Airline'].replace(['Trujet', 'Vistara Premium economy'], 'Another', inplace = True)

**3.Converting the Total_Stops into numbers and dropping the rows with NaN.**

In [7]:
airline.dropna(axis = 0, inplace = True) # droping the na (just in case only)

In [8]:
# function to convert the stops to number
def convert_into_stops(X):
    if X == '4 stops':
        return 4
    elif X == '3 stops':
        return 3
    elif X == '2 stops':
        return 2
    elif X == '1 stop':
        return 1
    elif X == 'non stop':
        return 0

In [9]:
airline['Total_Stops'] = airline['Total_Stops'].map(convert_into_stops) # calling the function 

In [10]:
# just incase after the above process done we got nan, this will solve the issue
airline.fillna(0, inplace  = True) # filling 0 in the place of nan
airline['Total_Stops'] = airline['Total_Stops'].apply(lambda x : int(x)) # solving through lambda

**4.Converting the flight Dep_Time into proper time i.e. mid_night, morning, afternoon and evening.**

In [11]:
def flight_dep_time(X):
    '''
    This function takes the flight Departure time 
    and convert into appropriate format.
    '''
    if int(X[:2]) >= 0 and int(X[:2]) < 6:
        return 'mid_night'
    elif int(X[:2]) >= 6 and int(X[:2]) < 12:
        return 'morning'
    elif int(X[:2]) >= 12 and int(X[:2]) < 18:
        return 'afternoon'
    elif int(X[:2]) >= 18 and int(X[:2]) < 24:
        return 'evening'

In [12]:
# altering the dep_time and saving it in a new name as flight_time
airline['flight_time'] = airline['Dep_Time'].apply(flight_dep_time)

**5.Converting the flight duration into seconds.**

In [13]:
def convert_into_seconds(X):
    '''
    This function takes the total time of flight from
    one city to another and converts it into the seconds.
    '''
    a = [int(s) for s in re.findall(r'-?\d+\.?\d*', X)]
    if len(a) == 2:
        hr = a[0] * 3600
        min = a[1] * 60
    else:
        hr = a[0] * 3600
        min = 0   
    total = hr + min
    return total

In [14]:
airline['Duration(sec)'] = airline['Duration'].map(convert_into_seconds) # calling the function and solving it

In [15]:
df = airline.copy()

In [16]:
# droping unwated feature for the model building
df.drop(['Date_of_Journey', 'Route', 'Dep_Time', 'Arrival_Time', 'Duration','year','Day'], axis = 1, inplace = True)

In [17]:
df

Unnamed: 0,Airline,Source,Destination,Total_Stops,Additional_Info,Price,month,flight_time,Duration(sec)
8536,Jet Airways,Banglore,New Delhi,1,No info,25735,1,afternoon,69900
10149,Air India,Banglore,New Delhi,2,Change airports,17461,1,morning,26100
5701,Air India,Banglore,New Delhi,2,No info,25430,1,morning,138900
4829,Jet Airways,Banglore,New Delhi,1,No info,27992,1,mid_night,52500
6558,IndiGo,Banglore,New Delhi,0,No info,11934,1,evening,10200
...,...,...,...,...,...,...,...,...,...
6944,Air India,Kolkata,Banglore,2,No info,11642,9,afternoon,45900
8086,Jet Airways,Kolkata,Banglore,1,No info,13401,9,evening,77700
3683,IndiGo,Delhi,Cochin,1,No info,6069,9,afternoon,18000
3693,Jet Airways,Kolkata,Banglore,1,No info,14571,9,afternoon,25500


### Step 5: Binary Conversion for the Classification Variables

In [18]:
### Label Encoding
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df['Airline'] = labelencoder.fit_transform(df['Airline'])
df['Source'] = labelencoder.fit_transform(df['Source'])
df['Destination'] = labelencoder.fit_transform(df['Destination'])
df['Additional_Info'] = labelencoder.fit_transform(df['Additional_Info'])
df['flight_time'] = labelencoder.fit_transform(df['flight_time'])

## change column name:
df.rename(columns={'Duration(sec)': 'Duration'}, inplace=True)

In [19]:
# limiting the size of the dataset
df1 = df.iloc[:8000,].copy()

In [20]:
X = df1.drop(['Price'], axis = 1)
y = (df1['Price']) # applying np.log(due to the price is right skewed), using this we adjusted the skewness

# 20% data as validation set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=22)

# model training
lr = RandomForestRegressor()
model = lr.fit(X_train,y_train)
y_pred = model.predict(X_test)
y_pred

array([ 2083.815     ,  4786.00334524,  5801.745     , ...,
        7403.45166667, 17046.98      ,  6880.81754978])

In [21]:
# checking the r2_score
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.7082900959196191

In [22]:
## predicting the result
model.predict([[5,2,1,2,5,3,1,88500]])

array([8096.79664286])

In [23]:
#Saving our model into a file
import pickle

with open("Pickle_file.pkl", 'wb') as file:
    pickle.dump(lr,file)

In [24]:
with open("Pickle_file.pkl", 'rb') as file:
    pickle_LR_Model = pickle.load(file)

In [25]:
ypredict = pickle_LR_Model.predict(X_test)

In [26]:

pickle_LR_Model.predict([[5,2,1,2,5,3,1,88500]])

# Airline ,  Source ,  Destination ,  Total_Stops , Additional_Info , month , flight_time , Duration

array([8096.79664286])