# Flight Price Prediction


In [None]:
#Importing necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training data from the Excel workbook into a DataFrame.
df = pd.read_excel('Data_Train.xlsx')
# Bare expression so the notebook renders the loaded frame.
df

In [None]:
# Preview the first five rows of the dataset.
df.head()

In [None]:
# Exploratory Data Analysis (EDA)
# Dimensions of the dataset as (rows, columns).
df.shape

In [None]:
# Print a concise overview of the DataFrame.
df.info()

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Convert the date/time columns from object (string) dtype to datetime.
# Date_of_Journey is day-first (dd/mm/YYYY, the layout named by the `format=`
# argument in the next cell), so the format is stated here, where the strings
# are actually parsed. Without it pandas reads ambiguous values month-first
# (e.g. "1/03/2019" becomes 3 January) and Journey_Day / Journey_Month come
# out swapped for those rows.
df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'], format="%d/%m/%Y")
# Dep_Time and Arrival_Time are parsed so later cells can read .dt.hour / .dt.minute.
df['Dep_Time'] = pd.to_datetime(df['Dep_Time'])
df['Arrival_Time'] = pd.to_datetime(df['Arrival_Time'])


In [None]:
# Re-inspect the column dtypes after the datetime conversion.
df.dtypes

In [None]:
# Derive the day-of-month and month number of the journey from Date_of_Journey.
# The datetime view is built once and both calendar parts are read from it.
_journey_date = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")
df["Journey_Day"] = _journey_date.dt.day
df["Journey_Month"] = _journey_date.dt.month

In [None]:
# Date_of_Journey has been split into Journey_Day / Journey_Month; remove it in place.
df.drop(columns=["Date_of_Journey"], inplace=True)

In [None]:
# Duration
# Convert strings such as "2h 50m", "19h" or "5m" into total minutes (integer).
# The hour and minute parts are pulled out with regular expressions instead of
# rewriting the text into an arithmetic expression and passing it to eval():
# eval executes whatever the cell contains, so it must not be fed file data.
# The dtype guard makes the cell safe to re-run once the column is numeric.
if not pd.api.types.is_numeric_dtype(df['Duration']):
    # A missing part (no "h" or no "m" in the string) counts as 0.
    _hours = df['Duration'].str.extract(r'(\d+)\s*h', expand=False).fillna(0).astype(int)
    _minutes = df['Duration'].str.extract(r'(\d+)\s*m', expand=False).fillna(0).astype(int)
    df['Duration'] = _hours * 60 + _minutes

In [None]:
# Split Dep_Time into its hour and minute components.
_dep_time = pd.to_datetime(df['Dep_Time'])
df['Dep_Hour'] = _dep_time.dt.hour
df['Dep_Min'] = _dep_time.dt.minute

In [None]:
# Dep_Time is now represented by Dep_Hour / Dep_Min; remove it in place.
df.drop(columns=["Dep_Time"], inplace=True)

In [None]:
# Split Arrival_Time into its hour and minute components ...
_arrival_time = pd.to_datetime(df['Arrival_Time'])
df['Arrival_Hour'] = _arrival_time.dt.hour
df['Arrival_Min'] = _arrival_time.dt.minute

# ... then remove the original Arrival_Time column in place.
df.drop(columns=["Arrival_Time"], inplace=True)

In [None]:
# Preview the first five rows after the date/time feature extraction.
df.head()

In [None]:
# Dimensions of the DataFrame after adding and dropping columns.
df.shape

In [None]:
# Print the frequency table of every column, with blank lines between tables.
for column_name in df.columns:
    print(df[column_name].value_counts(), end="\n\n\n")

In [None]:
# Fold fare-class variants of an airline into the base airline name.
airline_aliases = {
    "Jet Airways Business": "Jet Airways",
    "Multiple carriers Premium economy": "Multiple carriers",
    "Vistara Premium economy": "Vistara",
}
df["Airline"] = df["Airline"].replace(airline_aliases)

# "New Delhi" and "Delhi" are treated as the same destination.
df["Destination"] = df["Destination"].replace({"New Delhi": "Delhi"})

# Unify the two spellings of "No Info" and merge both long-layover labels.
additional_info_aliases = {
    "No info": "No Info",
    "1 Long layover": "Long layover",
    "2 Long layover": "Long layover",
}
df["Additional_Info"] = df["Additional_Info"].replace(additional_info_aliases)

In [None]:
# Re-print the frequency tables of the three columns whose labels were merged.
val_count = ["Airline","Destination","Additional_Info"]

for column_name in val_count:
    print(df[column_name].value_counts(), end="\n\n\n")

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Count of null values per column.
df.isnull().sum()

In [None]:
# Handling null values with imputation.
# Print the mode (most frequent value) of the categorical columns "Route" and
# "Total_Stops"; the next cell uses these modes as the fill values.
print("The mode of Route is:",df["Route"].mode())
print("The mode of Total_Stops is:",df["Total_Stops"].mode())

In [None]:
# Replace missing values in "Route" and "Total_Stops" with each column's mode
# (the first entry when several values tie).
for column_name in ("Route", "Total_Stops"):
    df[column_name] = df[column_name].fillna(df[column_name].mode()[0])

In [None]:
# Re-count null values per column after the imputation.
df.isnull().sum()

In [None]:
# Visual check for missing values: heatmap of the boolean null mask.
sns.heatmap(df.isnull())

In [None]:
# Column names of the dataset as a plain Python list.
df.columns.tolist()

In [None]:
# Distinct values present in the target column "Price".
df["Price"].unique()

In [None]:
 Checking the list of counts in the target column.
df["Price"].value_counts()

In [None]:
# Select the rows whose Price equals the single-space string " ".
df.loc[df['Price']==" "]

Description of Dataset

In [None]:
# Statistical summary of the dataset.
df.describe()

Description Visualization

In [None]:
# Heatmap of the summary statistics: every describe() row except the first,
# transposed so each variable is a row, rounded to two decimals.
summary_stats = df.describe().iloc[1:].T.round(2)

plt.figure(figsize=(10,12))
sns.heatmap(summary_stats, linewidth = 2, annot = True, fmt = '.2f')
plt.xticks(fontsize=18)
plt.yticks(fontsize=12)
plt.title("Variable Summary")
plt.show()

Let's separate the numerical and categorical columns.

In [None]:
# Names of all columns whose dtype is 'object' (the categorical columns).
categorical_col = [name for name in df.dtypes.index if df.dtypes[name]=='object']
print(categorical_col)

In [None]:
# Names of all columns whose dtype is anything other than 'object'.
numerical_col = [name for name in df.dtypes.index if df.dtypes[name]!='object']
print(numerical_col)

Data Visualization


Univariate Analysis


Plotting Categorical columns

In [None]:
# Count of flights per Airline.
plt.figure(figsize=(8,5))
# The column is named by keyword: in seaborn 0.12+ the only positional
# parameter is `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x="Airline", data=df)
plt.title("Count of flights in different Airlines")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Count of flights per Source (the place where the service begins).
plt.figure(figsize=(8,5))
# The column is named by keyword: in seaborn 0.12+ the only positional
# parameter is `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x="Source", data=df)
plt.title("Count of different Airline Sources")
plt.xticks(rotation=90)
plt.show()


In [None]:
# Count of flights per Destination (the place where the service ends).
plt.figure(figsize=(8,5))
# The column is named by keyword: in seaborn 0.12+ the only positional
# parameter is `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x="Destination", data=df)
plt.title("Count of destination")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Count of flights per number of stops between the source and the destination.
plt.figure(figsize=(8,5))
# The column is named by keyword: in seaborn 0.12+ the only positional
# parameter is `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x="Total_Stops", data=df)
plt.title("Count of Total Stops between the source and the destination")
plt.show()

In [None]:
# Count of flights per Additional_Info label.
plt.figure(figsize=(8,5))
# The column is named by keyword: in seaborn 0.12+ the only positional
# parameter is `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x="Additional_Info", data=df)
plt.title("Count of Additional Information about the flight")
plt.xticks(rotation=75)
plt.show()

Distribution of skewness

Plotting numerical columns

In [None]:
# Distribution plot of each numerical column on a 3x3 grid.
# Only the first eight columns are drawn.
plt.figure(figsize=(15,15))
for plotnumber, col in enumerate(numerical_col[:8], start=1):
    ax = plt.subplot(3,3,plotnumber)
    sns.distplot(df[col],color='indigo')
    plt.xlabel(col,fontsize=20)
plt.tight_layout()
        


Bivariate Analysis

In [None]:
# Price of tickets per Airline, to see which airline is expensive.
plt.figure(figsize=(15,5))
# x / y are given by keyword and resolved against `data`: seaborn 0.12+ makes
# them keyword-only, so the old positional form raises a TypeError.
sns.barplot(x="Airline", y="Price", data=df)
plt.title("Checking which Airline is expensive based on ticket price")
plt.show()

In [None]:
# Price of tickets per Destination.

plt.figure(figsize=(8,6))
# x / y are given by keyword and resolved against `data`: seaborn 0.12+ makes
# them keyword-only, so the old positional form raises a TypeError.
sns.barplot(x="Destination", y="Price", data=df, palette="husl")
plt.title("Checking deastination based on ticket price")
plt.show()

In [None]:
# Price of tickets per number of stops.

plt.figure(figsize=(8,6))
# x / y are given by keyword and resolved against `data`: seaborn 0.12+ makes
# them keyword-only, so the old positional form raises a TypeError.
sns.barplot(x="Total_Stops", y="Price", data=df, palette="ch:.28")
plt.title("Checking how price changes based on Stops")
plt.show()

In [None]:
# Price of tickets per Additional_Info label.

plt.figure(figsize=(8,6))
# x / y are given by keyword and resolved against `data`: seaborn 0.12+ makes
# them keyword-only, so the old positional form raises a TypeError.
sns.barplot(x="Additional_Info", y="Price", data=df)
plt.title("Checking how the prices changes with Additional_Info")
plt.xticks(rotation=90)
plt.show()

In [None]:
# 2x2 grid relating Price to four of the extracted date/time features.
fig, axes = plt.subplots(2,2,figsize=(18,15))

# Top-left: Price by Journey_Day (boxen plot).
sns.boxenplot(data=df, x='Journey_Day', y='Price', ax=axes[0,0])

# Top-right: Price by Journey_Month (bar plot).
sns.barplot(data=df, x='Journey_Month', y='Price', color='c', ax=axes[0,1])

# Bottom-left: Price by Dep_Min (box plot).
sns.boxplot(data=df, x='Dep_Min', y='Price', ax=axes[1,0])

# Bottom-right: Price by Dep_Hour (bar plot).
sns.barplot(data=df, x='Dep_Hour', y='Price', palette="bright", ax=axes[1,1])

In [None]:
# Price per Airline, with points coloured by Destination.
plt.figure(figsize=(8,6))
sns.stripplot(data=df, x='Airline', y='Price', hue="Destination")
plt.title("Comparing Airline and Price on the basis of Destination")
plt.xticks(rotation=75)
plt.show()

In [None]:
# Price against the hour of arrival.
plt.figure(figsize=(8,6))
sns.stripplot(data=df, x='Arrival_Hour', y='Price', palette="ch:.28")
plt.title("Checking relation between Arrival_Hour and Price")
plt.xticks(rotation=75)
plt.show()

In [None]:
# Price per Source, with points coloured by Total_Stops.
plt.figure(figsize=(8,6))
sns.stripplot(data=df, x='Source', y='Price', hue="Total_Stops")
plt.title("Comparing Source and Price on the basis of Total_Stops")
plt.xticks(rotation=75)
plt.show()

In [None]:
# Pairwise relationships between the numeric columns.
# `hue="Price"` was removed: Price is the continuous target, so using it as a
# hue creates one hue level per distinct price and takes Price out of the
# plotted variables. Without a hue, Price appears as a row/column of the grid.
sns.pairplot(df)
plt.show()

Outliers

In [None]:
# Identify outliers with one box plot per numerical column (3x3 grid,
# first eight columns only).

plt.figure(figsize=(15,15),facecolor="white")
plotnumber=1
for col in numerical_col:
    if plotnumber<=8:
        ax = plt.subplot(3,3,plotnumber)
        # Passed as x= by keyword: in seaborn 0.12+ the first positional
        # parameter is `data`, not the x variable.
        sns.boxplot(x=df[col],color="darkorange")
        plt.xlabel(col,fontsize=15)
    plotnumber+=1
plt.tight_layout()

In [None]:
Removing Outliers


Zscore method

In [None]:
from scipy.stats import zscore
# The two columns selected for outlier treatment.
features = df[["Duration","Journey_Month"]]
# Absolute z-score of every value in those columns.
z = np.abs(zscore(features))
z

In [None]:
# Keep only the rows whose |z-score| is below 3 in every selected feature.
# .copy() makes new_df an independent frame: later cells assign columns on it,
# and without the copy those assignments target a slice of df.
new_df = df[(z<3).all(axis=1)].copy()
new_df

In [None]:
# Print the dimensions before (df) and after (new_df) outlier removal.
for frame in (df, new_df):
    print(frame.shape)

In [None]:
# Percentage of rows removed by the z-score filter.
# Computed from the frames themselves rather than hard-coded row counts, so
# the figure stays correct if the data or the threshold changes.
loss = (len(df) - len(new_df)) / len(df) * 100
loss

IQR (Interquartile Range)

In [None]:
# 1st quartile of the selected features
Q1 = features.quantile(0.25)

# 3rd quartile of the selected features
Q3 = features.quantile(0.75)

IQR = Q3-Q1

# Flag a row when any selected feature lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# The comparison runs on `features` (the columns the fences were computed for),
# not on the whole of df: df also holds string columns, and comparing those
# against numeric fences is meaningless and can raise a TypeError.
_outside_fences = ((features < (Q1 - 1.5*IQR)) | (features > (Q3 + 1.5*IQR))).any(axis=1)
df1 = df[~_outside_fences]

In [None]:
# Dimensions of the frame left after the IQR filter.
df1.shape

In [None]:
# Percentage of rows removed by the IQR filter.
# Computed from the frames themselves rather than hard-coded row counts.
loss = (len(df) - len(df1)) / len(df) * 100
loss

Checking Skewness

In [None]:
# Skewness of the numeric columns.
# numeric_only=True is explicit because new_df still contains object columns
# here; pandas 2.0+ no longer drops them silently and raises instead.
new_df.skew(numeric_only=True)

In [None]:
# Apply log(1 + x) to the two columns selected for skewness treatment.
for column_name in ("Duration", "Journey_Month"):
    new_df[column_name] = np.log1p(new_df[column_name])

In [None]:
# Skewness of the numeric columns after the log1p transform.
# numeric_only=True is explicit because new_df still contains object columns
# here; pandas 2.0+ no longer drops them silently and raises instead.
new_df.skew(numeric_only=True)

In [None]:
# Distribution of the two transformed columns after the skewness treatment.
skew = ["Duration","Journey_Month"]
plt.figure(figsize=(15,15),facecolor="white")
plotnumber=1
for column in new_df[skew]:
    if plotnumber<=2:
        ax = plt.subplot(2,2,plotnumber)
        # Plot new_df, the frame that was transformed. The original plotted
        # df[column], i.e. the untransformed data, so the figure did not show
        # the effect it was meant to check.
        sns.distplot(new_df[column],color='g',kde_kws={"shade": True},hist=False)
        plt.xlabel(column,fontsize=20)
    plotnumber+=1
plt.show()


Label Encoding Categorical data

In [None]:
# Columns that will be label-encoded in the next cell.
categorical_col

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical column with integer codes. The same encoder object
# is re-fitted for every column in turn.
lbl = LabelEncoder()
for column_name in categorical_col:
    new_df[column_name] = lbl.fit_transform(new_df[column_name])
new_df

Correlation

In [None]:
# Pairwise correlation matrix of all columns (features and the Price label).
cor = new_df.corr()
cor

In [None]:
# Visualize the correlation matrix as an annotated heatmap.
plt.figure(figsize=(20,15))
sns.heatmap(new_df.corr(),linewidths=.1,fmt=".1g",linecolor="black",annot=True,cmap="YlGnBu")

In [None]:
# Correlation of every column with Price, highest first.
cor["Price"].sort_values(ascending=False)

Visualizing the correlation between features and label using bar plot.

In [None]:
# Bar chart of each feature's correlation with Price, highest first.
# Price's own entry is dropped.
plt.figure(figsize=(22,7))
price_correlation = new_df.corr()["Price"].sort_values(ascending=False)
price_correlation.drop(["Price"]).plot(kind="bar",color="purple")
plt.xlabel('Features',fontsize=20)
plt.ylabel('Target',fontsize=20)
plt.title('Correlation between label and features using barplot')
plt.show()

Separating the features and label.

In [None]:
# Label: the Price column. Features: every other column.
y = new_df["Price"]
x = new_df.drop(columns="Price")

In [None]:
# Dimensions of the feature matrix.
x.shape

In [None]:
# Length of the label vector.
y.shape

Feature Scaling using Standardization (StandardScaler)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize every feature column.
scaler = StandardScaler()
# index=x.index keeps the scaled frame on the same row labels as y. The
# original rebuilt the frame with a fresh 0..n-1 index, while y keeps the
# (gapped) index of new_df, so any label-aligned operation between x and y
# would have mismatched rows.
# NOTE(review): the scaler is fitted on the full data before the train/test
# split; consider fitting it on the training split only.
x = pd.DataFrame(scaler.fit_transform(x),columns=x.columns,index=x.index)
x


Checking VIF

In [None]:
# Variance inflation factor of every feature column.
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame({
    "VIF Values": [variance_inflation_factor(x.values,position) for position in range(x.shape[1])],
    "Features": x.columns,
})
vif

Modeling

Finding the best random state

In [None]:
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Try train/test split seeds 1..199 and remember the one whose random forest
# gives the highest R2 on its test split.
# NOTE(review): the forest itself is created without a random_state, so the
# scores (and possibly the winning seed) vary between runs.
maxAccu = 0
maxRS = 0
for i in range(1,200):
    x_train,x_test,y_train,y_test = TTS(x,y,test_size=0.30,random_state=i)
    mod = RandomForestRegressor()
    mod.fit(x_train,y_train)
    pred = mod.predict(x_test)
    acc = r2_score(y_test,pred)
    if acc>maxAccu:
        maxAccu = acc
        # Was `maxRs = i` (different capitalisation): the best seed went into
        # an unused name, so maxRS stayed 0 in the report below and in every
        # later use.
        maxRS = i
print("Maximum r2_score is ",maxAccu,"at random_state",maxRS)

Creating new train_test_split

In [None]:
# 70/30 train/test split, seeded with the maxRS value held by the notebook.
x_train,x_test,y_train,y_test = TTS(x,y,test_size=0.30,random_state=maxRS)

Regression Algorithms

In [None]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn import metrics

Random Forest Regressor

In [None]:
# Fit a RandomForestRegressor on the training split and report R2, MAE, MSE
# and RMSE on the test split.
RFR = RandomForestRegressor()
RFR.fit(x_train,y_train)
predRFR = RFR.predict(x_test)
# The MSE is computed once and reused for the RMSE.
mse_RFR = metrics.mean_squared_error(y_test,predRFR)
print("R2_score:",r2_score(y_test,predRFR))
print("MAE:",metrics.mean_absolute_error(y_test,predRFR))
print("MSE:",mse_RFR)
# Label corrected from "RSME" to "RMSE" (root mean squared error).
print("RMSE:",np.sqrt(mse_RFR))

In [None]:
# Actual test prices (x) against the random-forest predictions (y).
# x / y are given by keyword: seaborn 0.12+ makes them keyword-only, so the
# old positional form raises a TypeError.
sns.regplot(x=y_test,y=predRFR,color='darkorange')
plt.show()

Decision Tree Regressor

In [None]:
# Fit a DecisionTreeRegressor on the training split and report R2, MAE, MSE
# and RMSE on the test split.
DTR = DecisionTreeRegressor()
DTR.fit(x_train,y_train)
predDTR = DTR.predict(x_test)
# The MSE is computed once and reused for the RMSE.
mse_DTR = metrics.mean_squared_error(y_test,predDTR)
print("R2_score:",r2_score(y_test,predDTR))
print("MAE:",metrics.mean_absolute_error(y_test,predDTR))
print("MSE:",mse_DTR)
# Label corrected from "RSME" to "RMSE" (root mean squared error).
print("RMSE:",np.sqrt(mse_DTR))

In [None]:
# Actual test prices (x) against the decision-tree predictions (y).
# x / y are given by keyword: seaborn 0.12+ makes them keyword-only, so the
# old positional form raises a TypeError.
sns.regplot(x=y_test,y=predDTR,color='darkorange')
plt.show()