Importing The Necessary Libraries:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns


# Importing Dataset:


In [None]:
# Load the training data from the local Excel workbook and display it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
DATA_FILE = "/Users/shubh/Downloads/Flight_Ticket_Participant_Datasets/Data_Train.xlsx"
df = pd.read_excel(DATA_FILE)
df

# Exploratory Data Analysis (EDA):

In [None]:
# Column labels of the loaded frame.
df.columns

Getting Information Of Dataset:

In [None]:
# Per-column non-null counts and dtypes.
df.info()

In [None]:
# Data type of every column.
df.dtypes

Describing The Dataset to Know More :

In [None]:
# Summary statistics of the dataset.
df.describe()

Checking For Null Values In Dataset:

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# True if at least one column contains a missing value.
df.isnull().sum().any()

Dropping The NAN Values:

In [None]:
# Remove every row that has a missing value; df itself is modified.
df.dropna(inplace= True)

In [None]:
# Preview the first few rows that repeat an earlier row exactly.
df[df.duplicated()].head()

Here we remove the repeated rows from the dataset, keeping the first occurrence of each. Setting the `inplace` attribute to `True` applies the change directly to `df` instead of returning a new DataFrame.



In [None]:
# Drop repeated rows, keeping the first occurrence of each; df itself is modified.
df.drop_duplicates(keep= 'first',inplace= True)
df.head()

In [None]:
# (rows, columns) remaining after dropping nulls and duplicates.
df.shape

Checking the `Additional_Info` column and counting how often each unique value occurs.



In [None]:
# Frequency of each distinct Additional_Info value.
df['Additional_Info'].value_counts()

Checking the different Airlines :

In [None]:
# Number of rows per airline.
df['Airline'].value_counts()

In [None]:
# Distinct airline names present in the data.
df['Airline'].unique()


Checking Different Airline Routes:

In [None]:
# Distinct route values present in the data.
df["Route"].unique()

# Data Visualization:

A] visualization with Price vs Airline:

In [None]:
# Boxen plot of ticket price per airline; rows are ordered by descending price first.
df_by_price = df.sort_values("Price", ascending=False)
sns.catplot(
    data=df_by_price,
    x="Airline",
    y="Price",
    kind="boxen",
    height=8,
    aspect=3,
)
plt.show()

Here, with the help of a cat plot, we draw a boxen plot (an enhanced box plot) of the flight price for each airline, and we can conclude that Jet Airways has the most outliers in terms of price.

B] Visualization for Price vs Source:



In [None]:
# Violin plot of ticket price per source city; rows are ordered by descending price first.
df_by_price = df.sort_values("Price", ascending=False)
sns.catplot(
    data=df_by_price,
    x="Source",
    y="Price",
    kind="violin",
    height=4,
    aspect=3,
)
plt.show()

Again with the help of a cat plot, we draw a violin plot of the flight price against the source, i.e. the place from which passengers travel to the destination. We can see that Banglore as the source location has the most outliers, while Chennai has the least.

C] Visualization for Price vs Destination:

In [None]:
# Box plot of ticket price per destination city; rows are ordered by descending price first.
df_by_price = df.sort_values("Price", ascending=False)
sns.catplot(
    data=df_by_price,
    x="Destination",
    y="Price",
    kind="box",
    height=4,
    aspect=3,
)
plt.show()

Here we are plotting the box plot with the help of a cat plot between the price of the flight and the destination to which the passenger is travelling and figured out that New Delhi has the most outliers and Kolkata has the least.

# Feature Engineering:

Lets See Our Data:

In [None]:
# First 15 rows before any feature engineering.
df.head(15)

Here we extract the day and the month of the journey from `Date_of_Journey` into separate numeric columns and then drop the original column.



In [None]:
# Parse the journey date once (day/month/year text) and split it into numeric
# day and month columns; the original text column is then dropped.
journey_date = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")
df["Journey_day"] = journey_date.dt.day
df["Journey_month"] = journey_date.dt.month
df.drop(columns=["Date_of_Journey"], inplace=True)

Similarly, we can extract `Dep_hour` and `Dep_min` as well as `Arrival_hour` and `Arrival_min` from the `Dep_Time` and `Arrival_Time` variables respectively.

In [None]:
# Parse the departure time once and keep only its hour and minute as numeric
# columns; the original column is then dropped.
dep_time = pd.to_datetime(df["Dep_Time"])
df["Dep_hour"] = dep_time.dt.hour
df["Dep_min"] = dep_time.dt.minute
df.drop(columns=["Dep_Time"], inplace=True)

In [None]:
# Parse the arrival time once and keep only its hour and minute as numeric
# columns; the original column is then dropped.
arrival_time = pd.to_datetime(df["Arrival_Time"])
df["Arrival_hour"] = arrival_time.dt.hour
df["Arrival_min"] = arrival_time.dt.minute
df.drop(columns=["Arrival_Time"], inplace=True)


In [None]:
# First 15 rows, now including the engineered date/time columns.
df.head(15)

D] Visualization of Bar chart for Months (Duration) vs Number of Flights:

In [None]:
# Bar chart of the number of flights per journey month, with each bar's count written above it.
plt.figure(figsize=(22, 7))
ax = sns.countplot(x='Journey_month', data=df)
ax.set_title('Count of flights month wise')
ax.set_xlabel('Month')
ax.set_ylabel('Count of flights')
for bar in ax.patches:
    bar_height = bar.get_height()
    # Place the label slightly right of the bar's left edge and just above its top.
    label_position = (bar.get_x() + 0.25, bar_height + 1)
    ax.annotate(int(bar_height), label_position, va='bottom', color='black')

In the graph above we have plotted a count plot of the number of flights for each journey month, and we can see that May has the largest number of flights.

E] Visualization of  Bar chart for Types of Airline vs Number of Flights

In [None]:
# Bar chart of the number of flights per airline, with each bar's count written above it.
plt.figure(figsize=(22, 7))
ax = sns.countplot(x='Airline', data=df)
ax.set_title('Count of flights with different Airlines')
ax.set_xlabel('Airline')
ax.set_ylabel('Count of flights')
# Tilt the airline names so that long labels stay readable.
plt.xticks(rotation=45)
for bar in ax.patches:
    bar_height = bar.get_height()
    # Place the label slightly right of the bar's left edge and just above its top.
    label_position = (bar.get_x() + 0.25, bar_height + 1)
    ax.annotate(int(bar_height), label_position, va='bottom', color='black')


From the graph above, which shows the number of flights for each airline, we can see that Jet Airways has the most flights.

F] Visualization of Ticket Price vs Airlines:

In [None]:
# Scatter every ticket price against its airline to show the price spread per carrier.
plt.figure(figsize = (15,4))
plt.title('Price VS Airlines')
plt.scatter(df['Airline'], df['Price'])
plt.xlabel('Airline')
plt.ylabel('Price of ticket')
# Rotate the airline names so they do not overlap; the trailing ';' hides the return value.
plt.xticks(rotation = 90);

# Checking For Correlation:

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(25,12))

# Select the numeric columns explicitly: df still contains text columns at this
# point, and recent pandas versions raise in DataFrame.corr() instead of
# silently skipping them.
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(),annot=True,linewidths=0.1,linecolor='black',fmt='0.2f');

# Using the Label Encoder Technique:

Dealing with Categorical Data and Numerical Data



In [None]:
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


# Replace every non-numeric column with integer category codes.
# Columns that are already numeric (the Price target and the engineered
# day/month/hour/minute features) are left untouched so their real values
# survive. A direct `dtype == np.number` comparison is not a reliable numeric
# test (it is False for int64 columns), so pandas' dtype predicate is used.
for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    df[column] = LabelEncoder().fit_transform(df[column])
        
    


In [None]:
# Column dtypes after label encoding.
df.dtypes

In [None]:
# Display the frame after label encoding.
df

# Separating The Target Variable:

In [None]:
# Target: the ticket price. Features: every remaining column.
y = df["Price"]
x = df.drop(columns="Price")

In [None]:
# Feature matrix (all columns except Price).
x

In [None]:
# Target series (Price).
y

# Checking For Skewness:

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both arguments are converted to NumPy arrays. Each absolute error is
    divided by the corresponding true value, so the result is not finite
    when ``y_true`` contains a zero.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    relative_error = (actual - forecast) / actual
    return np.abs(relative_error).mean() * 100

In [None]:
# Skewness of every column, target included.
df.skew()

In [None]:

# Skewness of each feature, largest first. It is printed explicitly because a
# notebook only displays a cell's last expression automatically.
print(x.skew().sort_values(ascending=False))
# Distribution plot of the feature columns; the trailing ';' hides the return value.
sns.displot(x);

In [None]:
from sklearn.preprocessing import power_transform

# Apply scikit-learn's power transform to the features and plot the result.
# NOTE(review): x_new is reassigned by the later outlier-filter cell, and the
# models below are fitted on splits of x, so the transformed values are not
# used by any model in the cells visible here.
x_new = power_transform(x)
sns.displot(x_new);



Validating The Skewness Is Removed Or Not:

# Checking For Outliers:

In [None]:
# Checking for outliers.

# Outliers are identified in this cell and filtered out in a later one.

# Z-score technique:

from scipy.stats import zscore
# Absolute z-score of every feature value; z has the same shape as x.
z=np.abs(zscore(x))
z.shape




In [None]:
# Z-score cut-off above which a value is treated as an outlier.
threshold=3
# Row and column positions of every value whose |z| exceeds the threshold.
print(np.where(z>threshold))

In [None]:
# Keep only the rows in which every feature has |z| below 3, then compare the
# shapes before and after filtering.
# NOTE(review): y is not filtered with the same mask, and the later
# train/test splits use x rather than x_new, so this filtering does not
# affect the models in the cells visible here.
keep_rows = (z<3).all(axis=1)
x_new = x[keep_rows]
for frame in (x, x_new):
    print(frame.shape)

In [None]:
# Check whether every value has |z-score| below 3.
# NOTE(review): this evaluates the original feature matrix x, not the filtered
# x_new, so it repeats the earlier check rather than validating the removal.
# Confirm which frame was intended.



from scipy.stats import zscore
(np.abs(zscore(x))<3).all()

Here there is only one outlier present, in the Additional_Info column, so this outlier can be neglected.

# Splitting The Data:

In [None]:
from sklearn.linear_model import LinearRegression
# NOTE(review): these three are classification metrics and are not called in
# any of the cells visible here.
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import warnings
# Silences every warning for the rest of the session, deprecation notices included.
warnings.filterwarnings('ignore')

In [None]:
# Initial 80/20 train/test split; the following cells recreate the split with other seeds.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state = 0)

# Finding The Best Model:

A] Linear Regression:

In [None]:
# Refit a plain linear regression on 100 different 80/20 splits and report the
# R^2 score on both the training and the test partition for each seed.
# NOTE(review): choosing a random_state by its test score uses the test set for
# model selection, so the score reported for that seed is optimistic.
for i in range(0,100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = i)
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    pred_test = lr.predict(x_test)
    pred_train = lr.predict(x_train)

    print(f"At random state {i}, the training accuracy is:- {r2_score(y_train, pred_train)}")
    # This line reports the held-out score, so it is labelled as testing.
    print(f"At random state {i}, the testing accuracy is:- {r2_score(y_test, pred_test)}")
    print("\n")

In [None]:
# Recreate the 80/20 split with the chosen seed (33); all later models use this split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 33)

In [None]:
# lr is the LinearRegression instance left over from the last iteration of the
# random-state loop; refit it on the random_state=33 training split.
lr.fit(x_train, y_train)

In [None]:
# Linear-regression predictions for the held-out rows.
pred_test = lr.predict(x_test)

In [None]:
# R^2 of the linear regression on the held-out rows.
print(r2_score(y_test, pred_test))

Here the r2_score of our model is 93.33%, which is a good one.

# B] Lasso Regression:

In [None]:
# Tune the Lasso regularisation strength (alpha) with a 15-fold grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.metrics import mean_squared_error as mse


# Candidate alphas are the powers of ten from 1e-4 to 1e5; candidates are
# ranked by negative mean absolute error and evaluated on all available cores.
params = {'alpha' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
lasso_regressor = GridSearchCV(Lasso(), params ,cv = 15,scoring = 'neg_mean_absolute_error', n_jobs = -1)
lasso_regressor.fit(x_train, y_train)

In [None]:
# Predictions of the fitted Lasso grid search on the training and held-out rows.
y_train_pred = lasso_regressor.predict(x_train)
y_test_pred = lasso_regressor.predict(x_test)

In [None]:
# Training-set RMSE and R^2 for the Lasso model.
print("Train Results for Lasso Regressor Model:")
print("Root Mean Squared Error: ", sqrt(mse(y_train.values, y_train_pred)))
# NOTE(review): the training MAPE below is disabled; the reason is not recorded here.
# print("Mean Absolute % Error: ", round(mean_absolute_percentage_error(y_train.values, y_train_pred)))
print("R-Squared: ", r2_score(y_train.values, y_train_pred))

In [None]:
# Held-out RMSE, MAPE (rounded to a whole percent) and R^2 for the Lasso model.
print("Test Results for Lasso Regressor Model:")
print("Root Mean squared Error: ", sqrt(mse(y_test, y_test_pred)))
print("Mean Absolute % Error: ", round(mean_absolute_percentage_error(y_test, y_test_pred)))
print("R-Squared: ", r2_score(y_test, y_test_pred))

# Decision Tree Regression:

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV



In [None]:
# Tune the maximum depth of a decision-tree regressor with a 10-fold grid search.
# Candidate depths are 3 through 29.
depth = list(range(3,30))
param_grid = dict(max_depth = depth)
# A fixed random_state makes the fitted trees, and therefore the selected depth
# and the reported scores, reproducible from run to run.
tree = GridSearchCV(DecisionTreeRegressor(random_state = 0), param_grid, cv = 10)
tree.fit(x_train,y_train)

In [None]:
# Predictions of the fitted decision-tree grid search on the training and held-out rows.
# These reuse the variable names that previously held the Lasso predictions.
y_train_pred = tree.predict(x_train)
y_test_pred = tree.predict(x_test)

In [None]:
# Training-set RMSE and R^2 for the decision-tree model.
print("Train Results for Decision Tree Regressor Model:")
print("Root Mean squared Error: ", sqrt(mse(y_train.values, y_train_pred)))
print("R-Squared: ", r2_score(y_train.values, y_train_pred))


In [None]:
# Held-out RMSE, MAPE (rounded to a whole percent) and R^2 for the decision-tree model.
print("Test Results for Decision Tree Regressor Model:")
print("Root Mean Squared Error: ", sqrt(mse(y_test, y_test_pred)))
print("Mean Absolute % Error: ", round(mean_absolute_percentage_error(y_test, y_test_pred)))
print("R-Squared: ", r2_score(y_test, y_test_pred))

From Above Three Models We Find The Best Accuracy in the DECISION TREE REGRESSOR.


# Model Saving:

In [None]:
# Persist the selected model to disk.

import pickle
filename = 'price.pkl'
# Save the tuned decision tree (the estimator refitted by the grid search),
# since it is the model the notebook reports as best, rather than the
# linear-regression baseline `lr`. The context manager guarantees the file
# handle is flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(tree.best_estimator_, model_file)

# Conclusion:

In [None]:
# Side-by-side table of the actual prices and the decision-tree predictions
# for the held-out rows, indexed 0..n-1.

import numpy as np
a = np.array(y_test)
predicated = np.array(tree.predict(x_test))
comparison_columns = {'original': a, "predicted": predicated}
df_com = pd.DataFrame(comparison_columns, index=range(len(a)))
df_com