# EDA

### import required packages

In [1]:

import pandas as pd # for data wrangling purpose
import numpy as np # Basic computation library
import seaborn as sns # For Visualization 
import matplotlib.pyplot as plt # ploting package
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,LabelEncoder

%matplotlib inline
import warnings # Filtering warnings
warnings.filterwarnings('ignore')

### load the data

In [2]:
# Load the flight-fare dataset from a CSV in the working directory and preview the first rows.
df = pd.read_csv('updated_output10.csv')
df.head()

In [3]:
# Preview the last rows of the loaded frame.
df.tail()

In [4]:
# Report the dataset dimensions (rows, then columns).
row_count, column_count = df.shape
print('No of Rows:',row_count)
print('No of Columns:',column_count)

In [5]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

## Handling NA values

In [6]:
# Count the missing (NA) values in each column.
df.isna().sum()

In [7]:
# isnull() is an alias of isna(), so this repeats the per-column NA count from the previous cell.
df.isnull().sum()

In [8]:
# Drop exact duplicate rows in place. The surviving rows keep their original
# index labels (the index is not reset), so the index may have gaps afterwards.
df.drop_duplicates(inplace=True)

In [9]:
# Re-check the per-column NA counts after removing duplicates.
df.isna().sum()

In [10]:
# Drop, in place, every row that has at least one missing value.
df.dropna(inplace=True)

In [11]:
# Confirm that no missing values remain.
df.isna().sum()

In [12]:
# Summary statistics of the cleaned frame.
df.describe()

### Data Integrity Check
#### Since the dataset is large, let's check for any entries that are repeated or duplicated in the dataset.

In [13]:
# View only the duplicate rows.
# Duplicates were already dropped in place earlier in the notebook, so this
# selection is expected to come back empty.
is_repeat = df.duplicated()
duplicate_rows = df.loc[is_repeat]
duplicate_rows.shape

## Preprocessing

In [14]:
# Group the column names by dtype to see which columns are numeric and which are object-typed.
df.columns.to_series().groupby(df.dtypes).groups

### Expected Datatypes
{int64: ['days_left', 'price'], float64: ['duration'], object: ['airline', 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']}

In [15]:
# Column groups used by the notebook: the categorical features and the
# numerical columns (the numerical group includes the 'price' target).
Categorical = [
    'airline',
    'source_city',
    'departure_time',
    'stops',
    'arrival_time',
    'destination_city',
    'class',
]

Numerical = [
    'duration',
    'days_left',
    'price',
]

In [16]:
# Print the frequency table of every categorical column, one framed section each.
header_rule = '-'*40
section_rule = "*"*120
for column in Categorical:
    print('Unique value counts of ',column, 'Enlisted as Below Table :')
    print(header_rule)
    print(df[column].value_counts())
    print(section_rule)

#### We also need to do some feature engineering on the source and destination city columns: in some places 'GOX' is used and in others 'Goa'.


In [17]:
# Normalise the "GOX" label to "Goa" in the source_city column, then re-check the counts.
df["source_city"] = df["source_city"].replace({"GOX": "Goa"})
df['source_city'].value_counts()

In [18]:
# Use a single spelling for the city: "Bengaluru" becomes "Bangalore" in source_city.
df["source_city"] = df["source_city"].replace({"Bengaluru": "Bangalore"})
df['source_city'].value_counts()

In [19]:
# Use a single spelling for the carrier: "Air_India" becomes "Air India".
df["airline"] = df["airline"].replace({"Air_India": "Air India"})
df['airline'].value_counts()

In [20]:
# Normalise the "GOX" label to "Goa" in the destination_city column, then re-check the counts.
df["destination_city"] = df["destination_city"].replace({"GOX": "Goa"})
df['destination_city'].value_counts()

In [21]:
# Use a single spelling for the city: "Bengaluru" becomes "Bangalore" in destination_city.
df["destination_city"] = df["destination_city"].replace({"Bengaluru": "Bangalore"})
df['destination_city'].value_counts()

In [22]:
# Discard the rows whose 'class' field holds one of the "OperatedBy: ..." strings.
operator_labels = ['OperatedBy: Vistara', 'OperatedBy: Air India Express']
has_operator_label = df['class'].isin(operator_labels)
df = df[~has_operator_label]

In [23]:
# Remove the unwanted 'flight' column in place; it is not among the model inputs used later.
df.drop(['flight'], axis=1, inplace=True)

In [24]:
# Cast the days-left count and the fare to plain integers.
for int_column in ('days_left', 'price'):
    df[int_column] = df[int_column].astype(int)

In [25]:
# Re-inspect the dtype grouping after the integer casts.
df.columns.to_series().groupby(df.dtypes).groups

In [26]:
# Preview the cleaned frame.
df.head()

In [27]:
# Export the cleaned DataFrame to an Excel workbook (one sheet named 'Sheet1', row index omitted).
df.to_excel('output.xlsx', sheet_name='Sheet1', index=False)


In [28]:
# Print the categorical frequency tables again, now on the cleaned labels.
thin_rule, wide_rule = '-'*40, "*"*120
for feature in Categorical:
    print('Unique value counts of ',feature, 'Enlisted as Below Table :')
    print(thin_rule)
    print(df[feature].value_counts())
    print(wide_rule)

In [29]:
# As Airline is Nominal Categorical data we will perform OneHotEncoding.
# Before encoding, every carrier outside the known set is collapsed into 'Other'.
known_airlines = ['Vistara','Air_India','Indigo','GO_FIRST','AirAsia','SpiceJet','Air India','Air India Express']

# .copy() gives an independent one-column frame, so the assignment below does
# not write into a slice of df.
airline = df[["airline"]].copy()
Current_airline_List = airline['airline']

# Series.where keeps df's own row index. The previous version rebuilt the
# column with pd.DataFrame(<python list>), which carries a fresh 0..n-1 index.
# df's index has gaps after the earlier drop_duplicates / dropna / row filters,
# so the index-aligned assignment attached labels to the wrong rows and left
# NaN wherever no index label matched.
bucketed_airline = Current_airline_List.where(Current_airline_List.isin(known_airlines), 'Other')

# Kept as a plain list for anything that still expects the original name/type.
New_airline_List = bucketed_airline.tolist()

airline['airline'] = bucketed_airline
airline['airline'].value_counts()

In [30]:
# Write the bucketed airline labels back onto df (the assignment aligns on the row index).
df['airline'] = airline['airline']

In [31]:
# Check the airline distribution after bucketing.
df['airline'].value_counts()

In [32]:
# Summary statistics of the frame after cleaning.
df.describe()

In [33]:
# Summary of the categorical columns, transposed (one row per column) and colour-shaded for readability.
df[Categorical].describe().T.style.background_gradient(cmap='summer_r')

#### Observation :
- The minimum flight ticket price is Rs. 1013.00 and the maximum flight ticket price is Rs. 189754.00.
- Mean > median for Price, Duration and departure_time, which indicates right-skewed data.
- Presence of outliers in the Duration & Price columns, as we see a huge difference between the 75th percentile and the max.
- The high standard deviation indicates a wide spread in the data.
- There are 9 airlines in the dataset, with the maximum number of flights run by Vistara Airlines.
- Long-route flights have at least one stop.

In [34]:
# Name of the regression target and the list of model input columns.
targets_col = 'price'
inputs_cols = [
    'airline',
    'source_city',
    'departure_time',
    'stops',
    'arrival_time',
    'destination_city',
    'class',
    'duration',
    'days_left',
]

In [35]:
# Same column groups as defined near the top of the notebook, restated before the outlier section.
Categorical = [
    'airline', 'source_city', 'departure_time', 'stops',
    'arrival_time', 'destination_city', 'class',
]
Numerical = ['duration', 'days_left', 'price']

## Removing outliers from duration and price

In [36]:
# Box plot over the input columns, drawn before the outlier filters below are applied.
sns.boxplot(df[inputs_cols])

In [37]:
# Remove the duration outliers: keep only rows with duration of at most 30.
within_duration_cap = df['duration'] <= 30
df = df.loc[within_duration_cap]

In [38]:
# Remove the price outliers: keep only rows with price of at most 100000.
within_price_cap = df['price'] <= 100000
df = df.loc[within_price_cap]

In [39]:
# Display the frame after both outlier filters.
df

In [40]:
# Same box plot as before the filters, for a before/after comparison.
sns.boxplot(df[inputs_cols])

In [41]:
# One box plot per numerical column, laid out on a 2x3 subplot grid.
plt.figure(figsize=(18,10),facecolor='white')

# zip against range(1, 7) caps the grid at its six available slots.
for slot, column in zip(range(1, 7), Numerical):
    plt.subplot(2,3,slot)
    sns.boxplot(df[column],color='g')
    plt.xlabel(column,fontsize=20)
plt.show()

In [42]:
# NOTE(review): this cell repeats the box-plot grid drawn in the preceding cell;
# df is not modified in between, so it renders the same figure again.
plt.figure(figsize=(18,10),facecolor='white')
plotnumber=1

for column in Numerical:
    # The 2x3 grid has six slots; columns beyond the sixth are skipped.
    if plotnumber<=6:
        ax=plt.subplot(2,3,plotnumber)
        sns.boxplot(df[column],color='g')
        plt.xlabel(column,fontsize=20)
    plotnumber+=1
plt.show()

In [43]:
# Preview the frame before the encoding steps.
df.head()

## Encoding

In [44]:
# One-hot encode the airline column.
# drop_first=True drops the first dummy column to prevent multicollinearity.
airline = pd.get_dummies(df[["airline"]], drop_first=True, dtype=int)
airline.head()

In [45]:
# source_city is nominal categorical data, so it is one-hot encoded.
# drop_first=True drops the first dummy column to prevent multicollinearity.
source_city = pd.get_dummies(df[["source_city"]], drop_first=True, dtype=int)
source_city.head()

In [46]:
# destination_city is nominal categorical data, so it is one-hot encoded.
# drop_first=True drops the first dummy column to prevent multicollinearity.
destination_city = pd.get_dummies(df[["destination_city"]], drop_first=True, dtype=int)
destination_city.head()


In [47]:
# Inspect the distinct 'stops' labels before mapping them to integers.
df['stops'].value_counts()

In [48]:
# 'stops' is an ordinal category, so each label is mapped to an integer rank
# by hand (a fixed mapping, not sklearn's LabelEncoder).
stops_mapping = {"zero": 0, "one": 1, "2 stops": 2, "two_or_more": 3}

# Restrict the replacement to the 'stops' column: a frame-wide df.replace would
# also rewrite any other column that happens to contain one of these strings.
# Labels that are not in the mapping are left unchanged, as before.
df['stops'] = df['stops'].replace(stops_mapping)
df.head()

In [49]:
import pickle

In [50]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the 'class' column, keeping the fitted encoder object.
class_encoder = LabelEncoder()
df['class'] = class_encoder.fit_transform(df['class'])

# Persist the fitted class encoder to disk.
with open("class_encoder.pkl", "wb") as encoder_file:
    pickle.dump(class_encoder, encoder_file)

In [51]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the 'departure_time' column, keeping the fitted encoder object.
departure_time_encoder = LabelEncoder()
df['departure_time'] = departure_time_encoder.fit_transform(df['departure_time'])

# Persist the fitted departure_time encoder to disk.
with open("departure_time_encoder.pkl", "wb") as encoder_file:
    pickle.dump(departure_time_encoder, encoder_file)

In [52]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the 'arrival_time' column, keeping the fitted encoder object.
arrival_time_encoder = LabelEncoder()
df['arrival_time'] = arrival_time_encoder.fit_transform(df['arrival_time'])

# Persist the fitted arrival_time encoder to disk.
with open("arrival_time_encoder.pkl", "wb") as encoder_file:
    pickle.dump(arrival_time_encoder, encoder_file)

In [53]:
# Disabled alternative, kept for reference: a single shared LabelEncoder refit
# on each column in turn. The cells above use one dedicated encoder per column
# instead, which lets each fitted encoder be pickled separately.
# # Using Label Encoder on class variable
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# df['class'] = le.fit_transform(df['class'])
# df['departure_time'] = le.fit_transform(df['departure_time'])
# df['arrival_time'] = le.fit_transform(df['arrival_time'])
# Preview the frame after label encoding.
df.head()

In [54]:
# Drop the raw nominal columns in place; their one-hot encoded versions were
# built earlier and are concatenated back in below.
df.drop(['airline','source_city','destination_city'],axis=1,inplace = True)

In [55]:
# Preview the remaining columns.
df.head()

In [56]:
# Join the remaining columns with the three one-hot blocks column-wise (axis=1); rows are matched on the index.
final_df = pd.concat([df,airline,source_city,destination_city],axis=1)
final_df.head()

In [57]:
# Dimensions of the model-ready frame.
final_df.shape

In [58]:
# Column names of the model-ready frame (original columns plus the dummy columns).
final_df.columns

In [59]:
# Separate the regression target (price) from the feature matrix.
y = final_df['price']
x = final_df.drop(columns='price')

In [60]:
# Sanity check: x and y should have the same number of rows.
print(x.shape, y.shape)

## Splitting data into train and test sets

In [61]:
from sklearn.model_selection import train_test_split

In [62]:
# 70/30 train/test split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=42
)

# Report the size of each partition.
for caption, partition in (
    ('Training feature matrix size:', x_train),
    ('Training target vector size:', y_train),
    ('Test feature matrix size:', x_test),
    ('Test target vector size:', y_test),
):
    print(caption,partition.shape)

## Applying Different Models

In [63]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import  GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import  Ridge
from sklearn.linear_model import  Lasso
from xgboost import XGBRegressor

## Linear Regression Base Model

In [64]:
# Baseline: a linear regression fitted on the training split.
# create the model
model = LinearRegression()

# train the model
model.fit(x_train, y_train)

In [65]:
# Predict prices for the held-out test split with the baseline model.
y_pred=model.predict(x_test)

In [66]:
# Display the test-set predictions (recomputes the same values that are stored in y_pred).
model.predict(x_test)

In [67]:
# Test-set error metrics and R2 for the baseline linear model.
from sklearn.metrics import r2_score

baseline_mae = mean_absolute_error(y_test,y_pred)
baseline_mse = mean_squared_error(y_test,y_pred)

print('\033[1m'+' Error :'+'\033[0m')
print('Mean absolute error :', baseline_mae)
print('Mean squared error :', baseline_mse)
# RMSE is the square root of the MSE computed above.
print('Root Mean Squared Error:', np.sqrt(baseline_mse))
print('\n')
print('\033[1m'+' R2 Score :'+'\033[0m')
print(r2_score(y_test,y_pred,multioutput='variance_weighted'))

## Pick the best model for this dataset

In [68]:
# # pick different model and test which is better 
# Linear_reg = LinearRegression(),
# Ride_reg= Ridge()
# XGBoost = XGBRegressor()
# Random_forest = RandomForestRegressor()
# Ada_boost = AdaBoostRegressor()
# Grad_boost = GradientBoostingRegressor()

In [69]:
# Pick different models and test which is better.
# Each candidate is fitted on the training split and scored on the held-out test split.

# No trailing comma here: `LinearRegression(),` binds a 1-tuple, not an estimator.
# The linear model is evaluated in its own section, so it is not part of model_li.
Linear_reg = LinearRegression()
Ride_reg= Ridge()
XGBoost = XGBRegressor()
Random_forest = RandomForestRegressor()
Ada_boost = AdaBoostRegressor()
Grad_boost = GradientBoostingRegressor()
model_li = [Ride_reg,XGBoost,Random_forest,Ada_boost,Grad_boost]
for model in model_li:
    model.fit(x_train,y_train)
    # The earlier model.score(x_train, y_train) call was removed: its result
    # was discarded, so it only cost an extra pass over the training data.
    y_pred = model.predict(x_test)
    # Compute the MSE once and reuse it for the RMSE.
    mse = mean_squared_error(y_test,y_pred)
    print('\n')
    print('\033[1m'+' Error of ', model, ':' +'\033[0m')
    print('Mean absolute error :', mean_absolute_error(y_test,y_pred))
    print('Mean squared error :', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print('\n')

    print('\033[1m'+' R2 Score for test data:'+'\033[0m')
    print(r2_score(y_test,y_pred))
    print('==============================================================================================================')

In [70]:
# our Final Model
final_model = RandomForestRegressor()
final_model.fit(x_train,y_train)
# Score the estimator that was just fitted (the one that is pickled afterwards).
# The previous version scored Random_forest, a separate instance from the
# comparison loop, so the printed R2 did not describe the saved model.
print("The R2 score of Random Forest : ",r2_score(y_test,final_model.predict(x_test)))

### Save the model

<!-- ## Standard Scaling -->

In [71]:
import pickle

# Serialize the fitted final model to disk.
with open("Flight_Fare_Prediction_Model.pkl", "wb") as model_file:
    pickle.dump(final_model, model_file)

In [72]:
# sns.boxplot(df[inputs_cols])

In [73]:
# from sklearn.preprocessing import StandardScaler
# scaler= StandardScaler()
# array_scaled = scaler.fit_transform(df[inputs_cols])

In [74]:
# sns.boxplot(df[inputs_cols])

In [75]:
# scaler = MinMaxScaler()
# scaler.fit(df[inputs_cols])
# df[inputs_cols] = scaler.transform(df[inputs_cols])

In [76]:
# sns.boxplot(df[inputs_cols])

In [77]:
# df.corr()

In [78]:
# plt.figure(figsize=(25,15))
# sns.heatmap(df.corr(), vmin=-1, vmax=1, annot=True, square=True, fmt='0.3f', 
#             annot_kws={'size':10}, cmap="gist_stern")
# plt.xticks(fontsize=12)
# plt.yticks(fontsize=12)
# plt.show()

In [79]:
# from sklearn.linear_model import LinearRegression
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.tree import DecisionTreeRegressor
# import xgboost as xgb
# from sklearn.ensemble import ExtraTreesRegressor
# from sklearn.svm import SVR
# from sklearn.ensemble import AdaBoostRegressor
# from sklearn.ensemble import  GradientBoostingRegressor
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.metrics import mean_squared_error, mean_absolute_error
# from sklearn.metrics import r2_score
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import  Ridge
# from sklearn.linear_model import  Lasso
# from xgboost import XGBRegressor

<!-- ## Spliting data into train,test data -->

In [80]:
# x = df.drop(['price'],axis=1)
# y = df['price']

In [81]:
# x.tail()

In [82]:
# y.head()

In [83]:
# x.describe()

In [84]:
# x_train, x_test, y_train, y_test =  train_test_split(x, y, train_size=0.7,random_state=42)
# print('Training feature matrix size:',x_train.shape)
# print('Training target vector size:',y_train.shape)
# print('Test feature matrix size:',x_test.shape)
# print('Test target vector size:',y_test.shape)

<!-- ## Linear Regression Base Model -->

In [85]:
# # create the model
# model = LinearRegression()

# # train the model
# model.fit(x_train, y_train)

In [86]:
# y_pred=model.predict(x_test)

In [87]:
# model.predict(x_test)

In [88]:
# print('\033[1m'+' Error :'+'\033[0m')
# print('Mean absolute error :', mean_absolute_error(y_test,y_pred))
# print('Mean squared error :', mean_squared_error(y_test,y_pred))
# print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test,y_pred)))
# print('\n')
# from sklearn.metrics import r2_score
# print('\033[1m'+' R2 Score :'+'\033[0m')
# print(r2_score(y_test,y_pred,multioutput='variance_weighted'))

<!-- ## Pick Best model for this dataset¶ -->

In [89]:
# # pick different model and test which is better 
# Linear_reg = LinearRegression(),
# Ride_reg= Ridge()
# XGBoost = XGBRegressor()
# Random_forest = RandomForestRegressor()
# Ada_boost = AdaBoostRegressor()
# Grad_boost = GradientBoostingRegressor()

In [90]:
# Linear_reg = LinearRegression(),
# Ride_reg= Ridge()
# XGBoost = XGBRegressor()
# Random_forest = RandomForestRegressor()
# Ada_boost = AdaBoostRegressor()
# Grad_boost = GradientBoostingRegressor()
# model_li = [Ride_reg,XGBoost,Random_forest,Ada_boost,Grad_boost]
# for model in model_li:
#     model.fit(x_train,y_train)
#     model.score(x_train,y_train)
#     y_pred = model.predict(x_test)
#     print('\n')                                        
#     print('\033[1m'+' Error of ', model, ':' +'\033[0m')
#     print('Mean absolute error :', mean_absolute_error(y_test,y_pred))
#     print('Mean squared error :', mean_squared_error(y_test,y_pred))
#     print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test,y_pred)))
#     print('\n')

#     print('\033[1m'+' R2 Score for test data:'+'\033[0m')
#     print(r2_score(y_test,y_pred)) 
#     prediction = model.predict([[0.777778,0.375,0.4,1.0,1.0,1.0,1.0,0.045938,0.0]])
#     print('Prediction =',prediction)
#     print('==============================================================================================================')

In [91]:
# Linear_reg = LinearRegression(),
# Ride_reg= Ridge()
# XGBoost = XGBRegressor()
# Random_forest = RandomForestRegressor()
# Ada_boost = AdaBoostRegressor()
# Grad_boost = GradientBoostingRegressor()
# model_li = [Ride_reg,XGBoost,Random_forest,Ada_boost,Grad_boost]
# for model in model_li:
#     model.fit(x_train,y_train)
#     model.score(x_train,y_train)
#     y_pred = model.predict(x_train)
#     print('\n')                                        
#     print('\033[1m'+' Error of ', model, ':' +'\033[0m')
#     print('Mean absolute error :', mean_absolute_error(y_train,y_pred))
#     print('Mean squared error :', mean_squared_error(y_train,y_pred))
#     print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train,y_pred)))
#     print('\n')

#     print('\033[1m'+' R2 Score for train data :'+'\033[0m')
#     print(r2_score(y_train,y_pred)) 
#     print('==============================================================================================================')

In [92]:
# x.head()

In [93]:
# model1 = RandomForestRegressor()
# model1.fit(x_train,y_train)
# model1.score(x_train,y_train)
# y_pred = model1.predict(x_train)
# print('\n')                                        
# print('\033[1m'+' Error of ', model1, ':' +'\033[0m')
# print('Mean absolute error :', mean_absolute_error(y_train,y_pred))
# print('Mean squared error :', mean_squared_error(y_train,y_pred))
# print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train,y_pred)))
# print('\n')

# print('\033[1m'+' R2 Score for train data :'+'\033[0m')
# print(r2_score(y_train,y_pred)) 

In [94]:
# prediction1 = model1.predict([[0.777778,0.375,0.4,1.0,1.0,1.0,1.0,0.045938,0.0]])
# prediction1

In [95]:
# prediction2 = model1.predict([[0.777778,0.375,0.2,1.0,0.8,1.0,1.0,0.051423,0.0]])
# prediction2

In [96]:
# prediction3 = model1.predict([[0.111111,0.25,1.0,0.5,0.8,0.750,1.0,0.348646,0.877551]])
# prediction3

<!-- # Regression using DL -->