In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
flight = pd.read_excel('Flight.xlsx')
flight.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662


In [3]:
flight.shape

(10683, 11)

In [4]:
flight.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [5]:
flight['Duration'].value_counts()

2h 50m     550
1h 30m     386
2h 45m     337
2h 55m     337
2h 35m     329
          ... 
31h 30m      1
30h 25m      1
42h 5m       1
4h 10m       1
47h 40m      1
Name: Duration, Length: 368, dtype: int64

In [6]:
flight.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [7]:
flight.dropna(inplace=True)
flight.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
dtype: int64

In [8]:
#feature Engineering is creating a new feature from an existing feature
flight['Journey_Day'] = pd.to_datetime(flight.Date_of_Journey , format = '%d/%m/%Y').dt.day
flight['Journey_Month'] = pd.to_datetime(flight.Date_of_Journey , format ='%d/%m/%Y').dt.month
flight.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5


In [9]:
flight['Dep_hour'] = pd.to_datetime(flight.Dep_Time ).dt.hour
flight['Dep_Minute'] = pd.to_datetime(flight.Dep_Time).dt.minute
flight.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Dep_hour,Dep_Minute
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,22,20
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5,5,50


In [10]:
flight['Arrival_hour'] = pd.to_datetime(flight.Arrival_Time ).dt.hour
flight['Arrival_Minute'] = pd.to_datetime(flight.Arrival_Time).dt.minute
flight.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Dep_hour,Dep_Minute,Arrival_hour,Arrival_Minute
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,22,20,1,10
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5,5,50,13,15


In [11]:
flight.drop(columns=["Dep_Time","Arrival_Time","Date_of_Journey"],inplace=True)

In [12]:
flight.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Dep_hour,Dep_Minute,Arrival_hour,Arrival_Minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,24,3,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,1,5,5,50,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2 stops,No info,13882,9,6,9,25,4,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1 stop,No info,6218,12,5,18,5,23,30
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1 stop,No info,13302,1,3,16,50,21,35


In [13]:
duration = list(flight.Duration)

for i in range(len(duration)):
    if len(duration[i].split())!= 2:
        if 'h' in duration[i]:
            duration[i] = duration[i] + '0m'
        else:
            duration[i] ='0h' + duration[i]
duration
duration_hour=[]
duration_minute=[]

for i in range(len(duration)):
    duration_hour.append(int(duration[i].split(sep='h')[0] ))
    duration_minute.append(int(duration[i].split('m')[0][-1]))

In [14]:
flight["Duration_Hour"] = duration_hour
flight["Duration_Minute"] = duration_minute

In [15]:
flight.drop(columns="Duration",inplace=True)

In [16]:
flight.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Dep_hour,Dep_Minute,Arrival_hour,Arrival_Minute,Duration_Hour,Duration_Minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,non-stop,No info,3897,24,3,22,20,1,10,2,0
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2 stops,No info,7662,1,5,5,50,13,15,7,5
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2 stops,No info,13882,9,6,9,25,4,25,19,0
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1 stop,No info,6218,12,5,18,5,23,30,5,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,1 stop,No info,13302,1,3,16,50,21,35,4,5


In [17]:
flight['Airline'].value_counts()

Jet Airways                          3849
IndiGo                               2053
Air India                            1751
Multiple carriers                    1196
SpiceJet                              818
Vistara                               479
Air Asia                              319
GoAir                                 194
Multiple carriers Premium economy      13
Jet Airways Business                    6
Vistara Premium economy                 3
Trujet                                  1
Name: Airline, dtype: int64

In [18]:
Airline.head()

NameError: name 'Airline' is not defined

In [None]:
Airline = flight['Airline']
Airline = pd.get_dummies(Airline , drop_first=True)
Airline.head ()

In [None]:
flight['Source'].value_counts()

In [None]:
sns.catplot(y='Price' , x='Source' ,data = flight.sort_values('Price',ascending = False) ,kind='boxen' ,height = 6,aspect =3)

In [None]:
Source = flight['Source']
Source = pd.get_dummies(Source , drop_first=True)
Source.head ()

In [None]:
Destination = flight['Destination']
Destination = pd.get_dummies(Destination , drop_first=True)
Destination.head()

In [None]:
flight.drop(columns=['Route','Additional_Info'],axis=1,inplace=True)

In [None]:
flight.head()

In [None]:
flight['Total_Stops'].value_counts()

In [None]:
flight.replace({'non-stop' : 0, '1 stop':1,'2 stops':2,'3 stops':3,'4 stops':4 },inplace =True)

In [None]:
flight.head()

In [None]:
flight = pd.concat([flight,Airline,Source,Destination],axis =1)

In [None]:
flight.head()

In [None]:
flight.drop(columns =['Airline','Source','Destination'],inplace = True)

In [None]:
flight.head()

In [None]:
flight.shape

# TEST DATA

In [None]:
flight_test = pd.read_excel('Flight.xlsx')
flight_test.head()

In [None]:
'''print("_" *50)
print("Shape")
print(flight_test.shape)

print("_" *50)
print("Info")
print(flight_test.info())



print("_" *50)
print("Check null values")
print(flight_test.isnull().sum())

print("_" *50)
print("Drop null values")
print(flight_test.dropna(inplace=True))

print("_" *50)
print("Check null values")
print(flight_test.isnull().sum())'''

'''print("_" *50)
print("Fetch Departure hours and minutes from Dep_Time")
flight_test['Dep_hour'] = pd.to_datetime(flight_test.Dep_Time) .dt.hour
flight_test['Dep_minute'] = pd.to_datetime(flight_test.Dep_Time) .dt.minute
print(flight_test.head())'''

'''print("_" *50)
print("Fetch Journey Day and Month from Date_of_Journey")
flight_test['Journey_Day'] = pd.to_datetime(flight_test.Date_of_Journey , format = '%d/%m/%Y').dt.day
flight_test['Journey_Month'] = pd.to_datetime(flight_test.Date_of_Journey , format ='%d/%m/%Y').dt.month
print(flight_test.head())'''


'''print("_" *50)
print("Fetch Journey hours and minutes from Arrival_Time")
flight_test['Arrival_hour'] = pd.to_datetime(flight_test.Arrival_Time ).dt.hour
flight_test['Arrival_Minute'] = pd.to_datetime(flight_test.Arrival_Time).dt.minute
print(flight_test.head())'''

'''print("_" *50)
print("Drop Dep_Time and Date_of_journey columns")
flight_test.drop(columns=['Dep_Time','Date_of_Journey'],inplace=True)'''

'''print("_" *50)
print("Duration count values")
print(flight_test['Duration'].value_counts())'''

'''print("_" *50)
print("Fetch hours and minutes from Durations")
duration=list(flight_test.Duration)

''''''for i in range(len(duration)):
    if len(duration[i].split())!=2:
        if 'h' in duration[i]:
            duration[i] = duration[i] + '0m'
        else:
            duration[i] = '0h' + duration[i] 
duration
duration_hour=[]
duration_minute=[]'''

'''for i in range(len(duration)):
    duration_hour.append(int(duration[i].split(sep='h')[0] ))
    duration_minute.append(int(duration[i].split('m')[0][-1]))
    
flight_test["Duration_Hour"] = duration_hour
flight_test["Duration_Minute"] = duration_minute
flight_test.drop(columns='Duration',inplace=True)'''

'''print("_" *50) 
print("Count Airline Values")
print(flight_test['Airline'].value_counts())

print("_" *50) 
print("OneHotEncoder Airline Column")
Airline=flight_test['Airline']
Airline=pd.get_dummies(Airline,drop_first=True)
print(Airline.head())'''

'''print("_" *50) 
print("Count Source values")
print(flight_test['Source'].value_counts())

print("_" *50) 
print("Count Source values")
print(flight_test['Source'].value_counts())

print("_" *50) 
print("OneHotEncoder Airline Column")
Source = flight_test[['Source']]
Source = pd.get_dummies(Source,drop_first=True)
print(Source.head())

print("_" *50) 
print("Count Destination values")
print(flight_test['Destination'].value_counts())

print("_" *50) 
print("OneHotEncoder Destination Column")
Destination=flight_test['Destination']
Destination=pd.get_dummies(Destination,drop_first=True)
print(Destination.head())

print("_" *50)
print("Show route")
print(flight_test['Route'])'''



'''print("_" *50) 
print("Drop Route and Additional Info")
flight_test.drop(columns=['Route','Additional Info'],axis=1,inplace=True)

print("_" *50) 
print("Normalize total stops")
flight_test.replace({'non-stop' : 0, '1 stop':1,'2 stops':2,'3 stops':3,'4 stops':4 },inplace =True)

print("_" *50) 
print("Comcat flight,airline,source and Destination")
flight_test= pd.concat([flight_test,Airline,Source,Destination],axis =1)

print("_" *50)
print("Show all rows and columns")
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

print("_" *50)
print("Drop Airline,Sourtce and Destination")
flight_test.drop(columns =['Airline','Source','Destination'],inplace = True)

print("_" *50)
print("Show head")
print(flight_test.head())

print("_" *50)
print("Shape")
print(flight_test.shape)'''



In [19]:
flight_test.drop(columns=['Price'],axis=1,inplace=True)
flight_test.head()

NameError: name 'flight_test' is not defined

In [None]:
flight.head()

In [None]:
flight.shape, flight_test.shape

In [None]:
X=flight.drop(columns='Price',axis=1)
y= flight['Price']
X.head()

In [None]:
y.head()

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(flight.corr() ,annot=True, cmap ='PuBuGn')
plt.show()

# Fit model using Random Forest

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test =train_test_split(X,y,test_size=0.2 ,random_state=51)

In [None]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(X_train ,y_train)

In [None]:
pred = rfr.predict(X_test)

In [None]:
rfr.score(X_train,y_train)

In [None]:
rfr.score(X_test,y_test)

In [None]:
sns.distplot(y_test - pred)

In [None]:
plt.scatter(y_test , pred ,alpha=0.8)
plt.xlabel('y_test')
plt.xlabel('pred')

In [None]:
from sklearn import metrics
print('MAE' , metrics.mean_absolute_error(y_test,pred))
print('MSE' , metrics.mean_squared_error(y_test,pred))
print('RMSE' , np.sqrt(metrics.mean_absolute_error(y_test,pred)))

In [None]:
metrics.r2_score(y_test,pred)

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(100,2000,10)]
max_depth = [int(x) for x in np.linspace(100,200,10)]
max_samples_split=[2,4,6,8,10,12,14]
min_samples_leaf=[1,3,5,7,8,10]
max_features=['sqrt' , 'log2' ,'auto' ,'None']

random_search = {
    'n_estimators ':n_estimators,
    'max_depth' :max_depth,
    'max_samples_split' :max_samples_split,
    'min_samples_leaf' :min_samples_leaf,
    'max_features':max_features,  
}
random_search

In [None]:
rfr_random=RandomizedSearchCV(estimator=rfr, param_distributions=random_search, n_iter =10,cv=5,verbose=2,random_state=51,n_jobs=1)

In [None]:
rfr_random.fit(X_train,y_train)

In [None]:
rfr_random.best_params

In [None]:
prediction = rfr_random.predict(X_test)

In [None]:
plt.figure(figsize =(8,8))
sns.distplot(y_test-prediction)
plt.show()

In [20]:
print('MAE' , metrics.mean_absolute_error(y_test,pred))
print('MSE' , metrics.mean_squared_error(y_test,pred))
print('RMSE' , np.sqrt(metrics.mean_absolute_error(y_test,pred)))

NameError: name 'metrics' is not defined

In [None]:
metrics.r2_score(y_test,prediction)

# Save the model

In [None]:
import pickle
file =open('flight fare prediction.pkl','wb')
pickle.dump(rfr_random,file)

In [None]:
model = open('flight fare prediction.pkl','rb')
mod - pickle.load(model)

In [None]:
predicted_data =mod.predict(X_test)

In [23]:
metrics.r2_score(y_test,predicted_data)

NameError: name 'metrics' is not defined