In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# Workbook locations for the training and test sets.
# NOTE(review): these are absolute paths on one machine; point them at a
# shared data directory before running elsewhere.
TRAIN_PATH = 'C:/Users/suman/Downloads/Data_Train.xlsx'
TEST_PATH = 'C:/Users/suman/Downloads/Test_set.xlsx'

df_train = pd.read_excel(TRAIN_PATH)
df_test = pd.read_excel(TEST_PATH)

In [3]:
# Stack the two workbooks into one frame with a fresh 0..n-1 index so the
# same cleaning steps run over both.
combined_frames = [df_train, df_test]
flight_df = pd.concat(combined_frames, sort=False, ignore_index=True)

In [4]:
# Bucket the numeric fare into four ordered price bands.
# NOTE(review): the edges look like min / quartiles / max of Price — confirm.
bins = [1759, 5277, 8372, 12373, 79512]
labels = ['low', 'medium', 'high', 'very high']
# pd.cut builds right-closed intervals, so without include_lowest=True a fare
# exactly equal to the first edge (1759) falls outside every bin, becomes NaN
# and is silently removed by the later dropna().
flight_df['Price'] = pd.cut(
    flight_df['Price'], bins=bins, labels=labels, include_lowest=True
)

In [5]:
# Keep only the rows that have a value in every column.
# NOTE(review): rows with no Price (presumably everything that came from
# df_test) are discarded here as well — confirm that is intended.
flight_df = flight_df.dropna()

In [6]:
# Date_of_Journey is a "day/month/year" string; split it once and store each
# component as an integer column.
journey_parts = flight_df["Date_of_Journey"].str.split("/", expand=True)
flight_df["Journey_date"] = journey_parts[0].astype(int)
flight_df["Journey_month"] = journey_parts[1].astype(int)
flight_df["Journey_year"] = journey_parts[2].astype(int)

In [7]:
# The raw date string is redundant once its day/month/year parts exist.
flight_df = flight_df.drop(columns='Date_of_Journey')

In [8]:
# Total_Stops: keep the leading token of values such as "2 stops" and map the
# special value "non-stop" to "0" (the column stays a string here).
stops_token = flight_df["Total_Stops"].str.split(" ").str[0]
flight_df["Total_Stops"] = stops_token.replace("non-stop", "0")
flight_df.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_date,Journey_month,Journey_year
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,0,No info,low,24,3,2019
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,medium,1,5,2019
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2,No info,very high,9,6,2019
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,medium,12,5,2019
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,very high,1,3,2019


In [9]:
# Turn the arrival and departure clock strings into integer hour/minute columns.

# Arrival_Time can carry a trailing date ("01:10 22 Mar"); only the leading
# HH:MM token is used.
arrival_clock = (
    flight_df["Arrival_Time"].str.split(" ").str[0].str.split(":", expand=True)
)
flight_df["Arrival_hour"] = arrival_clock[0].astype(int)
flight_df["Arrival_min"] = arrival_clock[1].astype(int)
flight_df = flight_df.drop(columns=["Arrival_Time"])

# Dep_Time is a plain HH:MM string.
departure_clock = flight_df["Dep_Time"].str.split(":", expand=True)
flight_df["Dep_hour"] = departure_clock[0].astype(int)
flight_df["Dep_min"] = departure_clock[1].astype(int)
flight_df = flight_df.drop(columns=["Dep_Time"])

flight_df.head()


Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Journey_date,Journey_month,Journey_year,Arrival_hour,Arrival_min,Dep_hour,Dep_min
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,0,No info,low,24,3,2019,1,10,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2,No info,medium,1,5,2019,13,15,5,50
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2,No info,very high,9,6,2019,4,25,9,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1,No info,medium,12,5,2019,23,30,18,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1,No info,very high,1,3,2019,21,35,16,50


In [10]:
# Route is not used as a feature; remove the column.
flight_df = flight_df.drop(columns=["Route"])

In [11]:
# Everything before the first "h" in Duration is taken as the hour count.
hours_text = flight_df["Duration"].str.split('h').str[0]
flight_df["Duration_hr"] = hours_text
# Count values that still contain "m": those durations had no hour part.
hours_text.str.contains('m').sum()

1

In [12]:
# Show the rows whose hour token still contains "m" (no hour part was found).
has_minutes_marker = flight_df['Duration_hr'].str.contains('m')
flight_df[has_minutes_marker]

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Journey_date,Journey_month,Journey_year,Arrival_hour,Arrival_min,Dep_hour,Dep_min,Duration_hr
6474,Air India,Mumbai,Hyderabad,5m,2,No info,very high,6,3,2019,16,55,16,50,5m


In [13]:
# A Duration with no "h" part (e.g. "5m") leaves its minutes text in
# Duration_hr. Such a duration has zero hours, so set every hour token that
# still contains "m" to "0" rather than patching only the literal "5m".
minutes_only = flight_df["Duration_hr"].str.contains("m", na=False)
flight_df["Duration_hr"] = flight_df["Duration_hr"].mask(minutes_only, "0")

In [14]:
# Convert Duration to total minutes: hours * 60 + minutes.
#
# Minutes are the digits directly before an "m" anywhere in the string, so
# "2h 50m" -> 50, "19h" -> missing -> 0, and a minutes-only value such as
# "5m" -> 5 (splitting on the space lost those minutes).
# The result is assigned back rather than filled with inplace=True on a column
# selection, which does not update the frame under pandas copy-on-write.
duration_minutes = (
    flight_df["Duration"].str.extract(r"(\d+)m", expand=False).fillna("0")
)
flight_df["Duration"] = (
    flight_df["Duration_hr"].astype(int) * 60 + duration_minutes.astype(int)
)
# The helper hour column is no longer needed once Duration is numeric.
flight_df = flight_df.drop(['Duration_hr'], axis=1)
flight_df.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Journey_date,Journey_month,Journey_year,Arrival_hour,Arrival_min,Dep_hour,Dep_min
0,IndiGo,Banglore,New Delhi,170,0,No info,low,24,3,2019,1,10,22,20
1,Air India,Kolkata,Banglore,445,2,No info,medium,1,5,2019,13,15,5,50
2,Jet Airways,Delhi,Cochin,1140,2,No info,very high,9,6,2019,4,25,9,25
3,IndiGo,Kolkata,Banglore,325,1,No info,medium,12,5,2019,23,30,18,5
4,IndiGo,Banglore,New Delhi,285,1,No info,very high,1,3,2019,21,35,16,50


In [15]:
# Per the original note, '1' is the most frequently occurring value, so any
# missing Total_Stops entry is filled with '1' before the column is cast from
# string to integer.
flight_df["Total_Stops"] = flight_df["Total_Stops"].fillna('1').astype(int)

In [16]:
# Display the cleaned frame (pandas truncates the middle rows in the output).
flight_df

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Journey_date,Journey_month,Journey_year,Arrival_hour,Arrival_min,Dep_hour,Dep_min
0,IndiGo,Banglore,New Delhi,170,0,No info,low,24,3,2019,1,10,22,20
1,Air India,Kolkata,Banglore,445,2,No info,medium,1,5,2019,13,15,5,50
2,Jet Airways,Delhi,Cochin,1140,2,No info,very high,9,6,2019,4,25,9,25
3,IndiGo,Kolkata,Banglore,325,1,No info,medium,12,5,2019,23,30,18,5
4,IndiGo,Banglore,New Delhi,285,1,No info,very high,1,3,2019,21,35,16,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,Kolkata,Banglore,150,0,No info,low,9,4,2019,22,25,19,55
10679,Air India,Kolkata,Banglore,155,0,No info,low,27,4,2019,23,20,20,45
10680,Jet Airways,Banglore,Delhi,180,0,No info,medium,27,4,2019,11,20,8,20
10681,Vistara,Banglore,New Delhi,160,0,No info,very high,1,3,2019,14,10,11,30


In [17]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical text column with integer codes.
# The one encoder is refit for every column, so after the loop it only holds
# the classes of the last column encoded.
la = LabelEncoder()
categorical_columns = ["Airline", "Source", "Destination", "Additional_Info"]
for column in categorical_columns:
    flight_df[column] = la.fit_transform(flight_df[column])
flight_df.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Journey_date,Journey_month,Journey_year,Arrival_hour,Arrival_min,Dep_hour,Dep_min
0,3,0,5,170,0,8,low,24,3,2019,1,10,22,20
1,1,3,0,445,2,8,medium,1,5,2019,13,15,5,50
2,4,2,1,1140,2,8,very high,9,6,2019,4,25,9,25
3,3,3,0,325,1,8,medium,12,5,2019,23,30,18,5
4,3,0,5,285,1,8,very high,1,3,2019,21,35,16,50


In [18]:
# Pairwise Pearson correlation of the numeric features.
# Price is a string-labelled categorical at this point. Older pandas dropped
# such columns silently, but pandas >= 2.0 raises on them, so the numeric
# columns are selected explicitly. A constant column yields NaN.
flight_df.select_dtypes(include="number").corr()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Journey_date,Journey_month,Journey_year,Arrival_hour,Arrival_min,Dep_hour,Dep_min
Airline,1.0,-0.014531,0.017778,-0.159068,-0.198777,-0.061105,0.025706,0.025501,,-0.006488,-0.070571,-0.035893,-0.060674
Source,-0.014531,1.0,-0.593728,0.162729,0.193799,-0.022448,0.004445,0.184247,,0.026767,0.021636,0.058511,-0.057759
Destination,0.017778,-0.593728,1.0,-0.257028,-0.295135,0.026629,-0.041328,-0.364377,,-0.039088,0.017556,-0.07341,0.12755
Duration,-0.159068,0.162729,-0.257028,1.0,0.737939,-0.16771,-0.022157,0.014329,,0.050876,-0.070059,0.002463,-0.018657
Total_Stops,-0.198777,0.193799,-0.295135,0.737939,1.0,-0.082238,-0.009133,0.053843,,0.037398,-0.107393,-0.061103,-0.002117
Additional_Info,-0.061105,-0.022448,0.026629,-0.16771,-0.082238,1.0,-0.016435,-0.051275,,0.026537,0.041485,-0.051181,-0.019603
Journey_date,0.025706,0.004445,-0.041328,-0.022157,-0.009133,-0.016435,1.0,-0.037896,,-0.002884,-0.017203,0.002175,-0.008466
Journey_month,0.025501,0.184247,-0.364377,0.014329,0.053843,-0.051275,-0.037896,1.0,,-0.004704,-0.101116,0.039528,-0.058784
Journey_year,,,,,,,,,,,,,
Arrival_hour,-0.006488,0.026767,-0.039088,0.050876,0.037398,0.026537,-0.002884,-0.004704,,1.0,-0.155011,0.005977,0.043852


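We can remove Journey_year from the final dataset: its correlation with every other feature is NaN (undefined), which happens when a column is constant (every row shown is 2019), so it carries no information for the models.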

In [19]:
# Journey_year is removed from the feature set (its correlations are all NaN).
flight_df = flight_df.drop(columns=["Journey_year"])
flight_df.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Additional_Info,Price,Journey_date,Journey_month,Arrival_hour,Arrival_min,Dep_hour,Dep_min
0,3,0,5,170,0,8,low,24,3,1,10,22,20
1,1,3,0,445,2,8,medium,1,5,13,15,5,50
2,4,2,1,1140,2,8,very high,9,6,4,25,9,25
3,3,3,0,325,1,8,medium,12,5,23,30,18,5
4,3,0,5,285,1,8,very high,1,3,21,35,16,50


In [20]:
# Features are every column except the target.
X = flight_df.drop(['Price'], axis=1)

# Price has been binned into the ordered labels low < medium < high < very high.
# The regressors and the r2 / RMSE metrics below need a numeric target and fail
# on string labels, so the ordered category codes (0..3) are used instead.
# A Price column that is already numeric is passed through unchanged.
# NOTE(review): with a four-class target a classifier may be the better model
# family — confirm the intended task.
price = flight_df['Price']
if isinstance(price.dtype, pd.CategoricalDtype):
    y = price.cat.codes.astype(int)
else:
    y = price.copy()

# Fixed seed so the same rows land in the train and test partitions each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30)

In [21]:
# Empty results table; each model cell appends one row of test-set metrics.
performance = pd.DataFrame(
    {column: [] for column in ("model", "R2_score_test", "RMSE")}
)

In [22]:
# Sanity check: number of missing values remaining in each column.
flight_df.isna().sum()

Airline            0
Source             0
Destination        0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
Journey_date       0
Journey_month      0
Arrival_hour       0
Arrival_min        0
Dep_hour           0
Dep_min            0
dtype: int64

# Linear Regression

In [None]:
# Fit a linear regression and score it on the held-out split.
modelmlg = LinearRegression().fit(X_train, y_train)
y_pred = modelmlg.predict(X_test)

# Round the metrics and append them as one row of the results table.
r2_test = round(metrics.r2_score(y_test, y_pred), 6)
rmse_test = round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 3)
result_row = pd.DataFrame(
    {'model': "LinearRegression", 'R2_score_test': r2_test, 'RMSE': rmse_test},
    index=[0],
)
performance = pd.concat([performance, result_row])

# DecisionTreeRegressor

In [None]:
# Fit a decision-tree regressor and score it on the held-out split.
# The estimator is seeded (same seed as the train/test split) so the reported
# metrics are reproducible across Restart & Run All; unseeded, ties between
# equally good splits are broken randomly.
modeldcr = DecisionTreeRegressor(random_state=30)
modeldcr.fit(X_train, y_train)
y_pred = modeldcr.predict(X_test)
performance = pd.concat([performance, pd.DataFrame({'model':"DecisionTreeRegressor", 
                                                    'R2_score_test': round(metrics.r2_score(y_test, y_pred),6), 
                                                    'RMSE': round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3)
                                                     }, index=[0])])

# RandomForestRegressor

In [None]:
# Fit a random-forest regressor and score it on the held-out split.
# Bootstrapping and feature sampling are random, so the estimator is seeded
# (same seed as the train/test split) to make the reported metrics reproducible.
modelrfr = RandomForestRegressor(random_state=30)
modelrfr.fit(X_train, y_train)
y_pred = modelrfr.predict(X_test)
performance = pd.concat([performance, pd.DataFrame({'model':"RandomForestRegressor", 
                                                    'R2_score_test': round(metrics.r2_score(y_test, y_pred),6), 
                                                    'RMSE': round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3)
                                                     }, index=[0])])

# RandomForestRegressor- Randomsearch CV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyper-parameter values for the random-forest search.

# Number of trees: 100, 200, ..., 1200.
n_estimators = np.linspace(start=100, stop=1200, num=12).astype(int).tolist()
# Features considered per split. 'auto' (all features, for regressors) was
# removed in scikit-learn 1.3 and now raises; 1.0 is the equivalent setting
# and is accepted by older releases too.
max_features = [1.0, 'sqrt']
# Maximum tree depth: 5, 10, ..., 30.
max_depth = np.linspace(5, 30, num=6).astype(int).tolist()
# Minimum samples required to split an internal node.
min_samples_split = [2, 5, 10, 15, 100]
# Minimum samples required at a leaf.
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Search space handed to RandomizedSearchCV: estimator parameter name mapped
# to its list of candidate values.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
# Randomised hyper-parameter search over the random forest: 10 sampled
# configurations, 5-fold cross-validation, ranked by negative MSE.
rf_random = RandomizedSearchCV(estimator = modelrfr, param_distributions = random_grid,scoring='neg_mean_squared_error',
                               n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)
rf_random.fit(X_train, y_train)
rf_prediction = rf_random.predict(X_test)
# Score the tuned model's own predictions. Using y_pred here would re-report
# the metrics of whichever model cell ran last instead of the search result.
performance = pd.concat([performance, pd.DataFrame({'model':"RandomForestRegressor- Randomsearch CV", 
                                                    'R2_score_test': round(metrics.r2_score(y_test, rf_prediction),6), 
                                                    'RMSE': round(np.sqrt(metrics.mean_squared_error(y_test, rf_prediction)),3)
                                                     }, index=[0])])

# SVR

In [None]:
# Fit a support-vector regressor with default settings and score it on the
# held-out split.
modelSVR = SVR().fit(X_train, y_train)
y_pred = modelSVR.predict(X_test)

# Round the metrics and append them as one row of the results table.
svr_r2 = round(metrics.r2_score(y_test, y_pred), 6)
svr_rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 3)
svr_row = pd.DataFrame(
    {'model': "SVR", 'R2_score_test': svr_r2, 'RMSE': svr_rmse},
    index=[0],
)
performance = pd.concat([performance, svr_row])

# KNeighborsRegressor

In [None]:
# Fit a 5-nearest-neighbours regressor and score it on the held-out split.
modelKNN = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred = modelKNN.predict(X_test)

# Round the metrics and append them as one row of the results table.
knn_r2 = round(metrics.r2_score(y_test, y_pred), 6)
knn_rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 3)
knn_row = pd.DataFrame(
    {'model': "KNeighborsRegressor", 'R2_score_test': knn_r2, 'RMSE': knn_rmse},
    index=[0],
)
performance = pd.concat([performance, knn_row])

# Ridge

In [None]:
# Fit a ridge regression with default regularisation and score it on the
# held-out split.
modelRE = Ridge().fit(X_train, y_train)
y_pred = modelRE.predict(X_test)

# Round the metrics and append them as one row of the results table.
ridge_r2 = round(metrics.r2_score(y_test, y_pred), 6)
ridge_rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 3)
ridge_row = pd.DataFrame(
    {'model': "Ridge", 'R2_score_test': ridge_r2, 'RMSE': ridge_rmse},
    index=[0],
)
performance = pd.concat([performance, ridge_row])

# Lasso

In [None]:
# Fit a lasso regression (alpha=0.1) and score it on the held-out split.
modelLO = linear_model.Lasso(alpha=0.1).fit(X_train, y_train)
y_pred = modelLO.predict(X_test)

# Round the metrics and append them as one row of the results table.
lasso_r2 = round(metrics.r2_score(y_test, y_pred), 6)
lasso_rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 3)
lasso_row = pd.DataFrame(
    {'model': "Lasso", 'R2_score_test': lasso_r2, 'RMSE': lasso_rmse},
    index=[0],
)
performance = pd.concat([performance, lasso_row])

In [None]:
# Rank the models by test-set R2, lowest first (best model in the last row).
performance.sort_values('R2_score_test', ascending=True)