In [1]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import pandas as pd
from sklearn.metrics import r2_score,root_mean_squared_error

In [2]:
from sklearn.model_selection import cross_val_score

In [12]:
df = pd.read_excel(r'data\flight-price.xlsx')
df.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662


In [13]:
df = df.dropna(axis=0)
df = df.drop_duplicates()
df = df.reset_index().drop(labels=['index'],axis=1)

In [14]:
df['Day_of_journey']  = df['Date_of_Journey'].str.split('/').str[0]
df['Month_of_journey']  = df['Date_of_Journey'].str.split('/').str[1]
df['Year_of_journey']  = df['Date_of_Journey'].str.split('/').str[2]

df['Day_of_journey'] = df['Day_of_journey'].astype(int)
df['Month_of_journey'] = df['Month_of_journey'].astype(int)
df['Year_of_journey'] = df['Year_of_journey'].astype(int)

In [15]:
df['Dep_hour'] = df['Dep_Time'].str.split(':').str[0]
df['Dep_minute'] = df['Dep_Time'].str.split(':').str[1]
df['Dep_hour'] = df['Dep_hour'].astype(int)
df['Dep_minute'] = df['Dep_minute'].astype(int)

In [16]:
df['Arrival_Time'] = df['Arrival_Time'].str.split(" ").str[0]
df['Arrival_hour'] = df['Arrival_Time'].str.split(":").str[0]
df['Arrival_minute'] = df['Arrival_Time'].str.split(":").str[1]
df['Arrival_hour'] = df['Arrival_hour'].astype(int)
df['Arrival_minute'] = df['Arrival_minute'].astype(int)

In [17]:
duration = pd.to_timedelta(df['Duration'])
for i  in range(0,len(duration)):
    df.loc[i,'Duration_mins'] = duration[i].seconds //60

In [18]:
df['Total_Stops'] = df['Total_Stops'].map({'non-stop':0,'2 stops':2,'1 stop':1,'3 stops':3,'4 stops':4})
df['Total_Stops'] = df['Total_Stops'].astype(int)

In [19]:
df = df.drop(labels=['Route','Dep_Time','Arrival_Time','Duration'],axis=1)

In [20]:
df['Arrival_time_sinceMidnight'] = df['Arrival_hour']*60 + df['Arrival_minute']
df['Dep_time_sinceMidnight'] = df['Dep_hour']*60 + df['Dep_minute']

In [21]:
from geopy.geocoders import Nominatim
from geopy.distance import geodesic

In [23]:
locatitons = set()
locatitons.update(df['Source'].unique())
locatitons.update(df['Destination'].unique())
geolocator = Nominatim(user_agent='Self_EDA_Project')
coordinates = {}
for city in locatitons:
    coordinates[city] = geolocator.geocode(city)

coordinates['Banglore'] = geolocator.geocode('Bangalore, India')

for i in range(len(df)):
    source = coordinates[df.loc[i,'Source']]
    destination = coordinates[df.loc[i,'Destination']]
    df.loc[i,'Distance (in Km)'] = geodesic((source.latitude,source.longitude),(destination.latitude,destination.longitude)).km

In [24]:
df['Premium'] = df['Airline'].apply(lambda x: 1 if 'Premium' in x or 'Business' in x else 0)

def airline(x):
    if 'Trujet' in x or 'GoAir' in x or 'Multiple carriers' in x:
        return 'Multiple carriers'
    elif 'Jet Airways' in x:
        return 'Jet Airways'
    elif 'Vistara' in x:
        return 'Vistara'
    else: 
        return x
    
df['Airline'] = df['Airline'].apply(airline)

In [25]:
df = df.drop(['Date_of_Journey','Source','Destination'],axis=1)

In [26]:
df['Additional_Info'] = df['Additional_Info'].apply(lambda x: 'No Info' if 'No info' in x else x)

In [27]:
df.head(2)

Unnamed: 0,Airline,Total_Stops,Additional_Info,Price,Day_of_journey,Month_of_journey,Year_of_journey,Dep_hour,Dep_minute,Arrival_hour,Arrival_minute,Duration_mins,Arrival_time_sinceMidnight,Dep_time_sinceMidnight,Distance (in Km),Premium
0,IndiGo,0,No Info,3897,24,3,2019,22,20,1,10,170.0,70,1340,1733.814838,0
1,Air India,2,No Info,7662,1,5,2019,5,50,13,15,445.0,795,350,1555.067481,0


In [28]:
X = df.drop(labels=['Year_of_journey','Price'],axis=1)
y = df['Price']

In [29]:
categorical = [col for col in X if X[col].dtype=='O']
numerical = [col for col in X if X[col].dtype!='O']

In [30]:
numerical.remove('Premium')

In [31]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=21)

In [32]:
from sklearn.preprocessing import OneHotEncoder,RobustScaler
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(
    [('OHE',OneHotEncoder(handle_unknown='ignore',sparse_output=False,drop='first'),categorical),('Scalling',RobustScaler(),numerical)],
    remainder='passthrough',
    n_jobs=-1,
    verbose=2
)
transformer.set_output(transform='pandas')
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

In [33]:
X_train.head()

Unnamed: 0,OHE__Airline_Air India,OHE__Airline_IndiGo,OHE__Airline_Jet Airways,OHE__Airline_Multiple carriers,OHE__Airline_SpiceJet,OHE__Airline_Vistara,OHE__Additional_Info_1 Short layover,OHE__Additional_Info_2 Long layover,OHE__Additional_Info_Business class,OHE__Additional_Info_Change airports,...,Scalling__Month_of_journey,Scalling__Dep_hour,Scalling__Dep_minute,Scalling__Arrival_hour,Scalling__Arrival_minute,Scalling__Duration_mins,Scalling__Arrival_time_sinceMidnight,Scalling__Dep_time_sinceMidnight,Scalling__Distance (in Km),remainder__Premium
9001,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,-0.5,0.857143,-0.454545,0.2,-0.37931,-0.436508,-0.487603,-0.348524,0
313,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.666667,-0.2,-0.428571,0.636364,-1.0,0.577586,0.65873,-0.264463,0.651476,0
3117,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.4,-0.285714,-0.181818,0.2,-0.103448,-0.150794,-0.454545,0.651476,0
695,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.333333,0.6,-0.714286,-1.181818,0.2,0.232759,-1.198413,0.512397,0.651476,0
2979,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.333333,0.6,-0.142857,-0.454545,0.0,1.017241,-0.444444,0.545455,0.651476,0


In [34]:
len(y_train)

8892

# Model Selection

In [35]:
models = {
    'Linear Regression':LinearRegression(),
    'Ridge Regression':Ridge(),
    'Lasso Regression':Lasso(),
    'Decision Tree':DecisionTreeRegressor(),
    'Extra Trees':ExtraTreeRegressor(),
    'Support Vector Machine':SVR(),
    'Random Forest':RandomForestRegressor(),
    'AdaBoost':AdaBoostRegressor(),
    'Gradient Boost':GradientBoostingRegressor(),
    'XG Boost':XGBRegressor()
}
for model_name,model in models.items():
        cross = cross_val_score(model,X_train,y_train,scoring='r2',cv=5).mean()


        model.fit(X_train,y_train)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        r2_train = r2_score(y_train,y_pred_train)
        r2_test = r2_score(y_test,y_pred_test)
        RMSE_train = root_mean_squared_error(y_train,y_pred_train)
        RMSE_test = root_mean_squared_error(y_test,y_pred_test)

        print('|',model_name,'|',end='')
        print(' {:.3f} |'.format(cross),end='')
        print(' {:.3f} |'.format(r2_train),end='')
        print(' {:.3f} |'.format(RMSE_train),end='')
        print(' {:.3f} |'.format(r2_test),end='')
        print(' {:.3f} |'.format(RMSE_test))

| Linear Regression | 0.637 | 0.647 | 2723.765 | 0.684 | 2713.048 |
| Ridge Regression | 0.638 | 0.646 | 2727.619 | 0.683 | 2719.006 |
| Lasso Regression | 0.636 | 0.647 | 2726.939 | 0.683 | 2716.536 |
| Decision Tree | 0.807 | 0.996 | 301.245 | 0.819 | 2055.784 |
| Extra Trees | 0.794 | 0.996 | 301.245 | 0.802 | 2149.396 |
| Support Vector Machine | 0.041 | 0.057 | 4455.752 | 0.043 | 4723.392 |
| Random Forest | 0.880 | 0.981 | 625.224 | 0.924 | 1329.784 |
| AdaBoost | 0.391 | 0.356 | 3682.395 | 0.389 | 3772.550 |
| Gradient Boost | 0.808 | 0.831 | 1886.371 | 0.858 | 1820.988 |
| XG Boost | 0.876 | 0.971 | 778.397 | 0.931 | 1264.201 |


## Model Performance

| Model Name | Cross val score | R2 Train | RMSE Train | R2 Test | RMSE Test |
|------------|-----------------|----------|------------|---------|-----------|
| Linear Regression | 0.637 | 0.647 | 2723.765 | 0.684 | 2713.048 |
| Ridge Regression | 0.638 | 0.646 | 2727.620 | 0.683 | 2719.027 |
| Lasso Regression | 0.636 | 0.647 | 2726.941 | 0.683 | 2716.555 |
| Decision Tree | 0.807 | 0.996 | 301.245 | 0.829 | 1994.828 |
| Extra Trees | 0.794 | 0.996 | 301.245 | 0.762 | 2357.402 |
| Support Vector Machine | 0.041 | 0.043 | 4488.690 | 0.030 | 4755.974 |
| Random Forest | 0.880 | 0.981 | 636.214 | 0.928 | 1297.556 |
| AdaBoost | 0.391 | 0.324 | 3771.737 | 0.335 | 3937.313 |
| Gradient Boost | 0.808 | 0.831 | 1886.371 | 0.855 | 1841.166 |
| XG Boost | 0.876 | 0.971 | 778.397 | 0.931 | 1264.201 |

### Observations

1. XG boost has the best r2 score on test data
2. Random forest has the best r2 score on training data means it fit very well
3. Randaom forest has the best cross val score
4. SVM performs worst and its output is equivalent to just predicting the mean of the data 