In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
os.chdir(r'C:\Users\91938\Desktop\Preprocessor')

In [3]:
train = pd.read_csv('business.csv')
test = pd.read_csv('economy.csv')
data = pd.read_csv('Clean_Dataset.csv')

In [4]:
train.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price
0,11-02-2022,Air India,AI,868,18:00,Delhi,02h 00m,non-stop,20:00,Mumbai,25612
1,11-02-2022,Air India,AI,624,19:00,Delhi,02h 15m,non-stop,21:15,Mumbai,25612
2,11-02-2022,Air India,AI,531,20:00,Delhi,24h 45m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:45,Mumbai,42220
3,11-02-2022,Air India,AI,839,21:25,Delhi,26h 30m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,23:55,Mumbai,44450
4,11-02-2022,Air India,AI,544,17:15,Delhi,06h 40m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,23:55,Mumbai,46690


In [5]:
test.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price
0,11-02-2022,SpiceJet,SG,8709,18:55,Delhi,02h 10m,non-stop,21:05,Mumbai,5953
1,11-02-2022,SpiceJet,SG,8157,06:20,Delhi,02h 20m,non-stop,08:40,Mumbai,5953
2,11-02-2022,AirAsia,I5,764,04:25,Delhi,02h 10m,non-stop,06:35,Mumbai,5956
3,11-02-2022,Vistara,UK,995,10:20,Delhi,02h 15m,non-stop,12:35,Mumbai,5955
4,11-02-2022,Vistara,UK,963,08:50,Delhi,02h 20m,non-stop,11:10,Mumbai,5955


In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [7]:
data['airline'].value_counts(dropna = False)

Vistara      127859
Air_India     80892
Indigo        43120
GO_FIRST      23173
AirAsia       16098
SpiceJet       9011
Name: airline, dtype: int64

In [8]:
data['flight'].nunique()

1561

In [9]:
data['source_city'].value_counts(dropna = False)

Delhi        61343
Mumbai       60896
Bangalore    52061
Kolkata      46347
Hyderabad    40806
Chennai      38700
Name: source_city, dtype: int64

In [10]:
data['destination_city'].value_counts(dropna = False)

Mumbai       59097
Delhi        57360
Bangalore    51068
Kolkata      49534
Hyderabad    42726
Chennai      40368
Name: destination_city, dtype: int64

In [11]:
data['class'].value_counts(dropna = False)

Economy     206666
Business     93487
Name: class, dtype: int64

In [12]:
data['days_left'].nunique()

49

In [13]:
data['duration'].value_counts(dropna = False)

2.17     4242
2.25     4036
2.75     2879
2.08     2755
2.83     2323
         ... 
37.17       1
38.75       1
38.50       1
36.25       1
41.50       1
Name: duration, Length: 476, dtype: int64

In [14]:
data['arrival_time'].value_counts(dropna = False)

Night            91538
Evening          78323
Morning          62735
Afternoon        38139
Early_Morning    15417
Late_Night       14001
Name: arrival_time, dtype: int64

In [15]:
data['stops'].value_counts(dropna = False)

one            250863
zero            36004
two_or_more     13286
Name: stops, dtype: int64

In [16]:
data['price'].unique()

array([ 5953,  5956,  5955, ..., 87051, 74731, 77105], dtype=int64)

In [17]:
data['flight'] = data['flight'].str[0:2]

In [18]:
data.head()

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [19]:
data['flight'].unique()

array(['SG', 'I5', 'UK', 'G8', '6E', 'AI'], dtype=object)

In [20]:
one_hot = pd.get_dummies(data.drop(columns = ['Unnamed: 0', 'duration', 'days_left', 'price']))

In [21]:
one_hot.head()

Unnamed: 0,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,flight_6E,flight_AI,flight_G8,flight_I5,...,arrival_time_Morning,arrival_time_Night,destination_city_Bangalore,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai,class_Business,class_Economy
0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
1,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
2,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1


In [22]:
datas = data.drop(columns = ['Unnamed: 0', 'airline', 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class'])

In [23]:
datas.head()

Unnamed: 0,duration,days_left,price
0,2.17,1,5953
1,2.33,1,5953
2,2.17,1,5956
3,2.25,1,5955
4,2.33,1,5955


In [24]:
data_preprocessed = pd.concat([one_hot, datas], axis = 1)

In [25]:
data_preprocessed.head(10)

Unnamed: 0,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,flight_6E,flight_AI,flight_G8,flight_I5,...,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai,class_Business,class_Economy,duration,days_left,price
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,1,2.17,1,5953
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,1,2.33,1,5953
2,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,1,2.17,1,5956
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,2.25,1,5955
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,2.33,1,5955
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,2.33,1,5955
6,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,2.08,1,6060
7,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,2.17,1,6060
8,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,1,2.17,1,5954
9,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,1,2.25,1,5954


In [26]:
x = data_preprocessed.drop(columns = 'price')

In [27]:
y = data_preprocessed['price']

In [28]:
x.head(1)

Unnamed: 0,airline_AirAsia,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,flight_6E,flight_AI,flight_G8,flight_I5,...,destination_city_Bangalore,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai,class_Business,class_Economy,duration,days_left
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,2.17,1


In [29]:
y.head(1)

0    5953
Name: price, dtype: int64

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 30)

### Linear Regression

In [31]:
model = LinearRegression()

In [32]:
model.fit(x_train, y_train)

LinearRegression()

In [33]:
model.coef_

array([ 6.41722361e+14, -9.36102733e+14, -7.24356037e+14, -9.96822679e+13,
        1.39061766e+14, -4.30148782e+14, -6.15904543e+14,  2.20515922e+14,
        8.76922598e+12, -1.35730917e+15, -8.54648577e+14, -2.85438029e+14,
        2.82465800e+14,  2.82465800e+14,  2.82465800e+14,  2.82465800e+14,
        2.82465800e+14,  2.82465800e+14,  4.16311091e+12,  4.16311091e+12,
        4.16311091e+12,  4.16311091e+12,  4.16311091e+12,  4.16311091e+12,
       -5.22640147e+12, -5.22640146e+12, -5.22640147e+12, -2.08910450e+13,
       -2.08910450e+13, -2.08910450e+13, -2.08910450e+13, -2.08910450e+13,
       -2.08910450e+13, -2.22019675e+12, -2.22019675e+12, -2.22019675e+12,
       -2.22019675e+12, -2.22019675e+12, -2.22019675e+12, -1.39027991e+13,
       -1.39027991e+13,  4.25055940e+01, -1.31340058e+02])

In [34]:
model.intercept_

471198342999805.7

In [35]:
train_pred = model.predict(x_train)
test_pred = model.predict(x_test)

In [36]:
print('Train RMSE :', np.sqrt(mean_squared_error(y_train, train_pred)))
print('Train r2_score :', r2_score(y_train, train_pred))

print('Test RMSE :', np.sqrt(mean_squared_error(y_test, test_pred)))
print('Test r2_score :', r2_score(y_test, test_pred))

Train RMSE : 6744.588766881771
Train r2_score : 0.9116565593898968
Test RMSE : 6791.943682800942
Test r2_score : 0.9106460508686807


### Decision Tree

In [37]:
tree = DecisionTreeRegressor()

In [38]:
tree.fit(x_train, y_train)

DecisionTreeRegressor()

In [39]:
train_pred_tree = model.predict(x_train)
test_pred_tree = model.predict(x_test)

In [40]:
print('Train RMSE tree :', np.sqrt(mean_squared_error(y_train, train_pred_tree)))
print('Train r2_score tree:', r2_score(y_train, train_pred_tree))

print('Test RMSE tree:', np.sqrt(mean_squared_error(y_test, test_pred_tree)))
print('Test r2_score tree:', r2_score(y_test, test_pred_tree))

Train RMSE tree : 6744.588766881771
Train r2_score tree: 0.9116565593898968
Test RMSE tree: 6791.943682800942
Test r2_score tree: 0.9106460508686807


In [47]:
test_pred = model.predict(data_preprocessed.drop(columns = 'price'))

In [49]:
output = data_preprocessed[['price']]

In [50]:
output.to_csv('Flight_price_prediction_linear_regression.csv', index = False)