In [95]:
# import dependencies
import warnings
warnings.filterwarnings('ignore')

In [96]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from matplotlib import pyplot as plt
import datetime as dt

In [130]:
from sklearn.preprocessing import LabelEncoder
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

ModuleNotFoundError: No module named 'xgboost'

In [98]:
# Load the data
file_path = Path("../Resources/flight_data_clean_all.csv")
df = pd.read_csv(file_path, skiprows=0)[:-2]

df.head()

Unnamed: 0.1,Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,travelDuration,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,searchDaysBeforeFlight
0,0,2022-09-28,2022-10-05,SFO,ORD,0 days 07:05:00,0,0,0,451.6,1933.0,7
1,1,2022-09-28,2022-10-05,SFO,ORD,0 days 07:52:00,0,0,0,451.6,1933.0,7
2,2,2022-09-28,2022-10-05,SFO,ORD,0 days 08:34:00,0,0,0,451.6,1933.0,7
3,3,2022-09-28,2022-10-05,SFO,PHL,0 days 07:49:00,0,0,0,103.99,2590.0,7
4,4,2022-09-28,2022-10-05,SFO,PHL,0 days 13:19:00,0,0,0,109.59,2590.0,7


In [99]:
#drop first two columns
df = df.iloc[: , 2:]
df.head()

Unnamed: 0,flightDate,startingAirport,destinationAirport,travelDuration,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,searchDaysBeforeFlight
0,2022-10-05,SFO,ORD,0 days 07:05:00,0,0,0,451.6,1933.0,7
1,2022-10-05,SFO,ORD,0 days 07:52:00,0,0,0,451.6,1933.0,7
2,2022-10-05,SFO,ORD,0 days 08:34:00,0,0,0,451.6,1933.0,7
3,2022-10-05,SFO,PHL,0 days 07:49:00,0,0,0,103.99,2590.0,7
4,2022-10-05,SFO,PHL,0 days 13:19:00,0,0,0,109.59,2590.0,7


In [100]:
#check for null values
df.isna().sum()

flightDate                     0
startingAirport                0
destinationAirport             0
travelDuration                 0
isBasicEconomy                 0
isRefundable                   0
isNonStop                      0
totalFare                      0
totalTravelDistance       116321
searchDaysBeforeFlight         0
dtype: int64

In [101]:
#drop null values. still have over 1.5 million rows of data.
df = df.dropna()
df.isna().sum()

flightDate                0
startingAirport           0
destinationAirport        0
travelDuration            0
isBasicEconomy            0
isRefundable              0
isNonStop                 0
totalFare                 0
totalTravelDistance       0
searchDaysBeforeFlight    0
dtype: int64

In [102]:
# check df datatypes
df.dtypes

flightDate                 object
startingAirport            object
destinationAirport         object
travelDuration             object
isBasicEconomy              int64
isRefundable                int64
isNonStop                   int64
totalFare                 float64
totalTravelDistance       float64
searchDaysBeforeFlight      int64
dtype: object

In [103]:
# start new df and encode startingAirport and endingAirport
le = LabelEncoder()
df2 = df.copy()
df2['startingAirport'] = le.fit_transform(df2['startingAirport'])
df2['destinationAirport'] = le.fit_transform(df2['destinationAirport'])

In [104]:
#convert travelDuration to minutes
df2['travelDurationDays'] = pd.to_numeric(df2['travelDuration'].str.slice(0,1))
df2['travelDurationHours'] = pd.to_numeric(df2['travelDuration'].str.slice(7,9))
df2['travelDurationMinutes'] = pd.to_numeric(df2['travelDuration'].str.slice(10,12))
df2['travelDurationFinal'] = ((df2['travelDurationDays']*1440)+(df2['travelDurationHours']*60)+(df2['travelDurationMinutes']))
df2 = df2.drop(columns=['travelDurationDays','travelDurationHours','travelDurationMinutes','travelDuration'])
df2.head()

Unnamed: 0,flightDate,startingAirport,destinationAirport,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,searchDaysBeforeFlight,travelDurationFinal
0,2022-10-05,15,13,0,0,0,451.6,1933.0,7,425
1,2022-10-05,15,13,0,0,0,451.6,1933.0,7,472
2,2022-10-05,15,13,0,0,0,451.6,1933.0,7,514
3,2022-10-05,15,14,0,0,0,103.99,2590.0,7,469
4,2022-10-05,15,14,0,0,0,109.59,2590.0,7,799


In [105]:
# convert flight date to number format by day of week
df2['flightDate'] = pd.to_datetime(df2['flightDate'])
s = df2['flightDate']
s = pd.Series(s)
s = s.dt.dayofweek
df2['dayOfWeek'] = s

In [107]:
df2['flightDate'] = pd.to_datetime(df2['flightDate']).astype('int64')/ 10**9

In [108]:
df2.head()

Unnamed: 0,flightDate,startingAirport,destinationAirport,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,searchDaysBeforeFlight,travelDurationFinal,dayOfWeek
0,1664928000.0,15,13,0,0,0,451.6,1933.0,7,425,2
1,1664928000.0,15,13,0,0,0,451.6,1933.0,7,472,2
2,1664928000.0,15,13,0,0,0,451.6,1933.0,7,514,2
3,1664928000.0,15,14,0,0,0,103.99,2590.0,7,469,2
4,1664928000.0,15,14,0,0,0,109.59,2590.0,7,799,2


In [109]:
df2.dtypes

flightDate                float64
startingAirport             int32
destinationAirport          int32
isBasicEconomy              int64
isRefundable                int64
isNonStop                   int64
totalFare                 float64
totalTravelDistance       float64
searchDaysBeforeFlight      int64
travelDurationFinal         int64
dayOfWeek                   int64
dtype: object

In [110]:
# Creating the scaler instance
#from sklearn.preprocessing import StandardScaler
#data_scaler = StandardScaler()
#X = pd.DataFrame(data_scaler.fit_transform(df2.drop(["totalFare"],axis = 1),),columns=['flightDate','startingAirport','destinationAirport',
                                                                                      'isBasicEconomy','isRefundable','isNonStop','totalTravelDistance',
                                                                                      'searchDaysBeforeFlight','travelDurationFinal'])
#y = df2["totalFare"]

IndentationError: unexpected indent (3520715096.py, line 5)

In [111]:
#X.head()

In [112]:
# Create our features
X = df2.copy()
X = X.drop("totalFare", axis=1)

# Create our target
y = df2["totalFare"]

In [113]:
#split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape 

(1226787, 10)

In [114]:
# fit a linear regression model
regr = LinearRegression()
regr.fit(X_train, y_train) 

LinearRegression()

In [115]:
#predict flight costs
y_prediction = regr.predict(X_test)
y_prediction

array([189.74336775, 302.78603497, 321.92313843, ...,  85.93930449,
       392.29014034, 181.50906728])

In [116]:
# evaluate linear regression model
score = r2_score(y_test,y_prediction)
print('r2 score: ',score)
print('mean_sqrd_error: ',mean_squared_error(y_test,y_prediction))
print('root_mean_squared error of: ',np.sqrt(mean_squared_error(y_test,y_prediction)))

r2 score:  0.4311934359391294
mean_sqrd_error:  21270.609921908966
root_mean_squared error of:  145.84447168785303


In [117]:
#initialize random forest regressor model
X_train, y_train = make_regression(n_features=10)
forestRegr = RandomForestRegressor(max_depth=50, min_samples_split=20, n_estimators=100, random_state=0)

In [118]:
#fit random forest regessor data
forestRegr.fit(X_train, y_train)

RandomForestRegressor(max_depth=50, min_samples_split=20, random_state=0)

In [119]:
#run random forest regressor model
forestPredict = forestRegr.predict(X_test)
forestPredict

array([171.13310522, 131.47108607,  93.43024114, ..., 171.13310522,
        93.43024114, 171.13310522])

In [120]:
#evaluate random forest regressor model
forestScore = r2_score(y_test, forestPredict)
print('r2 score: ',forestScore)
print('mean_sqrd_error: ',mean_squared_error(y_test,forestPredict))
print('root_mean_squared error of: ',np.sqrt(mean_squared_error(y_test,forestPredict)))

r2 score:  -1.1753217395677407
mean_sqrd_error:  81346.49474973754
root_mean_squared error of:  285.2130690374085


In [121]:
forestRegr.feature_importances_

array([0.01358845, 0.0569177 , 0.10327355, 0.10812218, 0.42752967,
       0.20498083, 0.00627851, 0.00881101, 0.04546116, 0.02503694])

In [127]:
#how many flights were nonstop
nonStop = df.loc[df["isNonStop"] == 1, :]
nonStop

Unnamed: 0,flightDate,startingAirport,destinationAirport,travelDuration,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,searchDaysBeforeFlight
20,2022-10-05,SFO,PHL,0 days 05:19:00,0,0,1,438.6,2516.0,7
21,2022-10-05,SFO,PHL,0 days 05:20:00,0,0,1,438.6,2516.0,7
22,2022-10-05,SFO,PHL,0 days 05:21:00,0,0,1,438.6,2516.0,7
23,2022-10-05,SFO,PHL,0 days 05:21:00,0,0,1,438.6,2516.0,7
69,2022-10-06,ATL,BOS,0 days 02:32:00,0,0,1,298.6,947.0,8
...,...,...,...,...,...,...,...,...,...,...
1752009,2022-11-09,CLT,LGA,0 days 01:57:00,1,0,1,110.1,543.0,39
1752010,2022-11-09,CLT,LGA,0 days 02:00:00,1,0,1,110.1,543.0,39
1752011,2022-11-09,CLT,LGA,0 days 02:00:00,1,0,1,110.1,543.0,39
1752012,2022-11-09,CLT,LGA,0 days 02:03:00,1,0,1,110.1,543.0,39


In [124]:
#how many flights were refundable
refundable = df.loc[df["isRefundable"] == 1, :]
refundable

Unnamed: 0,flightDate,startingAirport,destinationAirport,travelDuration,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,searchDaysBeforeFlight
393695,2022-09-30,DFW,DEN,0 days 03:20:00,0,1,0,472.41,752.0,1
399870,2022-09-30,ORD,DEN,0 days 04:30:00,0,1,0,492.41,1045.0,1
472951,2022-10-07,DFW,DEN,0 days 03:20:00,0,1,0,422.4,752.0,8
850161,2022-10-01,DFW,DEN,0 days 03:20:00,0,1,0,472.41,752.0,1
918542,2022-10-07,DFW,DEN,0 days 03:20:00,0,1,0,422.4,752.0,7
1360062,2022-10-02,DFW,DEN,0 days 03:20:00,0,1,0,472.41,752.0,1
1365556,2022-10-02,ORD,DEN,0 days 04:30:00,0,1,0,492.41,1045.0,1
1418041,2022-10-07,DFW,DEN,0 days 03:20:00,0,1,0,422.4,752.0,6


In [128]:
#score linear regression
regrScore = regr.score(X_test,y_test)
regrScore

0.4311934359391294

In [None]:
XGBR = XGBRegressor()