In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import bar
import seaborn as sns
# sklearn stuff
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
#referencing the github link won't work now that the repository has been taken down
#download the excel file, then convert to csv
df = pd.read_csv('flights2DCMDVA.csv', index_col=[])
df.head()

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2013,1,1,629.0,-1.0,721.0,-19.0,WN,N273WN,4646,LGA,BWI,40.0,185,6.0,29.0
1,2013,1,1,848.0,853.0,1001.0,851.0,MQ,N942MQ,3944,JFK,BWI,41.0,184,8.0,48.0
2,2013,1,1,1059.0,-1.0,1201.0,-14.0,WN,N505SW,321,LGA,BWI,43.0,185,10.0,59.0
3,2013,1,1,1158.0,-2.0,1256.0,-4.0,WN,N783SW,1568,EWR,BWI,38.0,169,11.0,58.0
4,2013,1,1,1316.0,2.0,1412.0,-3.0,EV,N11551,4340,EWR,BWI,38.0,169,13.0,16.0


In [3]:
df.describe()

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,flight,air_time,distance,hour,minute
count,17166.0,17166.0,17166.0,16227.0,16227.0,16180.0,16164.0,17166.0,16164.0,17166.0,16227.0,16227.0
mean,2013.0,6.244961,15.555983,1383.986874,13.155605,1497.763412,10.839396,3666.89153,45.743752,212.319352,13.489554,35.031429
std,0.0,3.442076,8.741055,491.095434,41.697742,501.357545,45.410635,1390.491929,6.551112,14.252636,4.943119,19.029893
min,2013.0,1.0,1.0,2.0,-32.0,1.0,-62.0,63.0,31.0,169.0,0.0,0.0
25%,2013.0,3.0,8.0,956.0,-6.0,1109.0,-13.0,2187.0,41.0,212.0,9.0,19.0
50%,2013.0,6.0,15.0,1435.0,-3.0,1543.0,-3.0,3761.0,45.0,214.0,14.0,38.0
75%,2013.0,9.0,23.0,1819.0,11.0,1922.0,16.0,4418.0,49.0,228.0,18.0,53.0
max,2013.0,12.0,31.0,2400.0,853.0,2400.0,851.0,6181.0,131.0,229.0,24.0,59.0


In [4]:
#model can't handle nan values, so drop them
df.dropna(inplace=True)

In [5]:
df['carrier'] = df.carrier.factorize()[0]
df['dest'] = df.dest.factorize()[0]
df['origin'] = df.origin.factorize()[0]

In [6]:
df.head()

Unnamed: 0,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,tailnum,flight,origin,dest,air_time,distance,hour,minute
0,2013,1,1,629.0,-1.0,721.0,-19.0,0,N273WN,4646,0,0,40.0,185,6.0,29.0
1,2013,1,1,848.0,853.0,1001.0,851.0,1,N942MQ,3944,1,0,41.0,184,8.0,48.0
2,2013,1,1,1059.0,-1.0,1201.0,-14.0,0,N505SW,321,0,0,43.0,185,10.0,59.0
3,2013,1,1,1158.0,-2.0,1256.0,-4.0,0,N783SW,1568,2,0,38.0,169,11.0,58.0
4,2013,1,1,1316.0,2.0,1412.0,-3.0,2,N11551,4340,2,0,38.0,169,13.0,16.0


In [7]:
#Split dataset into two (training 80%, validation (testing) %20, random_state=1).
X = df.loc[:,['month', 'day', 'distance']].values
Y = df.loc[:,'arr_delay'].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=1)

#fit the model
model = LinearRegression()
model.fit(X_train, Y_train)

# evaluate the model
yhat = model.predict(X_test)

# evaluate predictions
max_relative_error = np.amax(abs((Y_test-yhat)/yhat))
print('max_relative_error (%):', 100*max_relative_error)
print('Rsquared:',np.corrcoef(Y_test, yhat)[0,1]**2)

max_relative_error (%): 3824.460137644558
Rsquared: 0.0030031150292381026


In [8]:
#download the test dataset and predic the arrival delays of these flights
df_test = pd.read_csv('flights_test_data.csv', index_col=[])
df_test.drop(['year','carrier', 'origin', 'dest'], axis=1, inplace=True)

In [9]:
#convert df to numpy array
test_info = df_test.to_numpy()

In [10]:
model.predict(test_info)

array([13.07651325, 13.86611448, 12.84184608, 11.98386038, 12.57365303,
       12.13784459, 10.90271901, 10.5844062 ,  9.30816553, 10.27556096,
        8.55850522,  9.68059612,  8.41532125,  8.63063937,  7.35306602,
        7.43170504,  7.43303771,  8.31600051,  8.34114317,  8.85096412])

In [11]:
#Part 5: Multiple Logistic Regression Model, guess the 3 cancelled flights in test dataset
#Split dataset into two (training 80%, validation (testing) %20, random_state=1).

X = df.loc[:,['dep_delay']].values
Y = df.loc[:,'arr_delay'].values
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

#fit the model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg.score(X_test, y_test)
y_predict = logreg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [12]:
df.groupby(['day']).agg(['mean']).sort_values(('arr_delay', 'mean'))['arr_delay']

Unnamed: 0_level_0,mean
day,Unnamed: 1_level_1
15,0.314974
29,0.794606
5,1.120075
4,2.203901
6,3.255361
30,3.312766
20,3.719039
14,4.583181
16,6.038961
21,6.757798


In [13]:
#Split dataset into two (training 80%, validation (testing) %20, random_state=1).
X = df.loc[:,['dep_delay']].values
Y = df.loc[:,'arr_delay'].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=1)

#fit the model
model = LinearRegression()
model.fit(X_train, Y_train)

# evaluate the model
yhat = model.predict(X_test)

# evaluate predictions
max_relative_error = np.amax(abs((Y_test-yhat)/yhat))
print('max_relative_error (%):', 100*max_relative_error)
print('Rsquared:',np.corrcoef(Y_test, yhat)[0,1]**2)

max_relative_error (%): 10461.047087356215
Rsquared: 0.8770655977094658


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f8475077-1815-4082-a9b9-b766f4497f53' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>