In [130]:
pip install statsmodels

Note: you may need to restart the kernel to use updated packages.


### Linear Regression for DepDelay

In [131]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_curve, auc

In [132]:
train = pd.read_csv('sfo_weather_train.csv')
test = pd.read_csv('sfo_weather_test.csv')

In [133]:
train.columns

Index(['Precip', 'Air_Temp_Max', 'Air_Temp_Min', 'DayOfWeek',
       'Operating_Airline', 'Dest', 'DestState', 'DepDelay', 'DepDel15',
       'DepTimeBlk', 'Distance', 'DistanceGroup', 'DepDelayBinary'],
      dtype='object')

In [152]:
x_train = train.drop(columns=['DepDelay', 'DepDel15', 'DepDelayBinary'])
x_test = test.drop(columns=['DepDelay', 'DepDel15', 'DepDelayBinary'])

y_train_reg = train.loc[:, 'DepDelay']
y_train_class = train.loc[:, 'DepDelayBinary']
y_train_delay15 = train.loc[:, 'DepDel15']

y_test_reg = test.loc[:, 'DepDelay']
y_test_class = test.loc[:, 'DepDelayBinary']
y_test_delay15 = test.loc[:, 'DepDel15']

In [153]:
#ols = smf.ols(formula='DepDelay ~ Precip + Air_Temp_Max + Air_Temp_Min + Distance + DayOfWeek_2 + DayOfWeek_3 + DayOfWeek_4 + DayOfWeek_5 + DayOfWeek_6 + DayOfWeek_7 + Operating_Airline_AS + Operating_Airline_B6 + Operating_Airline_DL + Operating_Airline_F9 + Operating_Airline_HA + Operating_Airline_OO + Operating_Airline_QX + Operating_Airline_UA + Operating_Airline_WN + Dest_ACV + Dest_ANC + Dest_ASE + Dest_ATL + Dest_AUS + Dest_BDL + Dest_BFL + Dest_BIH + Dest_BNA + Dest_BOI + Dest_BOS + Dest_BUR + Dest_BWI + Dest_BZN + Dest_CLE + Dest_CLT + Dest_CMH + Dest_DAL + Dest_DCA + Dest_DEN + Dest_DFW + Dest_DTW + Dest_EGE + Dest_EUG + Dest_EWR + Dest_FAT + Dest_FCA + Dest_FLL + Dest_GEG + Dest_HDN + Dest_HNL + Dest_IAD + Dest_IAH + Dest_IND + Dest_JAC + Dest_JFK + Dest_KOA + Dest_LAS + Dest_LAX + Dest_LIH + Dest_MCO + Dest_MDW + Dest_MFR + Dest_MIA + Dest_MRY + Dest_MSO + Dest_MSP + Dest_MSY + Dest_MTJ + Dest_OGG + Dest_OMA + Dest_ONT + Dest_ORD + Dest_OTH + Dest_PAE + Dest_PDX + Dest_PHL + Dest_PHX + Dest_PIT + Dest_PSC + Dest_PSP + Dest_RDD + Dest_RDM + Dest_RDU + Dest_RNO + Dest_RSW + Dest_SAN + Dest_SAT + Dest_SBA + Dest_SBP + Dest_SDF + Dest_SEA + Dest_SLC + Dest_SMF + Dest_SNA + Dest_STL + Dest_STS + Dest_SUN + Dest_TPA + Dest_TUS + DestState_AZ + DestState_CA + DestState_CO + DestState_CT + DestState_FL + DestState_GA + DestState_HI + DestState_ID + DestState_IL + DestState_IN + DestState_KY + DestState_LA + DestState_MA + DestState_MD + DestState_MI + DestState_MN + DestState_MO + DestState_MT + DestState_NC + DestState_NE + DestState_NJ + DestState_NM + DestState_NV + DestState_NY + DestState_OH + DestState_OR + DestState_PA + DestState_TN + DestState_TX + DestState_UT + DestState_VA + DestState_WA + DestState_WY', data=train)
ols = smf.ols(formula = 'DepDelay ~ Precip + Air_Temp_Max + Air_Temp_Min + DayOfWeek + Operating_Airline + Dest + DestState + DepTimeBlk + Distance + DistanceGroup',
                     data = train)
model1 =ols.fit()
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:               DepDelay   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     5.953
Date:                Fri, 16 Dec 2022   Prob (F-statistic):           1.92e-78
Time:                        18:52:57   Log-Likelihood:            -2.6693e+05
No. Observations:               51530   AIC:                         5.341e+05
Df Residuals:                   51418   BIC:                         5.351e+05
Df Model:                         111                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [154]:
y_pred_reg = model1.predict(x_test)
pd.DataFrame(y_pred_reg, y_test_reg)

Unnamed: 0_level_0,0
DepDelay,Unnamed: 1_level_1
-4.0,
5.0,10.788989
-10.0,
-7.0,
16.0,14.018292
...,...
-3.0,
6.0,13.703833
3.0,11.045487
144.0,28.782486


In [155]:
def OSR2(model, X_test, y_test, y_train):
    
    y_pred = model.predict(X_test)
    SSE = np.sum((y_test - y_pred)**2)
    SST = np.sum((y_test - np.mean(y_train))**2)
                 
    return (1 - SSE/SST)

In [156]:
OSR2(model1, x_test, y_test_reg, y_train_reg)

0.007477117506624609

In [157]:
y_pred_reg_binary = [1 if x > 0 else 0 for x in y_pred_reg]
cm = confusion_matrix(y_test_class, y_pred_reg_binary)
print ("Confusion Matrix : \n", cm)

acc = (cm.ravel()[0]+cm.ravel()[3])/sum(cm.ravel())
TPR = cm.ravel()[3]/(cm.ravel()[3]+cm.ravel()[2])
FPR = cm.ravel()[1]/(cm.ravel()[1]+cm.ravel()[0])
PRE = cm.ravel()[3]/(cm.ravel()[3]+cm.ravel()[1])
print('Accuracy is: %.4f' %acc)
print('TPR is: %.4f' % TPR)
print('FPR is: %.4f' % FPR)
print('PRE is: %.4f' % PRE)

Confusion Matrix : 
 [[  574 13530]
 [  119  7862]]
Accuracy is: 0.3820
TPR is: 0.9851
FPR is: 0.9593
PRE is: 0.3675


#### Prediciting if a Flight is late by more than 15 minutes

In [159]:
y_pred_reg_binary15 = [1 if x > 15 else 0 for x in y_pred_reg]
cm = confusion_matrix(y_test_delay15, y_pred_reg_binary15)
print ("Confusion Matrix : \n", cm)

acc = (cm.ravel()[0]+cm.ravel()[3])/sum(cm.ravel())
TPR = cm.ravel()[3]/(cm.ravel()[3]+cm.ravel()[2])
FPR = cm.ravel()[1]/(cm.ravel()[1]+cm.ravel()[0])
PRE = cm.ravel()[3]/(cm.ravel()[3]+cm.ravel()[1])
print('Accuracy is: %.4f' %acc)
print('TPR is: %.4f' % TPR)
print('FPR is: %.4f' % FPR)
print('PRE is: %.4f' % PRE)

Confusion Matrix : 
 [[16604  1629]
 [ 3226   626]]
Accuracy is: 0.7802
TPR is: 0.1625
FPR is: 0.0893
PRE is: 0.2776


In [174]:
x_train.loc[:,['Precip']]
#['Precip, Air_Temp_Max, Air_Temp_Min,DayOfWeek,Operating_Airline ,Dest ,DestState,DepTimeBlk , Distance , DistanceGroup']

Unnamed: 0,Precip
0,0.12
1,0.00
2,0.00
3,0.12
4,0.06
...,...
51525,0.03
51526,0.00
51527,0.01
51528,0.10


In [183]:
# values = sm.add_constant(x_train.loc[:,['Precip', 'Air_Temp_Max', 'Air_Temp_Min','DayOfWeek','Operating_Airline' ,'Dest' ,'DestState','DepTimeBlk', 'Distance' , 'DistanceGroup']]).values
# variance_inflation_factor(values, 0)
#x_train.loc[:,['Precip', 'Air_Temp_Max', 'Air_Temp_Min','DayOfWeek','Operating_Airline' ,'Dest' ,'DestState','DepTimeBlk', 'Distance' , 'DistanceGroup']]
x_train.loc[:,['Precip', 'Air_Temp_Max', 'Air_Temp_Min','DayOfWeek','Operating_Airline' ,'Dest' ,'DestState','DepTimeBlk', 'Distance' , 'DistanceGroup']]

Unnamed: 0,Precip,Air_Temp_Max,Air_Temp_Min,DayOfWeek,Operating_Airline,Dest,DestState,DepTimeBlk,Distance,DistanceGroup
0,0.12,57,44,7,AS,SEA,WA,2100-2159,679.0,3
1,0.00,69,53,7,UA,LAS,NV,1900-1959,414.0,2
2,0.00,68,53,2,UA,SNA,CA,1000-1059,372.0,2
3,0.12,57,45,5,DL,SLC,UT,1000-1059,599.0,3
4,0.06,65,50,3,AS,LAS,NV,1700-1759,414.0,2
...,...,...,...,...,...,...,...,...,...,...
51525,0.03,67,51,3,UA,PDX,OR,1900-1959,550.0,3
51526,0.00,73,55,5,QX,SEA,WA,0600-0659,679.0,3
51527,0.01,70,54,3,WN,SAN,CA,2000-2059,447.0,2
51528,0.10,56,44,7,UA,SNA,CA,1900-1959,372.0,2


In [184]:
# Attempt to improve model using VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def VIF(df, columns):
    values = sm.add_constant(df.loc[:,columns]).values
    num_columns = len(columns)+1
    vif = [variance_inflation_factor(values, i) for i in range(num_columns)]
    return pd.Series(vif[1:], index=columns)

vif_list = VIF(x_train,['Precip', 'Air_Temp_Max', 'Air_Temp_Min','DayOfWeek', 'Distance' , 'DistanceGroup'])

In [195]:
vif_df = pd.DataFrame(vif_list)
#ind = vif_df[vif_df[0] < 10].index
vif_df.sort_values(0)

Unnamed: 0,0
DayOfWeek,1.02653
Precip,2.971195
Air_Temp_Min,17.550114
Air_Temp_Max,20.759909
DistanceGroup,227.234729
Distance,227.239925


In [186]:
#improved model
ols2 = smf.ols(formula='DepDelay ~ Precip + DayOfWeek', data=train)
model2 =ols2.fit()
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:               DepDelay   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     47.79
Date:                Fri, 16 Dec 2022   Prob (F-statistic):           1.84e-21
Time:                        19:15:20   Log-Likelihood:            -2.6721e+05
No. Observations:               51530   AIC:                         5.344e+05
Df Residuals:                   51527   BIC:                         5.345e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      8.7706      0.461     19.021      0.0

In [196]:
OSR2(model2, x_test, y_test_reg, y_train_reg)

0.000979034233431375

In [200]:
y_pred_reg2 = model2.predict(x_test)
pd.DataFrame(y_pred_reg2, y_test_reg)

Unnamed: 0_level_0,0
DepDelay,Unnamed: 1_level_1
-4.0,
5.0,10.783699
-10.0,
-7.0,
16.0,8.137151
...,...
-3.0,
6.0,6.863706
3.0,10.537142
144.0,7.512885


In [201]:
y_pred_reg_binary2 = [1 if x > 0 else 0 for x in y_pred_reg2]
cm = confusion_matrix(y_test_class, y_pred_reg_binary2)
print ("Confusion Matrix : \n", cm)

acc = (cm.ravel()[0]+cm.ravel()[3])/sum(cm.ravel())
TPR = cm.ravel()[3]/(cm.ravel()[3]+cm.ravel()[2])
FPR = cm.ravel()[1]/(cm.ravel()[1]+cm.ravel()[0])
PRE = cm.ravel()[3]/(cm.ravel()[3]+cm.ravel()[1])
print('Accuracy is: %.4f' %acc)
print('TPR is: %.4f' % TPR)
print('FPR is: %.4f' % FPR)
print('PRE is: %.4f' % PRE)

Confusion Matrix : 
 [[   40 14064]
 [   52  7929]]
Accuracy is: 0.3608
TPR is: 0.9935
FPR is: 0.9972
PRE is: 0.3605


In [202]:
y_pred_reg_binary15_ = [1 if x > 15 else 0 for x in y_pred_reg2]
cm = confusion_matrix(y_test_delay15, y_pred_reg_binary15_)
print ("Confusion Matrix : \n", cm)

acc = (cm.ravel()[0]+cm.ravel()[3])/sum(cm.ravel())
TPR = cm.ravel()[3]/(cm.ravel()[3]+cm.ravel()[2])
FPR = cm.ravel()[1]/(cm.ravel()[1]+cm.ravel()[0])
PRE = cm.ravel()[3]/(cm.ravel()[3]+cm.ravel()[1])
print('Accuracy is: %.4f' %acc)
print('TPR is: %.4f' % TPR)
print('FPR is: %.4f' % FPR)
print('PRE is: %.4f' % PRE)

Confusion Matrix : 
 [[18233     0]
 [ 3852     0]]
Accuracy is: 0.8256
TPR is: 0.0000
FPR is: 0.0000
PRE is: nan


  PRE = cm.ravel()[3]/(cm.ravel()[3]+cm.ravel()[1])
