In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import  classification_report , mean_squared_error, mean_absolute_error ,r2_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the prepared CSV and discard every row that has a missing value in any
# of the file's columns (the drop happens before the column selection below).
ds = pd.read_csv('/content/drive/My Drive/flight project /finaldata.csv').dropna()

# Names of the two prediction targets: a 0/1 label for the classifier and a
# number of minutes for the regressor.
classification_target = 'ArrDel15'
regression_target = 'ArrDelayMinutes'

# Columns retained for modelling; the two targets sit at the end of the list
# and are split off from the predictors in a later cell.
features = [
    'CRSDepTime', 'CRSArrTime', 'DepDelayMinutes', 'Dest',
    'windspeedKmph', 'weatherCode', 'precipMM', 'pressure',
    'cloudcover', 'DewPointF', 'WindGustKmph', 'Origin',
    'tempF', 'WindChillF',
    classification_target, regression_target,
]
ds = ds.loc[:, features]

In [3]:
# Predictor matrix: every retained column except the two targets.
X = ds.drop(columns=[classification_target, regression_target])

# Replace the string-valued 'Origin' and 'Dest' columns with integer labels,
# using a fresh encoder for each column.
for categorical_col in ('Origin', 'Dest'):
    X[categorical_col] = LabelEncoder().fit_transform(X[categorical_col])

y_c = ds[classification_target]
y_r = ds[regression_target]

# One call splits the predictors and both targets together, so the rows of
# all six outputs stay aligned with each other.
(X_train, X_test,
 y_train_c, y_test_c,
 y_train_r, y_test_r) = train_test_split(X, y_c, y_r, test_size=0.20, random_state=0)

# Standardise the predictors with statistics learned from the training part
# only, then apply the same transformation to the test part.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [4]:
# Plain NumPy copies of the regression targets, so later cells can index them
# by row position instead of by the pandas index labels.
y_train_r_array, y_test_r_array = (
    np.array(target_part) for target_part in (y_train_r, y_test_r)
)

In [6]:
# Separating the data of delayed flights (classification label == 1); these
# subsets are what the regression model is trained and scored on.
#
# X_train / X_test are NumPy arrays after scaling, so rows are selected by
# position. A boolean mask taken from the label Series (same row order as the
# arrays) picks exactly the rows the previous per-row append loop picked,
# without iterating over every flight in Python.
train_delayed_mask = (y_train_c == 1).to_numpy()
test_delayed_mask = (y_test_c == 1).to_numpy()

X_train_delayed = X_train[train_delayed_mask]
y_train_delayed = y_train_r_array[train_delayed_mask]
X_test_delayed = X_test[test_delayed_mask]
y_test_delayed = y_test_r_array[test_delayed_mask]

In [7]:
# Training the regression model on the delayed training flights only, then
# scoring it on the delayed flights of the held-out split.
# random_state is fixed (same value as the train/test split) so a fresh
# Restart & Run All reproduces the printed metrics.
regressor = GradientBoostingRegressor(random_state=0)
regressor.fit(X_train_delayed, y_train_delayed)
y_pred_regressor = regressor.predict(X_test_delayed)

print('r2_score:', r2_score(y_test_delayed, y_pred_regressor))
# The square root of the MSE is reported, i.e. the RMSE (same units as the
# target), so the label says so.
print('root_mean_squared_error:', mean_squared_error(y_test_delayed, y_pred_regressor) ** 0.5)
print('mean_absolute_error:', mean_absolute_error(y_test_delayed, y_pred_regressor))


r2_score: 0.9452650870680406
mean_squared_error: 17.070280443026864
mean_absolute_error: 11.732287128893626


In [8]:
# Training the classification model (label 0 vs 1) on all training flights and
# reporting per-class precision / recall on the held-out split.
# random_state is fixed (same value as the train/test split) so the report is
# reproducible; n_jobs=-1 builds the trees on all available cores and does not
# change the fitted model.
classifier = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
classifier.fit(X_train, y_train_c)
y_pred_classifier = classifier.predict(X_test)
print(classification_report(y_test_c, y_pred_classifier))

              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95    292543
         1.0       0.89      0.69      0.78     77745

    accuracy                           0.92    370288
   macro avg       0.91      0.84      0.86    370288
weighted avg       0.92      0.92      0.91    370288



In [9]:
# Getting the test rows the classifier predicted as delayed (label 1); only
# these rows are handed to the regressor.
# `check` stays a one-column DataFrame (column 0 holds the predictions)
# because the next cell reads it again.
check = pd.DataFrame(y_pred_classifier)

# Positional boolean mask instead of appending rows one by one in a Python
# loop; it selects the same rows in the same order.
predicted_delayed_mask = (check[0] == 1).to_numpy()
X_regressor_test = X_test[predicted_delayed_mask]

# Predicting the delay for the flights flagged as delayed.
y_predict_delay = regressor.predict(X_regressor_test)

In [10]:
# True arrival delays of the test flights the classifier flagged as delayed
# (check[0] == 1), selected by position so they line up row-for-row with
# y_predict_delay. A boolean mask replaces the per-row append loop.
y_test_true = y_test_r_array[(check[0] == 1).to_numpy()]

print('r2_score:', r2_score(y_test_true, y_predict_delay))
# The square root of the MSE is reported, i.e. the RMSE, so the label says so.
print('root_mean_squared_error:', mean_squared_error(y_test_true, y_predict_delay) ** 0.5)
print('mean_absolute_error:', mean_absolute_error(y_test_true, y_predict_delay))

r2_score: 0.9465727795150413
mean_squared_error: 18.668052325573413
mean_absolute_error: 13.659656558332225
