In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import csv
import os
import tabulate
from sklearn.preprocessing import LabelEncoder
import matplotlib
from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.model_selection import cross_val_score, KFold
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
get_ipython().magic('matplotlib inline')
matplotlib.rcParams.update({'font.size': 12})
matplotlib.rc('xtick', labelsize=8) 

In [None]:
# Start the wall-clock timer that the final "Overall execution" cell reads.
startTime = datetime.now()

# Relative file names used below resolve against this folder once it is the working directory.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
folderPath = "C:/Users/sulayako/datamodels/Airline-delay-prediction-in-Python-master/data"
os.chdir(folderPath)

# Read every CSV in the folder whose name contains the January 2017 on-time file name.
fdata_list = [
    pd.read_csv(filename)
    for filename in os.listdir(os.getcwd())
    if filename.endswith('.csv') and "On_Time_On_Time_Performance_2017_1.csv" in filename
]

# Fail with an actionable message instead of pd.concat's generic error on an empty list.
if not fdata_list:
    raise FileNotFoundError(
        "No CSV matching 'On_Time_On_Time_Performance_2017_1.csv' found in " + folderPath)

# Concatenate all DataFrames in the list
fdata = pd.concat(fdata_list, ignore_index=True)

In [None]:
# Load the January 2017 on-time performance file straight from the data folder.
folderPath = "C:/Users/sulayako/datamodels/Airline-delay-prediction-in-Python-master/data"
fileName = "On_Time_On_Time_Performance_2017_1.csv"
os.chdir(folderPath)  # make the data folder the working directory so the bare file name resolves
fdata = pd.read_csv(fileName)
fdata.shape  # (rows, columns) of the raw table

Data Wrangling

In [None]:
# Handle class imbalance by undersampling: keep the same number of rows for each class.
classDistribution = fdata['ArrDel15'].value_counts()
print('Class imbalance:')
print(classDistribution)

# Size of the rarer class. Both classes are trimmed to it, so the result is balanced
# whichever class happens to be the minority (when class 1 is the minority, all of its
# rows are kept, exactly as before).
minorityCount = classDistribution.min()
zero = fdata[fdata['ArrDel15'] == 0].tail(minorityCount)
one = fdata[fdata['ArrDel15'] == 1].tail(minorityCount)
# NOTE(review): tail() keeps the last rows in file order rather than a random sample - confirm intended.
data = pd.concat([zero, one])

del zero, one  # the intermediate frames are no longer needed

# Order the combined rows from the most recent date to the oldest
data.sort_values(['Year', 'Month', 'DayofMonth', 'DayOfWeek'], ascending=[False, False, False, False], inplace=True)

# Print balanced class distribution
print('Class imbalance evened out:')
print(data['ArrDel15'].value_counts())

# Number of columns, shown once as the cell output
len(data.columns)


In [None]:
# Preview the first two rows of the balanced, date-sorted frame.
data.head(2)

In [None]:
# Preview the last two rows of the balanced, date-sorted frame.
data.tail(2)

In [None]:
# Removing sparse columns: only the first 56 columns (by position) are kept.
# NOTE(review): 56 is a positional cut-off; presumably the later columns are mostly empty - confirm against the CSV layout.
data = data.iloc[:, :56]
print('Dimension reduced to:')
data.shape[1]

In [None]:
print('Dimension reduced to:')
print(data.shape[1])
print('Sparsity per variable:')
# Share of missing values in every retained column (0.0 = fully populated, 1.0 = entirely empty)
data.isna().mean()

In [None]:
# Delay measurements that are to be excluded as per the task, plus CancellationCode,
# which was removed due to 100% sparsity as seen in the cell above.
excluded_columns = ['DepDelay', 'DepDel15', 'ArrDelay', 'DepDelayMinutes', 'ArrDelayMinutes',
                    'CancellationCode']
data = data.drop(columns=excluded_columns)
# Formatting date for convenience: strip the dashes from the date text and store it as an integer
data["FlightDate"] = data["FlightDate"].map(lambda flight_date: int(flight_date.replace("-", "")))

In [None]:
# Show the dtype of every remaining column.
print('Datatype of variables:')
data.dtypes

In [None]:
# Overall mean of the ArrDel15 flag; drawn as the reference line in every panel
avgLate = data['ArrDel15'].mean()
# Attributes whose per-category late-arrival rate is plotted
attributes = ['Month', 'DayOfWeek', 'DayofMonth', 'DepTimeBlk', 'ArrTimeBlk', 'UniqueCarrier',
               'ArrivalDelayGroups', 'DepartureDelayGroups']

# One panel per attribute, stacked vertically
fig, axes = plt.subplots(nrows=len(attributes), ncols=1, figsize=(15, 5 * len(attributes)))

for i, pred in enumerate(attributes):
    panel = axes[i]

    # Mean of ArrDel15 within each category, ordered by category value
    group = (
        data.groupby(pred)['ArrDel15']
        .mean()
        .reset_index()
        .sort_values(by=pred)
    )

    sns.barplot(x=pred, y='ArrDel15', data=group, ax=panel)
    panel.axhline(y=avgLate, color='red', linestyle='--', label='Average')
    panel.set(ylabel='Percent of Flights that Arrive Late', title=pred)
    panel.legend()

plt.tight_layout()
plt.show()

Conversion of categorical values to numeric codes

In [None]:
# A single encoder is refitted for every column; its classes_ are captured right after
# each fit because the next fit overwrites them.
le = LabelEncoder()

# (source text column, new integer-coded column), in the order the new columns are appended
column_pairs = [
    ("UniqueCarrier", "Unique_Carrier"),
    ("Carrier", "Carrier_Name"),
    ("TailNum", "Tail_Number"),
    ("Origin", "Origin_Point"),
    ("OriginCityName", "Origin_CityName"),
    ("OriginState", "Origin_State"),
    ("OriginStateName", "OriginState_Name"),
    ("Dest", "Destination"),
    ("DestCityName", "Dest_CityName"),
    ("DestState", "Dest_State"),
    ("DestStateName", "Dest_StateName"),
    ("DepTimeBlk", "DepTime_Blk"),
    ("ArrTimeBlk", "ArrTime_Blk"),
]

encoded_classes = {}
for source_col, encoded_col in column_pairs:
    data[encoded_col] = le.fit_transform(data[source_col])
    encoded_classes[source_col] = list(le.classes_)

# Expose each list of original labels under the name of its source column
(UniqueCarrier, Carrier, TailNum,
 Origin, OriginCityName, OriginState, OriginStateName,
 Dest, DestCityName, DestState, DestStateName,
 DepTimeBlk, ArrTimeBlk) = (encoded_classes[source_col] for source_col, _ in column_pairs)

In [None]:
# Remove the text columns whose integer-coded counterparts were added to the frame
encoded_source_columns = [
    'UniqueCarrier', 'Carrier', 'TailNum',
    'Origin', 'OriginCityName', 'OriginState', 'OriginStateName',
    'Dest', 'DestCityName', 'DestState', 'DestStateName',
    'DepTimeBlk', 'ArrTimeBlk',
]
data = data.drop(columns=encoded_source_columns)

In [None]:
# Remove the two delay-group columns.
# NOTE(review): presumably dropped because they encode the delay outcome itself - confirm.
data = data.drop(columns=['DepartureDelayGroups', 'ArrivalDelayGroups'])

In [None]:
print('Dimension reduced to:')
print(data.shape[1])
# Summary statistics of the columns that remain
data.describe()

### Model creation with original raw data

In [None]:
# Data set up as predictors and target
Delay_YesNo = data['ArrDel15']  # target: the ArrDel15 flag
# drop() returns a new frame, so `data` itself keeps ArrDel15 for the later sections
# (no reliance on how the DataFrame constructor shares data with its argument).
rfDataOriginal = data.drop(columns=['ArrDel15'])
print('Dimension reduced to:')
print(len(rfDataOriginal.columns))
# Summarise the predictors that were just built, not the full frame
rfDataOriginal.describe()

In [None]:
# Pairwise correlation of all predictors, computed once and reused for the heat map
corr = rfDataOriginal.corr()
plt.matshow(corr)
plt.title('Correlation matrix for MULTI-COLLINEAR data');

In [None]:
# Lists all pairs of highly collinear variables (|r| > 0.8)
corr = rfDataOriginal.corr()
# Absolute value: a strong negative correlation is just as collinear as a positive one
strong = corr.abs().values > 0.8
# k=1 keeps only the part above the diagonal: every pair once, no variable paired with itself
rows, cols = np.where(np.triu(strong, k=1))
indices = [(corr.columns[x], corr.columns[y]) for x, y in zip(rows, cols)]
indices

In [None]:
# Columns removed from the predictors before the first model.
# NOTE(review): presumably chosen from the collinear pairs listed in the previous cell - confirm.
collinear_columns = [
    'OriginAirportSeqID', 'OriginCityMarketID', 'OriginStateFips', 'OriginWac',
    'DestAirportSeqID', 'DestCityMarketID', 'DestStateFips', 'DestWac',
    'CRSDepTime', 'CRSElapsedTime', 'CRSArrTime', 'Carrier_Name',
    'Origin_Point', 'Origin_CityName', 'Origin_State', 'OriginState_Name',
    'Destination', 'Dest_CityName', 'Dest_State', 'Dest_StateName',
]
rfDataOriginal = rfDataOriginal.drop(columns=collinear_columns)

In [None]:
print('Dimension finally reduced to:')
print(rfDataOriginal.shape[1])
# Names of the predictors that remain
rfDataOriginal.columns.tolist()

### Model building with non-redundant variables

In [None]:
# Hold out 20% of the rows as a test set; random_state=42 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(rfDataOriginal, Delay_YesNo, test_size=0.2, random_state=42)

In [None]:
startTimeGS = datetime.now()
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so that the search compares the same forests (and picks the same winner) on every run
rf = RandomForestClassifier(random_state=42)
# Only the number of trees and the split criterion are searched; the other entries are disabled
param_grid = {
    'n_estimators': [10, 25],
    # 'min_samples_split': [2, 4],
    # 'min_samples_leaf': [2, 4],
    # 'max_features': ['sqrt', 'log2'],
    "criterion": ["gini", "entropy"]
}

# 10-fold cross-validated search over every combination in the grid
grid_rf = GridSearchCV(rf, param_grid, cv=10)
grid_rf.fit(X_train, Y_train)
bestModel = grid_rf.best_estimator_
bestParameters = grid_rf.best_params_
gridScores = grid_rf.cv_results_
# total_seconds() makes the printed number match the word "seconds" in the message
print('Random forest Grid Search with non-redundant variables took [',
      (datetime.now() - startTimeGS).total_seconds(), '] seconds.')


In [None]:
# Show the winning estimator, its parameters and the full cross-validation results.
print(bestModel)
print(bestParameters)
gridScores

In [None]:
startTimeRF = datetime.now()
# Rebuild the forest with the hyper-parameters chosen by the grid search.
# random_state pins the forest's random sampling so the scores below are repeatable.
rf = RandomForestClassifier(n_estimators=bestParameters.get('n_estimators'),
#                            min_samples_split=bestParameters.get('min_samples_split'),
#                            min_samples_leaf=bestParameters.get('min_samples_leaf'),
#                            max_features=bestParameters.get('max_features'),
                            criterion=bestParameters.get('criterion'),
                            random_state=42)

# 10 shuffled folds over the training part only; the test part stays untouched
cv = KFold(n_splits=10, shuffle=True, random_state=2)
cvScores = cross_val_score(rf, X_train, Y_train, cv=cv)
print('Mean cross-validation score is: ' + str(np.mean(cvScores)))
# Fit rf itself on the whole training part for the test-set evaluation that follows
rf.fit(X_train, Y_train)
# total_seconds() makes the printed number match the word "seconds" in the message
print('Random forest training and testing with non-redundant variables took [',
      (datetime.now() - startTimeRF).total_seconds(), '] seconds.')

In [None]:
# Hard class predictions drive the confusion matrix, accuracy and recall
Y_rf_pred = rf.predict(X_test)

labels = [0, 1]
cm = confusion_matrix(Y_test, Y_rf_pred, labels=labels)

# cm rows are the true classes, cm columns the predicted classes
accuracy = 100 * float(cm.trace()) / float(cm.sum())
recall = 100 * float(cm[1][1]) / float(cm[1][0] + cm[1][1])
print('Accuracy: ' + str(np.round(accuracy, 2)) + '%')
print('Recall: ' + str(np.round(recall, 2)) + '%')
print('Confusion matrix:')
print(cm)

# A ROC curve needs a continuous score: use the predicted probability of class 1.
# Hard 0/1 predictions would collapse the curve to a single operating point.
positive_col = list(rf.classes_).index(1)
Y_rf_score = rf.predict_proba(X_test)[:, positive_col]
fpr, tpr, _ = roc_curve(Y_test, Y_rf_score)
# np.trapz takes (y, x): integrate TPR over FPR. The swapped order yields 1 - AUC.
auc = np.trapz(tpr, fpr)
print('Area under the ROC curve: ' + str(auc))

fig = plt.figure(1)
plt.plot(fpr, tpr, color='green')
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')
plt.title('Receiver operating characteristic (ROC)')

fig = plt.figure(2)
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix for Random Forest classifier with original data')
fig.colorbar(cax)
# Pin one tick per class before labelling, so each label sits on its own row/column
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

## Model creation with PCA components

In [None]:
# Removal of redundant and highly collinear variables from dataset
# NOTE(review): `data` presumably still holds the ArrDel15 target here (it is read from `data`
# further down), so rfDataPCA would contain it too - confirm that is intended.
redundant_columns = [
    'OriginAirportSeqID', 'OriginCityMarketID', 'OriginStateFips', 'OriginWac',
    'DestAirportSeqID', 'DestCityMarketID', 'DestStateFips', 'DestWac',
    'CRSDepTime', 'CRSElapsedTime', 'CRSArrTime', 'Carrier_Name',
    'Origin_Point', 'Origin_CityName', 'Origin_State', 'OriginState_Name',
    'Destination', 'Dest_CityName', 'Dest_State', 'Dest_StateName',
]
rfDataPCA = data.drop(columns=redundant_columns)
print('Dimension reduced to:')
print(rfDataPCA.shape[1])
rfDataPCA.describe()

In [None]:
# Pairs of rfDataPCA variables with |r| > 0.7.
# The matrix is computed from rfDataPCA itself rather than reusing a `corr` left over from
# an earlier cell, so only columns that are actually in this frame are reported.
corr = rfDataPCA.corr()
# Absolute value: a strong negative correlation is just as collinear as a positive one
strong = corr.abs().values > 0.7
# k=1 keeps only the part above the diagonal: every pair once, no variable paired with itself
rows, cols = np.where(np.triu(strong, k=1))
indices = [(corr.columns[x], corr.columns[y]) for x, y in zip(rows, cols)]
indices

In [None]:
# Remove further columns from rfDataPCA; each label is listed once.
# NOTE(review): 'DepTime_Blk' used to appear twice in this list - check whether another
# column (e.g. 'ArrTime_Blk') was meant for the second entry.
rfDataPCA.drop(['FlightDate', 'DepTime', 'DepTime_Blk', 'ActualElapsedTime', 'AirTime', 'DistanceGroup'],
          axis=1, inplace=True)

In [None]:
# Check for collinearity: correlation matrix computed once, then drawn as a heat map
corr = rfDataPCA.corr()
plt.matshow(corr)
plt.title('Correlation matrix for NON-COLLINEAR data');

In [None]:
# Pairs of variables in `corr` with |r| > 0.7.
# Absolute value: a strong negative correlation is just as collinear as a positive one
strong = corr.abs().values > 0.7
# k=1 keeps only the part above the diagonal: every pair once, no variable paired with itself
rows, cols = np.where(np.triu(strong, k=1))
indices = [(corr.columns[x], corr.columns[y]) for x, y in zip(rows, cols)]
indices

In [None]:
print('Dimension reduced to:')
print(rfDataPCA.shape[1])
# Summary statistics of the columns kept in rfDataPCA
rfDataPCA.describe()

## Dimension reduction with PCA

In [None]:
#PCA 1: On HIGHLY MULTI-COLLINEAR data
# NOTE(review): the columns are passed to PCA unscaled - confirm that is intended.
pca = PCA(n_components=8)
data_reduced = pca.fit_transform(data)
variance_ratio = pca.explained_variance_ratio_
print(variance_ratio)
# Cumulative explained variance in percent, component by component
np.cumsum(np.round(variance_ratio, decimals=4) * 100)

In [None]:
# Scree plot: explained variance of each fitted component
scree_fig = plt.figure(1, figsize=(20, 5))
scree_ax = scree_fig.gca()
scree_ax.plot(pca.explained_variance_, linewidth=3)
scree_ax.axis('tight')
scree_ax.set_title('Scree plot for PCA with COLLINEAR data')
scree_ax.set_xlabel('No. of principal components')
scree_ax.set_ylabel('Explained variance')

In [None]:
#PCA 2: On PARTIALLY MULTI-COLLINEAR data
# NOTE(review): the columns are passed to PCA unscaled - confirm that is intended.
pca = PCA(n_components=8)
rfDataOriginal_reduced = pca.fit_transform(rfDataOriginal)
variance_ratio = pca.explained_variance_ratio_
print(variance_ratio)
# Cumulative explained variance in percent, component by component
np.cumsum(np.round(variance_ratio, decimals=4) * 100)

In [None]:
# Scree plot: explained variance of each fitted component
scree_fig = plt.figure(1, figsize=(20, 5))
scree_ax = scree_fig.gca()
scree_ax.plot(pca.explained_variance_, linewidth=3)
scree_ax.axis('tight')
scree_ax.set_title('Scree plot for PCA with FEATURE SELECTED data')
scree_ax.set_xlabel('No. of principal components')
scree_ax.set_ylabel('Explained variance')

In [None]:
#PCA 3: On LOW MULTI-COLLINEAR data
# NOTE(review): the columns are passed to PCA unscaled - confirm that is intended.
pca = PCA(n_components=8)
rfDataPCA_reduced = pca.fit_transform(rfDataPCA)
variance_ratio = pca.explained_variance_ratio_
print(variance_ratio)
# Cumulative explained variance in percent, component by component
np.cumsum(np.round(variance_ratio, decimals=4) * 100)

In [None]:
# Scree plot: explained variance of each fitted component
scree_fig = plt.figure(1, figsize=(20, 5))
scree_ax = scree_fig.gca()
scree_ax.plot(pca.explained_variance_, linewidth=3)
scree_ax.axis('tight')
scree_ax.set_title('Scree plot for PCA with FEATURE RE-ENGINEERED data')
scree_ax.set_xlabel('No. of principal components')
scree_ax.set_ylabel('Explained variance')

In [None]:
# Separate the target from the predictors
Delay_YesNo = data['ArrDel15']
data = data.drop(columns=['ArrDel15'])
print('Dimension reduced to:')
print(data.shape[1])

# Project the predictors onto their first two principal components
pca = PCA(n_components=2)
data_reduced = pca.fit_transform(data)
variance_ratio = pca.explained_variance_ratio_
print(variance_ratio)
print('Cumulative explained variance:')
np.cumsum(np.round(variance_ratio, decimals=4) * 100)

## Model building with dimension-reduced data

In [None]:
# Split the 2-component PCA projection (data_reduced), not the raw columns, so that the model
# in this section is really trained on the principal components its messages and titles refer to.
# NOTE(review): the PCA was fitted on all rows before this split - consider fitting it on the
# training part only to keep the test rows completely unseen.
x_train, x_test, y_train, y_test = train_test_split(data_reduced, Delay_YesNo, test_size=0.2, random_state=42)

In [None]:
startTimeGS = datetime.now()
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so that the search compares the same forests (and picks the same winner) on every run
rf = RandomForestClassifier(random_state=42)
# Only the number of trees and the split criterion are searched; the other entries are disabled
param_grid = {
    'n_estimators': [10, 25],
    # 'min_samples_split': [2, 4],
    # 'min_samples_leaf': [2, 4],
    # 'max_features': ['sqrt', 'log2'],
    "criterion": ["gini", "entropy"]
}

# 10-fold cross-validated search over every combination in the grid
grid_rf = GridSearchCV(rf, param_grid, cv=10)
grid_rf.fit(x_train, y_train)
bestModel = grid_rf.best_estimator_
bestParameters = grid_rf.best_params_
gridScores = grid_rf.cv_results_  # per-candidate cross-validation results
# total_seconds() makes the printed number match the word "seconds" in the message
print('Random forest Grid Search with dimension reduced data took [',
      (datetime.now() - startTimeGS).total_seconds(), '] seconds.')

In [None]:
# Show the winning estimator, its parameters and the full cross-validation results.
print(bestModel)
print(bestParameters)
gridScores

In [None]:
startTimeRF = datetime.now()
# Rebuild the forest with the hyper-parameters chosen by the grid search.
# random_state pins the forest's random sampling so the scores below are repeatable.
rf = RandomForestClassifier(n_estimators=bestParameters.get('n_estimators'),
#                            min_samples_split=bestParameters.get('min_samples_split'),
#                            min_samples_leaf=bestParameters.get('min_samples_leaf'),
#                            max_features=bestParameters.get('max_features'),
                            criterion=bestParameters.get('criterion'),
                            random_state=42)

# 10 shuffled folds over the training part only; the test part stays untouched
cv = KFold(n_splits=10, shuffle=True, random_state=2)
cvScores = cross_val_score(rf, x_train, y_train, cv=cv)
print('Mean cross-validation score is: ' + str(np.mean(cvScores)))
# Fit rf itself on the whole training part for the test-set evaluation that follows
rf.fit(x_train, y_train)
# total_seconds() makes the printed number match the word "seconds" in the message
print('Random forest training and testing with PCs took [',
      (datetime.now() - startTimeRF).total_seconds(), '] seconds.')


In [None]:
# Hard class predictions drive the confusion matrix, accuracy and recall
y_rf_pred = rf.predict(x_test)

labels = [0, 1]
cm = confusion_matrix(y_test, y_rf_pred, labels=labels)

# cm rows are the true classes, cm columns the predicted classes
accuracy = 100 * float(cm.trace()) / float(cm.sum())
recall = 100 * float(cm[1][1]) / float(cm[1][0] + cm[1][1])
print('Accuracy: ' + str(np.round(accuracy, 2)) + '%')
print('Recall: ' + str(np.round(recall, 2)) + '%')
print('Confusion matrix:')
print(cm)

# A ROC curve needs a continuous score: use the predicted probability of class 1.
# Hard 0/1 predictions would collapse the curve to a single operating point.
positive_col = list(rf.classes_).index(1)
y_rf_score = rf.predict_proba(x_test)[:, positive_col]
fpr, tpr, _ = roc_curve(y_test, y_rf_score)
# np.trapz takes (y, x): integrate TPR over FPR. The swapped order yields 1 - AUC.
auc = np.trapz(tpr, fpr)
print('Area under the ROC curve: ' + str(auc))

# ROC curve on the left, confusion matrix on the right, in a single row
fig, (ax_roc, ax1) = plt.subplots(1, 2, figsize=(20, 5))

ax_roc.plot(fpr, tpr, color='green')
ax_roc.set_xlabel('False positive rate (FPR)')
ax_roc.set_ylabel('True positive rate (TPR)')
ax_roc.set_title('Receiver operating characteristic (ROC)')

cax = ax1.matshow(cm)
ax1.set_title('Confusion matrix for Random Forest classifier with PCs')
fig.colorbar(cax, ax=ax1)
# Pin one tick per class before labelling, so each label sits on its own row/column
ax1.set_xticks(range(len(labels)))
ax1.set_yticks(range(len(labels)))
ax1.set_xticklabels(labels)
ax1.set_yticklabels(labels)
ax1.set_xlabel('Predicted')
ax1.set_ylabel('True')
plt.show()

In [None]:
# Elapsed wall-clock time since startTime; total_seconds() makes the number match the "seconds" label
print('Overall execution took [', (datetime.now() - startTime).total_seconds(), '] seconds.')

In [None]:
import pickle

# Persist the forest exactly as it was fitted and evaluated in the preceding cells.
# It is deliberately not fitted again here: refitting a forest that has no fixed seed
# would save a model different from the one whose metrics were reported.
model_filename = 'random_forest_model.pickle'
# The relative file name resolves against the current working directory
with open(model_filename, 'wb') as model_file:
    pickle.dump(rf, model_file)




In [None]:
# import pickle

# # Save the trained model (rf) to a pickle file
# with open('C:/Users/sulayako/datamodels/Airline-delay-prediction-in-Python-master/model.pkl', 'wb') as file:
#     pickle.dump(rf, file)

# # Confirm that the file is saved
# print("DB object saved to db.pkl")


In [None]:
# import pickle

# # Assuming "bestModel" is your trained Random Forest model
# model_filename = 'random_forest_model.pkl'

# # Save the model to a file
# with open(model_filename, 'wb') as model_file:
#     pickle.dump(bestModel, model_file)

# print(f"Model saved to {model_filename}")
