In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.decomposition import PCA
from datetime import datetime
import os

In [2]:
# Load your dataset
folderPath = "/Users/praveen/Desktop/Airline-delay-prediction-in-Python/MEJ/DS Challange/StartUpML/Flight Delay"
os.chdir(folderPath)
fdata_list = []

for filename in os.listdir(os.getcwd()):
    if filename.endswith('.csv') and "On_Time_On_Time_Performance_2017_1.csv" in filename:
        temp = pd.read_csv(filename)
        fdata_list.append(temp)

  temp = pd.read_csv(filename)


In [3]:
# Concatenate all DataFrames in the list
fdata = pd.concat(fdata_list, ignore_index=True)

In [4]:
# Handle class imbalance through sampling
classDistribution = fdata['ArrDel15'].value_counts()
print('Class imbalance:')
print(classDistribution)
zero = fdata[fdata['ArrDel15']==0].tail(classDistribution.min())
one = fdata[fdata['ArrDel15']==1]
data = pd.concat([zero, one])

Class imbalance:
ArrDel15
0.0    341946
1.0     97699
Name: count, dtype: int64


In [5]:
# Sort data by date and time
data.sort_values(['Year', 'Month', 'DayofMonth', 'DayOfWeek'], ascending=[False, False, False, False], inplace=True)


In [6]:
# Print balanced class distribution
print('Class imbalance evened out:')
print(data['ArrDel15'].value_counts())

Class imbalance evened out:
ArrDel15
0.0    97699
1.0    97699
Name: count, dtype: int64


In [7]:
# Select relevant features
selected_features = ['Year', 'Month', 'TaxiOut', 'ArrTime', 'Distance', 'ArrDel15']
data_selected = data[selected_features]


In [8]:
# Drop missing values
data_selected.dropna(inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_selected.dropna(inplace=True)


In [9]:
# Split the data into features (X) and target variable (y)
X = data_selected.drop('ArrDel15', axis=1)
y = data_selected['ArrDel15']


In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Train the model
rf = RandomForestClassifier(n_estimators=10, criterion='gini')  # You can modify the parameters based on your grid search results
rf.fit(X_train, y_train)

In [12]:
# Predictions
y_pred = rf.predict(X_test)


In [13]:
# Model evaluation
cm = confusion_matrix(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_pred)

In [14]:
# Print evaluation metrics
print('Confusion matrix:')
print(cm)
print('Accuracy: {:.2f}%'.format(100 * np.sum(np.diag(cm)) / np.sum(cm)))
print('Area under the ROC curve: {:.2f}'.format(np.trapz(fpr, tpr)))

Confusion matrix:
[[13912  5569]
 [ 7859 11740]]
Accuracy: 65.64%
Area under the ROC curve: 0.34


In [15]:
import pickle

rf = RandomForestClassifier(n_estimators=10, criterion='gini')  # You can modify the parameters based on your grid search results
rf.fit(X_train, y_train)

# Save the trained model to a pickle file
with open('trained_model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)