In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.decomposition import PCA
from datetime import datetime
import os

In [23]:
# Load your dataset
folderPath = "/Users/praveen/Desktop/Airline-delay-prediction-in-Python/MEJ/DS Challange/StartUpML/Flight Delay"
os.chdir(folderPath)
fdata_list = []

for filename in os.listdir(os.getcwd()):
    if filename.endswith('.csv') and "On_Time_On_Time_Performance_2017_1.csv" in filename:
        temp = pd.read_csv(filename)
        fdata_list.append(temp)

  temp = pd.read_csv(filename)


In [24]:
# Concatenate all DataFrames in the list
fdata = pd.concat(fdata_list, ignore_index=True)

In [25]:
# Handle class imbalance through sampling
classDistribution = fdata['ArrDel15'].value_counts()
print('Class imbalance:')
print(classDistribution)
zero = fdata[fdata['ArrDel15']==0].tail(classDistribution.min())
one = fdata[fdata['ArrDel15']==1]
data = pd.concat([zero, one])

Class imbalance:
ArrDel15
0.0    341946
1.0     97699
Name: count, dtype: int64


In [26]:
# Sort data by date and time
data.sort_values(['Year', 'Month', 'DayofMonth', 'DayOfWeek'], ascending=[False, False, False, False], inplace=True)


In [27]:
# Print balanced class distribution
print('Class imbalance evened out:')
print(data['ArrDel15'].value_counts())

Class imbalance evened out:
ArrDel15
0.0    97699
1.0    97699
Name: count, dtype: int64


In [28]:
# Select relevant features
selected_features = ['Year', 'Month', 'TaxiOut', 'ArrTime', 'Distance', 'ArrDel15']
data_selected = data[selected_features]


In [29]:
# Drop missing values
data_selected.dropna(inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_selected.dropna(inplace=True)


In [30]:
# Split the data into features (X) and target variable (y)
X = data_selected.drop('ArrDel15', axis=1)
y = data_selected['ArrDel15']


In [31]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
# Train the model
rf = RandomForestClassifier(n_estimators=10, criterion='gini')  
rf.fit(X_train, y_train)

In [33]:
# Predictions
y_pred = rf.predict(X_test)


In [34]:
# Model evaluation
cm = confusion_matrix(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_pred)

In [35]:
# Print evaluation metrics
print('Confusion matrix:')
print(cm)
print('Accuracy: {:.2f}%'.format(100 * np.sum(np.diag(cm)) / np.sum(cm)))
print('Area under the ROC curve: {:.2f}'.format(np.trapz(fpr, tpr)))

Confusion matrix:
[[14033  5448]
 [ 7845 11754]]
Accuracy: 65.99%
Area under the ROC curve: 0.34


In [37]:
data.tail(2)

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,UniqueCarrier,AirlineID,Carrier,TailNum,...,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Unnamed: 109
425230,2017,1,1,1,7,2017-01-01,VX,21171,VX,N629VA,...,,,,,,,,,,
425236,2017,1,1,1,7,2017-01-01,VX,21171,VX,N633VA,...,,,,,,,,,,


In [42]:
# Example delay check
def predict_flight_delay(flight_features):
    input_data = np.array(flight_features).reshape(1, -1)
    prediction = rf.predict(input_data)

    return prediction[0]

# Example usage
flight_features = [2018, 6, 20, 1500, 100]
prediction = predict_flight_delay(flight_features)

if prediction == 0:
    print("The flight is predicted to be delayed.")
else:
    print("The flight is predicted to be on time.")

The flight is predicted to be on time.




In [40]:
import pickle

rf = RandomForestClassifier(n_estimators=10, criterion='gini')  # You can modify the parameters based on your grid search results
rf.fit(X_train, y_train)

# Save the trained model to a pickle file
with open('/Users/praveen/Desktop/Airline-delay-prediction-in-Python/model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)