In [None]:
# Import necessary modules.
import pandas as pd
import numpy as np

In [None]:
# NOTE(review): the first cell of this notebook already imports pandas, so on a
# fresh environment this install has to be run before that import cell.
%pip install pandas

In [None]:
# NOTE(review): the first cell of this notebook already imports numpy, so on a
# fresh environment this install has to be run before that import cell.
%pip install numpy

In [None]:
# Installs scikit-learn, which the `sklearn` imports in later cells rely on.
%pip install scikit-learn

Load the dataset

In [None]:
# Read the training CSV into a DataFrame. The path is relative, so the file
# must sit in the directory the notebook kernel is running from.
train_data = pd.read_csv("datathon_train.csv")

Begin preliminary analysis of the data. Exploratory phase

In [None]:
train_data.head()

In [None]:
# Drop the Id column
# Rebinding with errors="ignore" (instead of an inplace drop) keeps this cell
# safe to re-run: once "Id" is gone, a second run no longer raises KeyError.
train_data = train_data.drop(columns="Id", errors="ignore")

In [None]:
train_data.head()

In [None]:
# Get summary statistics
train_data.describe()

Do we have an imbalanced dataset? Yes...yes we do.

In [None]:
# Do we have an imbalanced dataset? Let's find the distribution of the target variable
# Notice how there are many more flights (rows) that aren't delayed than flights that are
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
train_data["IS_DELAYED"].value_counts()

Let's decide what columns/features to use in our prediction.

In [None]:
print(train_data.columns)

1. Are any of the features basically the "same" as another feature? Well, the airports and cities say the same thing, DEP_DEL_NEW should be removed, and manufacture year is the same variable as the age of the plane.

In [None]:
# Remove the columns that repeat information already carried by another column.
# Rebinding with errors="ignore" keeps this cell safe to re-run: once the
# columns are gone, a second run no longer raises KeyError.
train_data = train_data.drop(
    columns=["ORIGIN_CITY_NAME", "DEST_CITY_NAME", "MANUFACTURE_YEAR"],
    errors="ignore",
)

Next, we have to turn the categorical variables into numbers so we can use them to train our model! We'll use sklearn's `LabelEncoder()` for this.

In [None]:
print(train_data.head())

Here's how a label encoder works: 

`fit`
If we have a list of discrete variables, like ["a", "b", "b", "c"], the label encoder will locate each unique item in the list ("a", "b", "c") and assign an integer to that object, for instance, 

"a" -> 0

"b" -> 1

"c" -> 2

`transform`
Now, when we encounter a list like ["b", "b", "c", "a"], the LabelEncoder will perform the translation between string and number, and output [1, 1, 2, 0], essentially replacing each string with its corresponding number.

Label encoders, however, do not handle unseen values. So, if we try to translate "d", the LabelEncoder will throw an error. If our training set contains only "a", "b", and "c", and our testing set contains a new string "d", we'll run into a problem. To account for this, we'll add an "UNSEEN" entry to the unique items in the list, so when we encounter an unknown value in the testing set, we'll replace it with "UNSEEN" and continue encoding.

In [None]:
# Every string-valued categorical column has to be turned into integers before training.
from sklearn.preprocessing import LabelEncoder


def _fit_and_encode(frame, column):
    """Fit a LabelEncoder on ``frame[column]`` plus "UNSEEN" and encode the column.

    The extra "UNSEEN" entry gives the encoder a class that unknown values can
    be mapped to later. The column in ``frame`` is overwritten with its integer
    codes and the fitted encoder is returned so it can be reused.
    """
    encoder = LabelEncoder().fit(frame[column].tolist() + ["UNSEEN"])
    frame[column] = encoder.transform(frame[column])
    return encoder


# One fitted encoder per categorical column, kept under its own name for reuse.
dest_le = _fit_and_encode(train_data, "DEST")
carrier_name_le = _fit_and_encode(train_data, "CARRIER_NAME")
previous_airport_le = _fit_and_encode(train_data, "PREVIOUS_AIRPORT")
dep_time_blk_le = _fit_and_encode(train_data, "DEP_TIME_BLK")
departing_airport_le = _fit_and_encode(train_data, "DEPARTING_AIRPORT")


2. For this starter code, I'll select a handful of variables (nine here) to use as my features. You should do your own selection, and think about what features would be useful!

In [None]:
# Columns kept for modelling. The target "IS_DELAYED" is listed last, which is
# what lets it be separated from the features by position further down.
kept_columns = [
    'DEPARTING_AIRPORT',
    'DEST',
    'PLANE_AGE',
    'CARGO_HANDLING',
    'PRCP',
    'AWND',
    'GROUND_SERV_PER_PASS',
    'PREVIOUS_AIRPORT',
    "DEP_TIME_BLK",
    "IS_DELAYED",
]
train_data = train_data[kept_columns]


In [None]:
print(train_data.head())

In [None]:
train_data.corr()

In [None]:
train_data

Now, we convert this dataframe into a numpy array to begin the model training process

In [None]:
# Convert the DataFrame to a plain NumPy array; column order is preserved, so
# the target column remains the last one.
train_data_np = train_data.to_numpy()

In [None]:
train_data_np

Then, we separate the features from the target variable

In [None]:
# The last column holds the target; every column before it is a feature.
X, y = train_data_np[:, :-1], train_data_np[:, -1]

In [None]:
X

In [None]:
y

Now, we split the data into a training set and testing set so we can both train the model, and evaluate the model after training it

In [None]:
from sklearn.model_selection import train_test_split
# IF YOUR MODEL IS TAKING TOO LONG TO RUN, INCREASE THE TEST SIZE to 0.7
# random_state pins the shuffle so the split (and therefore the score) is the
# same on every run. stratify=y keeps the delayed / not-delayed ratio equal in
# both subsets, which matters because the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Now, we define our model. This is truly where the magic happens, and it's truly just plug and play. Feel free to swap out my model with any one of these, and explore how the results change!

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

Here, I'm just defining a random model

In [None]:
# model = GaussianNB()
# StandardScaler is a preprocessing transformer: it accepts no n_jobs argument
# (the constructor raises TypeError) and has no predict_proba, which the
# evaluation cells below call. RandomForestClassifier (imported above) supports
# both; n_jobs=-1 trains the trees on all available CPU cores.
model = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,  # fixed seed so repeated runs build the same forest
)

In [None]:
# Fit the model on the training split only; X_test / y_test are held back for evaluation.
model.fit(X_train, y_train)

Great! Now that our model is done training, let's see how we did. To evaluate our model we must define what metric we evaluate our model on. We'll be using AUROC

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Use our testing subset and make predictions
# predict_proba is called on the held-out X_test; its result is printed and
# then sliced down to a single column in the following cells.
y_test_predictions_probabilities = model.predict_proba(X_test)

`predict_proba` is a function that returns the probability/confidence of the model for each class. 

In [None]:
print(y_test_predictions_probabilities)

If we examine the first row [0.57795736, 0.42204264], we interpret this as the model being 57% confident that the label should be 0, and 42% confident that the label should be 1. The AUROC score is concerned only with the probability of the 1 label, so we must grab the second column.

In [None]:
# Keep only the probability of the positive (label 1) class, which is what the AUROC score is computed from.
y_test_predictions = y_test_predictions_probabilities[:, 1] # All rows, second column

In [None]:
# AUROC of the label-1 probabilities against the held-out labels; the value is
# displayed as this cell's output.
roc_auc_score(y_test, y_test_predictions)

This is decent!

Now let's load our test data and make predictions on that, then create our submission file

In [None]:
# Read the competition test CSV into a DataFrame. The path is relative, so the
# file must sit in the directory the notebook kernel is running from.
test_data = pd.read_csv("datathon_test.csv")

In [None]:
test_data

Since we used only the "DEPARTING_AIRPORT", "DEST", "PLANE_AGE", "CARGO_HANDLING", "PRCP", "AWND", "GROUND_SERV_PER_PASS", "PREVIOUS_AIRPORT", and "DEP_TIME_BLK" columns when training, we must use only these when testing, because these features are what our model is trained on.

Note, we MUST keep the Id column here to create our submission file

In [None]:
# Keep Id plus the same feature columns, in the same order, that the model was
# trained on. .copy() makes test_data an independent frame, so the column
# assignments in the following cells do not write into a slice of the original
# (which is what triggers pandas' SettingWithCopyWarning).
test_data = test_data[["Id", 'DEPARTING_AIRPORT', 'DEST', 'PLANE_AGE', 'CARGO_HANDLING', 'PRCP', 'AWND', 'GROUND_SERV_PER_PASS', 'PREVIOUS_AIRPORT', "DEP_TIME_BLK"]].copy()

In [None]:
print(test_data.head())

We must make the DEP_TIME_BLK, DEPARTING_AIRPORT, DEST, and PREVIOUS_AIRPORT columns numerical using the SAME label encoders we used on our train data for consistency. But first, as mentioned before, we must check whether there are any values in these columns that weren't in the training data so we don't run into any errors. If we find any, we replace them with "UNSEEN".

In [None]:
# Any DEP_TIME_BLK value the encoder was not fitted on is swapped for the
# "UNSEEN" placeholder so the encoder never meets an unknown label.
# isin/where handles the whole column at once instead of a Python loop that
# scans the encoder's classes_ array for every row.
known_dep_time_blk = test_data["DEP_TIME_BLK"].isin(dep_time_blk_le.classes_)
new_dep_time_blk = test_data["DEP_TIME_BLK"].where(known_dep_time_blk, "UNSEEN")
# Replace
test_data["DEP_TIME_BLK"] = new_dep_time_blk

In [None]:
# Any DEPARTING_AIRPORT value the encoder was not fitted on is swapped for the
# "UNSEEN" placeholder so the encoder never meets an unknown label.
# isin/where handles the whole column at once instead of a Python loop that
# scans the encoder's classes_ array for every row.
known_departing_airport = test_data["DEPARTING_AIRPORT"].isin(departing_airport_le.classes_)
new_departing_airport = test_data["DEPARTING_AIRPORT"].where(known_departing_airport, "UNSEEN")
# Replace
test_data["DEPARTING_AIRPORT"] = new_departing_airport

In [None]:
# Any DEST value the encoder was not fitted on is swapped for the "UNSEEN"
# placeholder so the encoder never meets an unknown label.
# isin/where handles the whole column at once instead of a Python loop that
# scans the encoder's classes_ array for every row.
known_dest = test_data["DEST"].isin(dest_le.classes_)
new_dest = test_data["DEST"].where(known_dest, "UNSEEN")
# Replace
test_data["DEST"] = new_dest

In [None]:
# Any PREVIOUS_AIRPORT value the encoder was not fitted on is swapped for the
# "UNSEEN" placeholder so the encoder never meets an unknown label.
# isin/where handles the whole column at once instead of a Python loop that
# scans the encoder's classes_ array for every row.
known_previous_airport = test_data["PREVIOUS_AIRPORT"].isin(previous_airport_le.classes_)
new_previous_airport = test_data["PREVIOUS_AIRPORT"].where(known_previous_airport, "UNSEEN")
# Replace
test_data["PREVIOUS_AIRPORT"] = new_previous_airport

In [None]:
print(test_data)

In [None]:
# Encode each categorical test column with the encoder fitted on the matching
# training column, so identical strings map to identical integers.
for column, encoder in (
    ("DEP_TIME_BLK", dep_time_blk_le),
    ("DEPARTING_AIRPORT", departing_airport_le),
    ("DEST", dest_le),
    ("PREVIOUS_AIRPORT", previous_airport_le),
):
    test_data[column] = encoder.transform(test_data[column])

In [None]:
test_data

Great! Let's now do the same thing as we did before

In [None]:
# Convert the test DataFrame to a NumPy array; column order is preserved, so
# Id stays in the first column.
test_data_np = test_data.to_numpy()

In [None]:
test_data_np

This is now entirely test data, and we don't need to split using `train_test_split` because we're not training a new model.

In [None]:
# Feature matrix for the final predictions: every row, every column after the first.
X_TEST = test_data_np[:, 1:] # The first column is the Id column, which we do not want to keep in our predictions

In [None]:
# Per-class probabilities for every test row, from the model fitted earlier in the notebook.
predictions = model.predict_proba(X_TEST) # Just like before

In [None]:
predictions

In [None]:
len(predictions)

Now, time to make our submission file! The submission file has two columns that must be named exactly this way: "Id" and "IS_DELAYED".

In [None]:
# Start the submission from the Id column only. .copy() makes it an independent
# frame, so adding the IS_DELAYED column afterwards does not write into a slice
# of test_data (which is what triggers pandas' SettingWithCopyWarning).
submission = test_data[["Id"]].copy()

In [None]:
submission

In [None]:
# The second column of `predictions` is the probability of label 1 (delayed);
# that probability is what goes into the IS_DELAYED column.
submission["IS_DELAYED"] = predictions[:, 1]

In [None]:
submission

Now, we save the dataframe into a CSV

In [None]:
# Write the submission to disk; index=False leaves the DataFrame index out, so
# the file contains only the frame's own columns.
submission.to_csv("test_submission.csv", index=False)