In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'catboost'

In [27]:
# Create dataframe
df2018flights = pd.read_csv("./Datasets1/2018.csv")

# Remove op_carrier_fl_num, crs_dep_time, dep_time, wheels_off, wheels_on, cancelled, cancellation_co..., diverted, actual_elapsed_..., air_time, unnamed:27
df2018flights = df2018flights.drop(["OP_CARRIER_FL_NUM", "CRS_DEP_TIME", "DEP_TIME", "WHEELS_OFF", "WHEELS_ON", "CANCELLED", "CANCELLATION_CODE", "DIVERTED", "ACTUAL_ELAPSED_TIME", "AIR_TIME", "Unnamed: 27"], axis= 1)

# Create month column
df2018flights["Month"] = pd.DatetimeIndex(df2018flights["FL_DATE"]).month

# Remove date column
df2018flights = df2018flights.drop(["FL_DATE"], axis= 1)

# Add labeled column
df2018flights["Label"] = np.where(df2018flights["ARR_DELAY"] > 0, 1, 0)

# Remove column use to make label
df2018flights = df2018flights.drop(["ARR_DELAY"], axis= 1)

# Dummy encode carrier, destination, and origin columns
df2018flights = pd.get_dummies(df2018flights, columns= ["OP_CARRIER", "ORIGIN", "DEST"])

df2018flights

Unnamed: 0,DEP_DELAY,TAXI_OUT,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CRS_ELAPSED_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,...,DEST_USA,DEST_VEL,DEST_VLD,DEST_VPS,DEST_WRG,DEST_WYS,DEST_XNA,DEST_YAK,DEST_YNG,DEST_YUM
0,-5.0,15.0,10.0,1745,1722.0,-23.0,268.0,1605.0,,,...,0,0,0,0,0,0,0,0,0,0
1,-8.0,11.0,7.0,1254,1230.0,-24.0,99.0,414.0,,,...,0,0,0,0,0,0,0,0,0,0
2,-5.0,15.0,5.0,1649,1636.0,-13.0,134.0,846.0,,,...,0,0,0,0,0,0,0,0,0,0
3,6.0,19.0,6.0,1756,1754.0,-2.0,190.0,1120.0,,,...,0,0,0,0,0,0,0,0,0,0
4,20.0,13.0,10.0,922,936.0,14.0,112.0,723.0,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7213441,-4.0,20.0,7.0,1714,1709.0,-5.0,100.0,331.0,,,...,0,0,0,0,0,0,0,0,0,0
7213442,6.0,18.0,10.0,1952,1953.0,1.0,181.0,936.0,,,...,0,0,0,0,0,0,0,0,0,0
7213443,-5.0,36.0,4.0,2107,2118.0,11.0,112.0,511.0,,,...,0,0,0,0,0,0,0,0,0,0
7213444,23.0,11.0,4.0,1350,1404.0,14.0,50.0,130.0,,,...,0,0,0,0,0,0,0,0,0,0


In [28]:
X = df2018flights.drop(["Label"], axis= 1)
y = df2018flights["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=69)

In [None]:
## 😸boost ##

# Calculate balance of class weights
import sklearn
weights = sklearn.utils.class_weight.compute_class_weight(class_weight= 'balanced', classes= np.unique(y), y= y)

cb = CatBoostClassifier(iterations=100, random_seed=0, logging_level='Silent', class_weights= weights)
cb.fit(X_train, y_train)

# predict on test set
y_pred = cb.predict(X_test)

# evaluate model performance
conf_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Confusion matrix:")
print(conf_mat)
sns.heatmap(conf_mat, annot=True)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)