In [36]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("./datathon_train.csv")

def get_time(string):
  return int(string[:2])
dataset['DEP_TIME'] = dataset['DEP_TIME_BLK'].apply(get_time)


In [37]:
def subtract(num):
    return 2019-num

In [38]:
dataset['MANUFACTURE_YEAR'] = dataset['MANUFACTURE_YEAR'].apply(subtract)

In [39]:
dataset_clean = dataset.drop(['Id', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'DEP_TIME_BLK','DEST'], axis = 1)
dataset_clean.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,DEPARTING_AIRPORT,DISTANCE,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,MANUFACTURE_YEAR,NUMBER_OF_SEATS,CARRIER_NAME,...,PLANE_AGE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND,DEP_DELAY_NEW,IS_DELAYED,DEP_TIME
0,1,17,4,Raleigh-Durham International,427,6,13,5.0,76,Endeavor Air Inc.,...,5,Ronald Reagan Washington National,0.01,0.0,0.0,49.0,4.7,2.0,1,18
1,2,27,3,San Jose International,2689,2,3,17.0,162,JetBlue Airways,...,17,John F. Kennedy International,0.21,0.0,0.0,64.0,12.75,0.0,0,22
2,8,14,3,Los Angeles International,1947,1,30,23.0,199,Delta Air Lines Inc.,...,23,NONE,0.0,0.0,0.0,74.0,8.5,0.0,0,6
3,3,16,6,Newark Liberty International,488,3,23,4.0,76,"Midwest Airline, Inc.",...,4,Dallas Fort Worth Regional,0.0,0.0,0.0,56.0,17.67,0.0,0,14
4,6,20,4,Douglas Municipal,541,4,24,21.0,128,American Airlines Inc.,...,21,Myrtle Beach International,0.19,0.0,0.0,89.0,9.17,79.0,1,15


In [40]:
df2 = dataset_clean.groupby(['DEPARTING_AIRPORT'])['IS_DELAYED'].count()
df2 = pd.DataFrame(df2, columns = ['DEPARTING_AIRPORT', 'IS_DELAYED'])
df2['DEPARTING_AIRPORT'] = df2.index
df2 = df2.sort_values(by='IS_DELAYED', ascending=False)
df2 = df2.head(25)
airports_to_keep = list(df2['DEPARTING_AIRPORT'].values)
dataset_clean= dataset_clean[dataset_clean['DEPARTING_AIRPORT'].isin(airports_to_keep)]

In [41]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Assuming 'X_df' is your dataset and 'Departing Airport Name' is a categorical column
categorical_columns = ['PREVIOUS_AIRPORT', 'CARRIER_NAME', 'DEPARTING_AIRPORT']

# Apply one-hot encoding to categorical columns
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(dataset_clean[categorical_columns])

# Drop the original categorical columns from the original DataFrame
dataset_clean = dataset_clean.drop(categorical_columns, axis=1)

# Convert the one-hot encoded NumPy array back to a DataFrame with feature names
encoded_feature_names = []
for category, column in zip(encoder.categories_, categorical_columns):
    encoded_feature_names.extend([f"{column}_{cat}" for cat in category])
X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_feature_names)

# Concatenate the one-hot encoded columns with the original dataset
X_df = pd.concat([dataset_clean, X_encoded_df], axis=1)




In [42]:
X_df = X_df.dropna()

In [46]:
print(X_df.shape)

(317214, 397)


In [47]:
y = X_df['IS_DELAYED'].to_numpy()
New_X_df = X_df.drop(['DEP_DELAY_NEW', 'IS_DELAYED'], axis = 1)
X = New_X_df.to_numpy()

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

In [50]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Create a GradientBoostingClassifier with specified parameters
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)

# Train the GBM model
gbm.fit(X_train, y_train)

# Make predictions on the test data
y_pred = gbm.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)






Accuracy: 0.6543353876708227


In [35]:
from sklearn.metrics import classification_report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.6545
Classification Report:
               precision    recall  f1-score   support

         0.0       0.67      0.83      0.74      1203
         1.0       0.60      0.39      0.47       797

    accuracy                           0.65      2000
   macro avg       0.64      0.61      0.61      2000
weighted avg       0.64      0.65      0.64      2000



Accuracy: 0.6601025493922335
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.85      0.75     83870
           1       0.62      0.37      0.47     55575

    accuracy                           0.66    139445
   macro avg       0.65      0.61      0.61    139445
weighted avg       0.65      0.66      0.64    139445

#It takes 16 minutes to run

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0]
}
grid_search = GridSearchCV(gbm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [9]:
best_clf = grid_search.best_estimator_

y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.9}
Accuracy: 0.6535
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.83      0.74      1199
           1       0.61      0.39      0.47       801

    accuracy                           0.65      2000
   macro avg       0.64      0.61      0.61      2000
weighted avg       0.64      0.65      0.63      2000

