In [44]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("./datathon_train.csv")

def get_time(string):
  return int(string[:2])
dataset['DEP_TIME'] = dataset['DEP_TIME_BLK'].apply(get_time)


In [45]:
dataset_clean = dataset.drop(['Id', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'DEP_TIME_BLK','DEST'], axis = 1)
dataset_clean.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,DEPARTING_AIRPORT,DISTANCE,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,MANUFACTURE_YEAR,NUMBER_OF_SEATS,CARRIER_NAME,...,PLANE_AGE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND,DEP_DELAY_NEW,IS_DELAYED,DEP_TIME
0,1,17,4,Raleigh-Durham International,427,6,13,2014.0,76,Endeavor Air Inc.,...,5,Ronald Reagan Washington National,0.01,0.0,0.0,49.0,4.7,2.0,1,18
1,2,27,3,San Jose International,2689,2,3,2002.0,162,JetBlue Airways,...,17,John F. Kennedy International,0.21,0.0,0.0,64.0,12.75,0.0,0,22
2,8,14,3,Los Angeles International,1947,1,30,1996.0,199,Delta Air Lines Inc.,...,23,NONE,0.0,0.0,0.0,74.0,8.5,0.0,0,6
3,3,16,6,Newark Liberty International,488,3,23,2015.0,76,"Midwest Airline, Inc.",...,4,Dallas Fort Worth Regional,0.0,0.0,0.0,56.0,17.67,0.0,0,14
4,6,20,4,Douglas Municipal,541,4,24,1998.0,128,American Airlines Inc.,...,21,Myrtle Beach International,0.19,0.0,0.0,89.0,9.17,79.0,1,15


In [58]:
dataset_clean.shape

(697224, 25)

In [46]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Assuming 'X_df' is your dataset and 'Departing Airport Name' is a categorical column
categorical_columns = ['PREVIOUS_AIRPORT', 'CARRIER_NAME', 'DEPARTING_AIRPORT']

# Apply one-hot encoding to categorical columns
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(dataset_clean[categorical_columns])

# Drop the original categorical columns from the original DataFrame
dataset_clean = dataset_clean.drop(categorical_columns, axis=1)

# Convert the one-hot encoded NumPy array back to a DataFrame with feature names
encoded_feature_names = []
for category, column in zip(encoder.categories_, categorical_columns):
    encoded_feature_names.extend([f"{column}_{cat}" for cat in category])
X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_feature_names)

# Concatenate the one-hot encoded columns with the original dataset
X_df = pd.concat([dataset_clean, X_encoded_df], axis=1)




In [47]:
X_df.shape

(697224, 475)

In [48]:
y = X_df['IS_DELAYED'].to_numpy()
New_X_df = X_df.drop(['DEP_DELAY_NEW', 'IS_DELAYED'], axis = 1)
X = New_X_df.to_numpy()

In [49]:
# Check the number of columns (features) in the original X_df
original_num_columns = X_df.shape[1]
print("Number of columns in original X_df:", original_num_columns)

# Check the number of columns (features) in New_X_df
new_num_columns = New_X_df.shape[1]
print("Number of columns in New_X_df:", new_num_columns)


Number of columns in original X_df: 475
Number of columns in New_X_df: 473


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Create a GradientBoostingClassifier with specified parameters
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, subsample=0.9)

# Train the GBM model
gbm.fit(X_train, y_train)

# Make predictions on the test data
y_pred = gbm.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)






Accuracy: 0.6605041414177633


In [20]:
import joblib
# Save the model to a file
joblib.dump(gbm, 'gradient_boosting_model.pkl')

['gradient_boosting_model.pkl']

In [17]:
from sklearn.metrics import classification_report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.6605041414177633
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.85      0.75     83870
           1       0.62      0.37      0.47     55575

    accuracy                           0.66    139445
   macro avg       0.65      0.61      0.61    139445
weighted avg       0.65      0.66      0.64    139445



In [60]:
test_sample = pd.read_csv("./datathon_test.csv")

def get_time(string):
  return int(string[:2])
test_sample['DEP_TIME'] = test_sample['DEP_TIME_BLK'].apply(get_time)

test_sample = test_sample.drop(['Id', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME', 'DEP_TIME_BLK','DEST'], axis = 1)
test_sample.head()


Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,DEPARTING_AIRPORT,DISTANCE,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,MANUFACTURE_YEAR,NUMBER_OF_SEATS,CARRIER_NAME,...,FLT_ATTENDANTS_PER_PASS,GROUND_SERV_PER_PASS,PLANE_AGE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND,DEP_TIME
0,7,1,1,Minneapolis-St Paul International,980,4,47,2001.0,132,Delta Air Lines Inc.,...,0.000144,0.000149,18,Bradley International,0.0,0.0,0.0,93.0,4.7,20
1,4,12,5,Los Angeles International,1797,2,40,2001.0,160,Delta Air Lines Inc.,...,0.000144,0.000149,18,Cincinnati/Northern Kentucky International,0.0,0.0,0.0,73.0,12.3,10
2,10,11,5,Ronald Reagan Washington National,298,2,23,2003.0,50,Comair Inc.,...,0.0,9e-05,16,Norfolk International,0.0,0.0,0.0,75.0,5.82,7
3,9,26,4,LaGuardia,733,5,28,2007.0,120,United Air Lines Inc.,...,0.000254,0.000229,12,Chicago O'Hare International,0.0,0.0,0.0,82.0,9.4,20
4,8,13,2,Detroit Metro Wayne County,164,9,64,2003.0,50,SkyWest Airlines Inc.,...,3.4e-05,9.9e-05,16,Kalamazoo/Battle Creek International,0.0,0.0,0.0,88.0,7.61,20


In [61]:
test_sample.shape

(298811, 26)

In [62]:
# Assuming 'X_df' is your dataset and 'Departing Airport Name' is a categorical column
categorical_columns = ['PREVIOUS_AIRPORT', 'CARRIER_NAME', 'DEPARTING_AIRPORT']

# Apply one-hot encoding to categorical columns
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
test_encoded = encoder.fit_transform(test_sample[categorical_columns])

# Drop the original categorical columns from the original DataFrame
test_sample = test_sample.drop(categorical_columns, axis=1)

# Convert the one-hot encoded NumPy array back to a DataFrame with feature names
encoded_feature_names = []
for category, column in zip(encoder.categories_, categorical_columns):
    encoded_feature_names.extend([f"{column}_{cat}" for cat in category])
test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_feature_names)

# Concatenate the one-hot encoded columns with the original dataset
test_df = pd.concat([test_sample, test_encoded_df], axis=1)



In [73]:
test_df = test_df.drop('PREVIOUS_AIRPORT_Adak NS', axis =1)
test_df['PREVIOUS_AIRPORT_Pago Pago International'] = 0
test_df['PREVIOUS_AIRPORT_Yakutat Airport'] = 0


In [74]:
for i in test_df.columns: 
    if i in New_X_df.columns:
        continue
    else: 
        print(i)

In [75]:
test_df.shape

(298811, 473)

In [76]:
New_test_df = test_df.to_numpy()

In [77]:
New_test_df.shape

(298811, 473)

In [78]:
# Make predictions on the test data
y_pred = gbm.predict(New_test_df)

In [83]:
y_pred.shape

(298811,)

In [84]:

# Assuming you have a list of IDs for your test data (starting from 0)
ids = range(len(New_test_df))

y_probabilities = gbm.predict_proba(New_test_df)
probabilities_class_1 = y_probabilities[:, 1]

# Create a DataFrame with the IDs and corresponding probabilities
output_df = pd.DataFrame({'Id': ids, 'IS_DELAYED': probabilities_class_1})

# Save the DataFrame to a CSV file
output_df.to_csv('submission.csv', index=False)

In [85]:
y_probabilities.shape

(298811, 2)

In [86]:
print(y_probabilities)

[[0.55606071 0.44393929]
 [0.58077518 0.41922482]
 [0.78852985 0.21147015]
 ...
 [0.62471108 0.37528892]
 [0.80500722 0.19499278]
 [0.65607532 0.34392468]]


Accuracy: 0.6601025493922335
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.85      0.75     83870
           1       0.62      0.37      0.47     55575

    accuracy                           0.66    139445
   macro avg       0.65      0.61      0.61    139445
weighted avg       0.65      0.66      0.64    139445

#It takes 16 minutes to run

In [10]:
from sklearn.model_selection import train_test_split, GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0]
}
grid_search = GridSearchCV(gbm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 