In [None]:
# pandas is the table library used by every later cell.
import pandas as pd
# Colab-specific module; this notebook therefore expects to run inside Google Colab.
from google.colab import drive
# Mount Google Drive at /content/gdrive. Later cells open files through the
# relative path 'gdrive/My Drive/...', which presumably relies on the working
# directory being /content -- TODO confirm when running outside the default Colab setup.
drive.mount('/content/gdrive')

In [None]:
# Read the raw export from the mounted Drive folder.
df = pd.read_csv('gdrive/My Drive/CompleteData.csv')

# Columns removed before any analysis (listed in the order they were originally given).
columns_to_drop = [
    'DEP_HOUR', 'MKT_UNIQUE_CARRIER', 'MKT_CARRIER_FL_NUM',
    'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM', 'TAIL_NUM',
    'ORIGIN', 'DEST', 'DEP_TIME', 'CRS_DEP_TIME',
    'TAXI_OUT', 'DEP_DELAY', 'AIR_TIME', 'DISTANCE',
    'LATITUDE', 'LONGITUDE', 'ELEVATION', 'MESONET_STATION',
    'YEAR OF MANUFACTURE', 'MANUFACTURER', 'ICAO TYPE',
    'RANGE', 'WIDTH',
]

# Strip those columns, then discard every row that still contains a missing value.
df = df.drop(columns_to_drop, axis=1).dropna(axis=0, how='any')

# Persist the cleaned table and load it back as `data`, the frame the
# following cells work on.
df.to_csv("gdrive/My Drive/Cleaned_Data.csv", index=False)
data = pd.read_csv('gdrive/My Drive/Cleaned_Data.csv')

In [None]:
# Quick sanity checks on the cleaned frame.
# A notebook cell renders only its last expression, so the intermediate
# results are printed explicitly; otherwise they are computed and thrown away.

# First few rows
print(data.head())

# Missing values per column
print(data.isnull().sum())

# Summary statistics
print(data.describe())

# Pairwise correlations. Only numeric columns are passed in: the frame still
# holds non-numeric columns (FL_DATE is parsed with pd.to_datetime further
# down), and DataFrame.corr() raises on those in pandas >= 2.0.
data.select_dtypes(include='number').corr()

In [None]:
#EDA Graphs Code

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import calendar



# Folder (relative to the working directory) that receives the saved figures.
save_dir = "gdrive/My Drive/"

# Display labels for the numeric CANCELLED codes.
reason_mapping = {
    0: 'Not Cancelled',
    1: 'Carrier Cancellation',
    2: 'Weather Cancellation',
    3: 'National Air System Cancellation',
    4: 'Security Cancellation'
}

# Translate the codes to labels.
# `replace` is used rather than `map` so the cell can be re-run safely: `map`
# turns every value that is not a dictionary key into NaN, which would wipe
# the already-translated labels on a second run, whereas `replace` leaves
# such values untouched.
data['CANCELLED'] = data['CANCELLED'].replace(reason_mapping)


# Display labels for the numeric ACTIVE_WEATHER codes.
active_mapping = {
    0: 'No weather events present',
    1: 'Weather event(s) present',
    2: 'Significant weather event(s)',

}

# Same re-run-safe translation for ACTIVE_WEATHER.
data['ACTIVE_WEATHER'] = data['ACTIVE_WEATHER'].replace(active_mapping)


# Parse the flight date and derive calendar parts used by the charts below.
data['FL_DATE'] = pd.to_datetime(data['FL_DATE'])
data['Year'] = data['FL_DATE'].dt.year
# Month is stored as its abbreviated name (calendar.month_abbr), not as a number.
data['Month'] = data['FL_DATE'].dt.month.apply(lambda x: calendar.month_abbr[x])
data['Day'] = data['FL_DATE'].dt.day


# Display the first few rows of the dataset
print(data.head())
# Get summary statistics of numerical columns
print(data.describe())
# Column names, dtypes and non-null counts. DataFrame.info() prints its report
# itself and returns None, so wrapping it in print() only added a stray "None".
data.info()

# Check for missing values in each column
print(data.isnull().sum())

# Consecutive 10-wide temperature bins as (lower, upper) pairs, from -30 up to 40.
temperature_ranges = [(low, low + 10) for low in range(-30, 40, 10)]


def get_temperature_range(temperature):
    """Return the 'lower-upper' label of the bin that contains `temperature`.

    Bins are read from the module-level `temperature_ranges`; the lower bound
    is inclusive and the upper bound exclusive. When no bin matches (value
    below -30, at or above 40, or NaN) the result is None.
    """
    matching_labels = (
        f'{low}-{high}'
        for low, high in temperature_ranges
        if low <= temperature < high
    )
    return next(matching_labels, None)



# Grouped bar chart: one group per ACTIVE_WEATHER label, one bar per cancellation type.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=data, x='ACTIVE_WEATHER', hue='CANCELLED', ax=ax)
ax.set_title('Cancellation Type by Active Weather')
ax.set_xlabel('Active Weather')
ax.set_ylabel('Count')
# Legend sits outside the axes, to the right of the plot area.
ax.legend(title='Cancellation Type', loc='upper left', bbox_to_anchor=(1.05, 1))
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
# Show full counts with thousands separators on the y-axis.
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
# This figure is not written to disk (saving was left disabled here).
plt.show()

# Number of flights per (active weather, cancellation type) pair, one column per type.
weather_pair_counts = data.groupby(['ACTIVE_WEATHER', 'CANCELLED']).size()
act_cancellation_counts = weather_pair_counts.unstack(fill_value=0)
# Flatten the index so ACTIVE_WEATHER becomes an ordinary column, then show the table.
act_summary_table = act_cancellation_counts.reset_index()
print(act_summary_table)



# Grouped bar chart of cancellation type for each VISIBILITY value.
plt.figure(figsize=(12, 8))
sns.countplot(x='VISIBILITY', hue='CANCELLED', data=data)
# Title and axis label previously read "Visibilty"; spelling corrected.
plt.title('Cancellation Type by Visibility')
plt.xlabel('Visibility')
plt.ylabel('Count')
plt.legend(title='Cancellation Type',bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
# Format y-axis ticks to show full numbers
plt.gca().yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
# The output file name is deliberately left as it was, so anything that
# already points at this image keeps working.
plt.savefig(save_dir + 'cancellation_by_Visibilty.png')  # Save the visualization as an image file
plt.show()

# Count occurrences of each cancellation type for each visibility value
vis_cancellation_counts = data.groupby(['VISIBILITY', 'CANCELLED']).size().unstack(fill_value=0)
# Create a summary table from the counts
vis_summary_table = vis_cancellation_counts.reset_index()
# Display the summary table
print(vis_summary_table)




# Bucket every TEMPERATURE reading; readings that fall outside all bins get None.
data['Temp_Range'] = data['TEMPERATURE'].apply(get_temperature_range)

# Bin labels in ascending order, so the bars follow the temperature scale.
temp_bin_labels = ['{}-{}'.format(low, high) for low, high in temperature_ranges]

fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=data, x='Temp_Range', hue='CANCELLED', order=temp_bin_labels, ax=ax)
ax.set_title('Cancellation Type by Temperature')
ax.set_xlabel('Temperature')
ax.set_ylabel('Count')
# Legend sits outside the axes, to the right of the plot area.
ax.legend(title='Cancellation Type', loc='upper left', bbox_to_anchor=(1.05, 1))
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
# Show full counts with thousands separators on the y-axis.
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
fig.savefig(save_dir + 'Temperature.png')  # write the chart to the Drive folder
plt.show()


# Number of flights per (temperature bin, cancellation type) pair, one column per type.
temp_pair_counts = data.groupby(['Temp_Range', 'CANCELLED']).size()
temperature_cancellation_counts = temp_pair_counts.unstack(fill_value=0)
# Flatten the index so Temp_Range becomes an ordinary column, then show the table.
summary_table = temperature_cancellation_counts.reset_index()
print(summary_table)


# Weather measurements compared against each other in the heatmap.
weather_features = [
    'WIND_SPD',
    'VISIBILITY',
    'TEMPERATURE',
    'DEW_POINT',
    'REL_HUMIDITY',
    'ALTIMETER',
    'CLOUD_COVER',
]
correlation_matrix = data.loc[:, weather_features].corr()

# Annotated heatmap of the pairwise correlations.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, ax=ax)
ax.set_title('Correlation Heatmap of Weather Features')
fig.tight_layout()
fig.savefig(save_dir + 'corr_weather_features.png')  # write the chart to the Drive folder
plt.show()


# Bar chart with one bar per CANCELLED label, showing how many flights carry it.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='CANCELLED', ax=ax)
ax.set_title('Cancellation Type')
ax.set_xlabel('Type')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
# Show full counts with thousands separators on the y-axis.
ax.yaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))
fig.savefig(save_dir + 'barplot_cancel_types.png')  # write the chart to the Drive folder
plt.show()

# Keep only flights that were actually cancelled. `.copy()` makes this an
# independent frame, so later cells can add columns to it without pandas
# warning about writing to a slice of `data`.
cancelled_data = data[data['CANCELLED'] != 'Not Cancelled'].copy()
# Count cancellations per (month, cancellation type); one column per type.
cancellations_by_month = cancelled_data.groupby('Month')['CANCELLED'].value_counts().unstack()
# Sort months chronologically (Month holds abbreviated names, which would
# otherwise sort alphabetically).
months_in_order = [calendar.month_abbr[i] for i in range(1, 13)]
cancellations_by_month = cancellations_by_month.reindex(months_in_order)
# Create the axes explicitly and hand them to pandas. Previously a separate
# plt.figure(figsize=(32, 20)) was opened first, but DataFrame.plot() without
# `ax=` opens its own figure, so that first figure stayed empty and showed up
# as a blank output.
# NOTE(review): the chart keeps the default size it was actually rendered
# with; pass figsize= to plt.subplots() if a larger canvas is wanted.
fig, ax = plt.subplots()
cancellations_by_month.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Cancellations by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Count')
# Move the legend outside the plot area to the left and at the bottom
legend = ax.legend(title='Cancellation Type', bbox_to_anchor=(0, -0.3), loc='upper left')
# Adjust layout to make room for the legend
fig.subplots_adjust(bottom=0.2)  # Increase the bottom margin to make space for the legend
plt.setp(ax.get_xticklabels(), rotation=0)
fig.tight_layout()
fig.savefig(save_dir + 'cancels_by_month.png')  # Save the visualization as an image file
plt.show()

# Count occurrences of each cancellation type in each month (all flights,
# including the ones that were not cancelled)
month_cancellation_counts = data.groupby(['Month', 'CANCELLED']).size().unstack(fill_value=0)
# Create a summary table from the counts
month_table = month_cancellation_counts.reset_index()
# Display the summary table
print(month_table)


# Same active-weather chart as before, restricted to cancelled flights.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=cancelled_data, x='ACTIVE_WEATHER', hue='CANCELLED', ax=ax)
ax.set_title('Cancellation Type by Active Weather')
ax.set_xlabel('Active Weather')
ax.set_ylabel('Count')
# Legend sits outside the axes, to the right of the plot area.
ax.legend(title='Cancellation Type', loc='upper left', bbox_to_anchor=(1.05, 1))
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig(save_dir + 'not-cancel_cancellation_by_active_weather.png')  # write the chart to the Drive folder
plt.show()


# Cancellation type per VISIBILITY value, restricted to cancelled flights.
plt.figure(figsize=(12, 8))
sns.countplot(x='VISIBILITY', hue='CANCELLED', data=cancelled_data)
# Title and axis label previously read "Visibilty"; spelling corrected.
plt.title('Cancellation Type by Visibility')
plt.xlabel('Visibility')
plt.ylabel('Count')
plt.legend(title='Cancellation Type',bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
# The output file name is deliberately left as it was, so anything that
# already points at this image keeps working.
plt.savefig(save_dir + 'not-cancel_cancellation_by_Visibilty.png')  # Save the visualization as an image file
plt.show()




# Add the temperature bin of every cancelled flight as 'Temperature Range'.
# `assign` builds a new frame instead of writing into `cancelled_data`, which
# was obtained by filtering `data`; assigning a column to such a filtered
# frame triggers pandas' SettingWithCopyWarning.
cancelled_data = cancelled_data.assign(
    **{'Temperature Range': cancelled_data['TEMPERATURE'].apply(get_temperature_range)}
)

# Grouped bar chart per temperature bin, bins shown in ascending order.
plt.figure(figsize=(12, 8))
sns.countplot(x='Temperature Range', hue='CANCELLED', data=cancelled_data,
              order=[f'{min_temp}-{max_temp}' for min_temp, max_temp in temperature_ranges])
plt.title('Cancellation Type by Temperature Ranges')
plt.xlabel('Temperature Range')
plt.ylabel('Count')
plt.legend(title='Cancellation Type',bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(save_dir + 'not-cancel_Temperature.png')  # Save the visualization as an image file
plt.show()



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# `df` is the same object as `data` (no copy), so the columns added below
# also appear on `data`.
df = data

# Preprocess datetime variables
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])
df['year'] = df['FL_DATE'].dt.year
df['month'] = df['FL_DATE'].dt.month
df['day'] = df['FL_DATE'].dt.day

target = 'CANCELLED'  # the target variable

# Columns that must never be used as predictors: the target itself, the raw
# date, and the helper columns created by the EDA cell, which either duplicate
# year/month/day ('Year', 'Day') or are text labels ('Month', 'Temp_Range').
excluded_columns = {target, 'FL_DATE', 'Year', 'Month', 'Day', 'Temp_Range'}
candidate_columns = [col for col in df.columns if col not in excluded_columns]

# scikit-learn's random forest needs numeric input, so keep only numeric
# columns. Text columns (e.g. ACTIVE_WEATHER once the EDA cell has replaced
# its codes with labels) would make `fit` fail; they are listed below so the
# omission is visible.
numeric_columns = set(df[candidate_columns].select_dtypes(include=['number', 'bool']).columns)
features = [col for col in candidate_columns if col in numeric_columns]
skipped_columns = [col for col in candidate_columns if col not in numeric_columns]
if skipped_columns:
    print("Non-numeric columns left out of the feature list:", skipped_columns)

print(features)

In [None]:
#RANDOM FOREST CODE

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import log_loss

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

# Train the random forest model. A fixed random_state (the same seed used for
# the split and by the tuned models later in the notebook) makes the reported
# numbers reproducible between runs.
rf_model = RandomForestClassifier(random_state=42, verbose=2)
rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Calculate evaluation metrics.
# Note: for a single-label multiclass problem, micro-averaged precision,
# recall and F1 are all equal to accuracy, so these four numbers coincide.
accuracy = accuracy_score(y_test, y_pred)
precision_micro = precision_score(y_test, y_pred, average='micro')
recall_micro = recall_score(y_test, y_pred, average='micro')
f1_micro = f1_score(y_test, y_pred, average='micro')
# Class probabilities for the testing set (columns follow rf_model.classes_)
y_pred_prob = rf_model.predict_proba(X_test)
# Calculate log loss. `labels` is passed explicitly because a rare class can
# be absent from the test split; without it log_loss raises when y_test has
# fewer classes than the probability matrix has columns.
logloss = log_loss(y_test, y_pred_prob, labels=rf_model.classes_)



# Calculate error rates
error_rate = 1 - accuracy
precision_error = 1 - precision_micro
recall_error = 1 - recall_micro
f1_error = 1 - f1_micro

# Print the evaluation metrics and error rates
print("Accuracy:", accuracy)
print("Error Rate:", error_rate)
print("Precision (Micro):", precision_micro)
print("Precision Error:", precision_error)
print("Recall (Micro):", recall_micro)
print("Recall Error:", recall_error)
print("F1-score (Micro):", f1_micro)
print("F1-score Error:", f1_error)
# Print log loss
print("Log Loss:", logloss)

# Generate the classification report
report = classification_report(y_test, y_pred)


# Display the classification report
print(report)

In [None]:
#Logistic regression

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# One-vs-rest wrapper, imported here because it is only needed by this cell.
# It replaces LogisticRegression(multi_class='ovr'): that option is deprecated
# in recent scikit-learn releases, and wrapping the estimator is the
# documented way to keep the one-vs-rest strategy.
from sklearn.multiclass import OneVsRestClassifier

# import data
# NOTE(review): this rebinds the notebook-level `df` (previously an alias of
# `data`) to the raw, un-trimmed CSV; later cells that index `df` with
# `features` are affected -- confirm this is intended.
df = pd.read_csv('gdrive/My Drive/CompleteData.csv')
# drop rows with missing values
df = df.dropna(how='any')

# Convert datetime column to pandas datetime format (if needed)
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])

# Set datetime column as the index
df = df.set_index('FL_DATE')

# Split the data into features (X) and target variable (y)

X = df[['WIND_DIR', 'WIND_SPD', 'WIND_GUST', 'VISIBILITY', 'TEMPERATURE',
        'DEW_POINT', 'REL_HUMIDITY', 'ALTIMETER', 'LOWEST_CLOUD_LAYER', 'N_CLOUD_LAYER',
        'LOW_LEVEL_CLOUD', 'MID_LEVEL_CLOUD', 'HIGH_LEVEL_CLOUD',
        'CLOUD_COVER', 'ACTIVE_WEATHER',]]  # Select relevant weather-related features


y = df['CANCELLED']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One binary logistic regression per class (one-vs-rest)
logreg = OneVsRestClassifier(LogisticRegression())

# Fit the model to the training data
logreg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logreg.predict(X_test)

# Output the classification report. The text is stored in `report` rather
# than `classification_report`, so the imported function is not overwritten
# by its own result.
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the dataset into training and test sets.
# X and y are the weather feature matrix and CANCELLED target prepared in the
# logistic-regression cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the MLP classifier (two hidden layers of 100 units)
mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=42, verbose=True)
mlp.fit(X_train, y_train)

# Predict on the test set
y_pred = mlp.predict(X_test)

# Generate classification report. The text is stored in `report` rather than
# `classification_report`, so the imported function is not overwritten by its
# own result.
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

In [None]:
# Keep only flights that were cancelled.
# CANCELLED holds numeric codes in the cleaned CSV, but text labels once the
# EDA cell has run. Comparing text labels with 0 is always "not equal", so the
# original `!= 0` test kept every row in that case. Both spellings of
# "not cancelled" are therefore excluded. `.copy()` makes the result an
# independent frame so the column assignment below does not write to a slice.
is_cancelled = (data['CANCELLED'] != 0) & (data['CANCELLED'] != 'Not Cancelled')
data = data[is_cancelled].copy()

# Numeric code -> snake_case label
mapping = {
    1: 'carrier_cancelled',
    2: 'weather_cancelled',
    3: 'national_air_system_cancelled',
    4: 'security_cancelled'
}

# EDA display label -> the same snake_case label, so the saved file uses one
# naming scheme whichever representation CANCELLED currently has.
label_mapping = {
    'Carrier Cancellation': 'carrier_cancelled',
    'Weather Cancellation': 'weather_cancelled',
    'National Air System Cancellation': 'national_air_system_cancelled',
    'Security Cancellation': 'security_cancelled'
}

# Rename values in 'CANCELLED' column based on the mappings
data['CANCELLED'] = data['CANCELLED'].replace({**mapping, **label_mapping})

# Print the updated DataFrame
print(data)

# The file name is kept as-is; the file contains only cancelled flights.
data.to_csv('gdrive/My Drive/only_non_cancelled_type_data.csv', index=False)

In [None]:
# Quick sanity checks on the cancelled-only frame.
# A notebook cell renders only its last expression, so the intermediate
# results are printed explicitly; otherwise they are computed and thrown away.

# First few rows
print(data.head())

# Missing values per column
print(data.isnull().sum())

# Summary statistics
print(data.describe())

# Pairwise correlations. Only numeric columns are passed in: by this point the
# frame contains date and text columns (FL_DATE, CANCELLED labels, ...), and
# DataFrame.corr() raises on those in pandas >= 2.0.
data.select_dtypes(include='number').corr()

In [None]:
#  RF model 1 default hyperparameters

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Build the modelling matrix from `data`, the cancelled-only frame prepared
# above. When the notebook runs top to bottom, `df` has been rebound by the
# logistic-regression cell to the raw CSV, which lacks the derived columns
# named in `features`, so `df[features]` raises KeyError.
# NOTE(review): confirm that the cancelled-only subset is the intended
# training set for this model.
model_columns = [col for col in features if col in data.columns]
# Random forests need numeric input; drop any text column still listed in `features`.
X_all = data[model_columns].select_dtypes(include=['number', 'bool'])
y_all = data[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Train the random forest model with default hyperparameters. The seed is
# fixed (same value as the tuned models below) so results are reproducible.
rf_model = RandomForestClassifier(random_state=42, verbose=2)
rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)

# Display the classification report
print(report)

In [None]:
#  RF model 2 for better results attempt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Build the modelling matrix from `data`, the cancelled-only frame prepared
# above. When the notebook runs top to bottom, `df` has been rebound by the
# logistic-regression cell to the raw CSV, which lacks the derived columns
# named in `features`, so `df[features]` raises KeyError.
# NOTE(review): confirm that the cancelled-only subset is the intended
# training set for this model.
model_columns = [col for col in features if col in data.columns]
# Random forests need numeric input; drop any text column still listed in `features`.
X_all = data[model_columns].select_dtypes(include=['number', 'bool'])
y_all = data[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Train the random forest model with hand-picked hyperparameters.
# NOTE(review): the author recorded "83, 56, 67" for this configuration --
# presumably metric percentages from an earlier run; confirm which metrics.
rf_model = RandomForestClassifier(
    n_estimators=100,  # Number of trees
    max_depth=10,      # Limit tree depth to reduce overfitting
    min_samples_split=5,  # Minimum samples required to split a node
    min_samples_leaf=2,   # Minimum samples required to be a leaf node
    class_weight='balanced',  # Adjust class weights for imbalanced data
    random_state=42,
    verbose=2
)



rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)

# Display the classification report
print(report)

In [None]:
#  RF model 3 for better results attempt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Build the modelling matrix from `data`, the cancelled-only frame prepared
# above. When the notebook runs top to bottom, `df` has been rebound by the
# logistic-regression cell to the raw CSV, which lacks the derived columns
# named in `features`, so `df[features]` raises KeyError.
# NOTE(review): confirm that the cancelled-only subset is the intended
# training set for this model.
model_columns = [col for col in features if col in data.columns]
# Random forests need numeric input; drop any text column still listed in `features`.
X_all = data[model_columns].select_dtypes(include=['number', 'bool'])
y_all = data[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Train the random forest model with hand-picked hyperparameters (deeper
# trees and a larger split threshold than model 2).
# NOTE(review): the author recorded "83, 65, 73" for this configuration --
# presumably metric percentages from an earlier run; confirm which metrics.
rf_model = RandomForestClassifier(
    n_estimators=100,  # Number of trees
    max_depth=15,      # Limit tree depth to reduce overfitting
    min_samples_split=10,  # Minimum samples required to split a node
    min_samples_leaf=2,   # Minimum samples required to be a leaf node
    class_weight='balanced',  # Adjust class weights for imbalanced data
    random_state=42,
    verbose=2
)


rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)

# Display the classification report
print(report)

In [None]:
#  RF model 4 for better results attempt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Build the modelling matrix from `data`, the cancelled-only frame prepared
# above. When the notebook runs top to bottom, `df` has been rebound by the
# logistic-regression cell to the raw CSV, which lacks the derived columns
# named in `features`, so `df[features]` raises KeyError.
# NOTE(review): confirm that the cancelled-only subset is the intended
# training set for this model.
model_columns = [col for col in features if col in data.columns]
# Random forests need numeric input; drop any text column still listed in `features`.
X_all = data[model_columns].select_dtypes(include=['number', 'bool'])
y_all = data[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Train the random forest model with hand-picked hyperparameters (deeper
# trees than model 3).
# NOTE(review): the author recorded "81, 69, 74" for this configuration --
# presumably metric percentages from an earlier run; confirm which metrics.
rf_model = RandomForestClassifier(
    n_estimators=100,  # Number of trees
    max_depth=25,      # Limit tree depth to reduce overfitting
    min_samples_split=10,  # Minimum samples required to split a node
    min_samples_leaf=2,   # Minimum samples required to be a leaf node
    class_weight='balanced',  # Adjust class weights for imbalanced data
    random_state=42,
    verbose=2
)

rf_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_model.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred)

# Display the classification report
print(report)