In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Machine Learning

In [None]:
%matplotlib inline
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# Load the raw flight records; fields in this file are separated by semicolons.
df = pd.read_csv(
    '/kaggle/input/cln-flr-bgo-csv/CLN_FLR_BGO_21_24.csv',
    sep=';',
)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Keep only the rows flagged as departures (Dep == 1).
dep_only_df = df.loc[df['Dep'].eq(1)]
# Columns this analysis does not use.
columns_to_drop = ['Arr', 'Flight In', 'STA', 'ATA']
cleaned_df = dep_only_df.drop(columns_to_drop, axis=1)

In [None]:
# Check the departure-only frame after the unused columns were dropped.
cleaned_df.head()

In [None]:
def _clock_to_minutes(clock):
    """Return minutes since midnight for a value exposing .hour and .minute."""
    return clock.hour * 60 + clock.minute


# Parse the flight date, then reduce the STD / ATD strings to datetime.time
# values (presumably scheduled and actual time of departure — TODO confirm).
cleaned_df['Date'] = pd.to_datetime(cleaned_df['Date'], format='%d/%m/%Y')
cleaned_df['STD'] = pd.to_datetime(cleaned_df['STD'], format='%H:%M').dt.time
cleaned_df['ATD'] = pd.to_datetime(cleaned_df['ATD'], format='%H:%M').dt.time

# Calendar features taken from the parsed date.
cleaned_df['DayOfWeek'] = cleaned_df['Date'].dt.dayofweek
cleaned_df['Month'] = cleaned_df['Date'].dt.month

# Clock times expressed as minutes since midnight.
# NOTE(review): a departure that slips past midnight wraps around to a small
# value here — confirm whether that matters for the target.
cleaned_df['STD_Minutes'] = cleaned_df['STD'].apply(_clock_to_minutes)
cleaned_df['ATD_Minutes'] = cleaned_df['ATD'].apply(_clock_to_minutes)

cleaned_df = cleaned_df.drop(columns=['Origin', 'Delay Code / Time'])

# Remove the 'WF' / 'DH' / 'AT' substrings from the flight and aircraft codes.
cleaned_df['Flight Out'] = cleaned_df['Flight Out'].str.replace('WF', '')
cleaned_df['Aircraft Type'] = (
    cleaned_df['Aircraft Type'].str.replace('DH', '').str.replace('AT', '')
)

# Discard every row that still has a missing value, then confirm none remain.
cleaned_df = cleaned_df.dropna()
print(cleaned_df.isna().sum())

In [None]:
# Inspect the frame after date/time parsing and cleaning.
cleaned_df.head()

In [None]:
# List the dtype of every column in the cleaned frame.
print(cleaned_df.dtypes)

In [None]:
# Print pandas' summary of the cleaned frame (columns, dtypes, non-null counts).
cleaned_df.info()

In [None]:
# Destination -> integer code, keeping the fitted encoder for later lookups.
le_destination = LabelEncoder()
cleaned_df['Destination'] = le_destination.fit_transform(cleaned_df['Destination'])

# Original label -> encoded value, as a dict and as a two-column frame.
destination_mapping = {
    label: code
    for label, code in zip(
        le_destination.classes_,
        le_destination.transform(le_destination.classes_),
    )
}
destination_mapping_df = pd.DataFrame(
    list(destination_mapping.items()),
    columns=['Destination', 'Encoded_Value'],
)

# Aircraft registration -> integer code, handled the same way.
le_aircraft_reg = LabelEncoder()
cleaned_df['Aircraft Reg'] = le_aircraft_reg.fit_transform(cleaned_df['Aircraft Reg'])

aircraft_reg_mapping = {
    label: code
    for label, code in zip(
        le_aircraft_reg.classes_,
        le_aircraft_reg.transform(le_aircraft_reg.classes_),
    )
}
aircraft_reg_mapping_df = pd.DataFrame(
    list(aircraft_reg_mapping.items()),
    columns=['Aircraft Reg', 'Encoded_Value'],
)

print(destination_mapping_df)
print(aircraft_reg_mapping_df)

In [None]:
# Inspect the frame after 'Destination' and 'Aircraft Reg' were label-encoded.
cleaned_df.head()

In [None]:
# Show which aircraft type codes are present in the cleaned frame.
unique_aircraft_types = pd.unique(cleaned_df['Aircraft Type'])
print(unique_aircraft_types)

* Dash8-100 = 1
* Dash8-200 = 2
* Dash8-300 = 3
* Dash8-400 = 4
* Embraer E2 = 290
* ATR72 = 7

In [None]:
# Target is the actual departure minute; every remaining column except the
# raw date/time fields is used as a feature.
target_column = 'ATD_Minutes'
y = cleaned_df[target_column]
X = cleaned_df.drop(columns=[target_column, 'ATD', 'STD', 'Date'])  # Adjust according to the final set of features

# Hold out 30% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Baseline model: a 100-tree random forest fitted on the training split.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)

# Error metrics on the held-out test split.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases, so it would make this cell fail there.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print("Mean Squared Error:", mse)
print("Root mean squared error:", rmse)
print("R2-score", r2)
print("Mean Absolute Error:", mae)

In [None]:
# Actual vs. predicted departure minutes for the baseline model.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, predictions, color='blue', alpha=0.5)

# Diagonal reference line: points on it are perfect predictions.
lowest, highest = y_test.min(), y_test.max()
ax.plot([lowest, highest], [lowest, highest], color='red', linewidth=2)

ax.set_title('Actual vs Predicted Values')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()

In [None]:
# Bar chart of the forest's built-in feature importances, one bar per column of X.
importances = rf.feature_importances_
features = X.columns

fig, ax = plt.subplots()
ax.bar(features, importances)
# Rotate the column names so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
ax.set_title('Feature Importances')
plt.show()

# Feature engineering:

Adding a time of day feature (Morning, Afternoon, Evening, Night)

Adding a holiday feature (Christmas, Easter, summer vacation, etc.)

Adding a season feature (Winter, Spring, Summer, Autumn)

In [None]:
# 'STD' holds datetime.time values at this point (set in the cleaning cell);
# convert it to datetimes temporarily so the .dt accessor can supply the hour.
# NOTE(review): this relies on pd.to_datetime accepting time objects when a
# format of '%H:%M:%S' is given — confirm on the pandas version in use.
cleaned_df['STD'] = pd.to_datetime(cleaned_df['STD'], format='%H:%M:%S')
cleaned_df['STD_hour'] = cleaned_df['STD'].dt.hour

def get_time_of_day(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Args:
        hour: Hour value compared against the band limits below.

    Returns:
        'Morning' for 5 <= hour < 12, 'Afternoon' for 12 <= hour < 17,
        'Evening' for 17 <= hour < 21, and 'Night' for anything else.
    """
    # (inclusive start, exclusive end, label)
    bands = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, end, label in bands:
        if start <= hour < end:
            return label
    return 'Night'
    
# Bucket each STD hour into a time-of-day label.
cleaned_df['TimeOfDay'] = cleaned_df['STD_hour'].apply(get_time_of_day)
# Turn 'STD' back into plain time values after the temporary datetime
# conversion at the top of this cell.
# NOTE(review): 'STD' is presumably already datetime64 here, so the format
# argument likely has no effect — confirm.
cleaned_df['STD'] = pd.to_datetime(cleaned_df['STD'], format='%H:%M').dt.time

In [None]:
def get_season(date_obj):
    """Return the season name for a date-like object.

    Args:
        date_obj: Any object exposing a ``month`` attribute.

    Returns:
        'Spring' for months 3-5, 'Summer' for 6-8, 'Autumn' for 9-11,
        and 'Winter' for every other month value.
    """
    month = date_obj.month
    # (first month, first month of the next season, name)
    season_bounds = (
        (3, 6, 'Spring'),
        (6, 9, 'Summer'),
        (9, 12, 'Autumn'),
    )
    for first, stop, name in season_bounds:
        if first <= month < stop:
            return name
    return 'Winter'
    
# Label every row with the season derived from its 'Date' value.
cleaned_df['Season'] = cleaned_df['Date'].apply(get_season)

In [None]:
def identify_holiday(week_of_year, year):
    """Label a (week number, year) pair with a holiday period name.

    Args:
        week_of_year: Week number of the date.
        year: Calendar year of the date.

    Returns:
        'Christmas_Holidays', 'Winter_Holiday', 'Easter_Holiday',
        'Summer_Vacation', 'Autumn_Vacation', or 'Regular' when no window
        matches.
    """
    # Christmas covers three hard-coded seasons (2021/22, 2022/23, 2023/24):
    # week 50 onwards of the first year, week 1 or lower of the following year.
    # NOTE(review): week <= 1 of 2021 and week >= 50 of 2024 are not treated
    # as Christmas — confirm that is intended for the data range in use.
    december_side = week_of_year >= 50 and year in (2021, 2022, 2023)
    january_side = week_of_year <= 1 and year in (2022, 2023, 2024)
    if december_side or january_side:
        return 'Christmas_Holidays'

    # Remaining periods use the same week windows for every year.
    # NOTE(review): the Easter window is fixed at weeks 11-13 although Easter
    # moves between years — confirm this approximation is acceptable.
    fixed_windows = (
        (8, 9, 'Winter_Holiday'),
        (11, 13, 'Easter_Holiday'),
        (28, 30, 'Summer_Vacation'),
        (40, 41, 'Autumn_Vacation'),
    )
    for first_week, last_week, label in fixed_windows:
        if first_week <= week_of_year <= last_week:
            return label
    return 'Regular'

# Derive the ISO week number and the calendar year from 'Date' (expected to be
# datetime64[ns]), then label every row with its holiday period.
cleaned_df['WeekOfYear'] = cleaned_df['Date'].dt.isocalendar().week
cleaned_df['Year'] = cleaned_df['Date'].dt.year
cleaned_df['Holiday'] = [
    identify_holiday(week, year)
    for week, year in zip(cleaned_df['WeekOfYear'], cleaned_df['Year'])
]

In [None]:
# Inspect the frame with the new TimeOfDay, Season and Holiday columns.
cleaned_df.head()

In [None]:
# Remove the raw date, the departure flag and the intermediate calendar
# columns; the engineered categorical features stay in the frame.
cleaned_df = cleaned_df.drop(
    columns=['Date', 'Dep', 'DayOfWeek', 'Month', 'STD_hour', 'WeekOfYear', 'Year']
)

In [None]:
# Inspect the frame after the helper columns were removed.
cleaned_df.head()

In [None]:
# One-hot encode categorical variables
# (TimeOfDay, Season and Holiday are each replaced by one indicator column
# per category value, named '<column>_<value>').
cleaned_df = pd.get_dummies(cleaned_df, columns=['TimeOfDay', 'Season', 'Holiday'])

In [None]:
# Inspect the frame after one-hot encoding.
cleaned_df.head()

In [None]:
# List the column names now present; the feature selection below refers to them by name.
cleaned_df.columns

In [None]:
# Columns used by the second model, grouped by origin. 'ATD_Minutes' is the
# target and is removed again when the feature matrix is built.
flight_columns = ['Flight Out', 'Destination', 'Aircraft Type', 'Aircraft Reg',
                  'STD_Minutes', 'ATD_Minutes']
time_of_day_columns = ['TimeOfDay_Afternoon', 'TimeOfDay_Evening',
                       'TimeOfDay_Morning', 'TimeOfDay_Night']
season_columns = ['Season_Autumn', 'Season_Spring', 'Season_Summer',
                  'Season_Winter']
holiday_columns = ['Holiday_Autumn_Vacation', 'Holiday_Christmas_Holidays',
                   'Holiday_Easter_Holiday', 'Holiday_Regular',
                   'Holiday_Summer_Vacation', 'Holiday_Winter_Holiday']
selected_columns = (
    flight_columns + time_of_day_columns + season_columns + holiday_columns
)

X = cleaned_df[selected_columns].drop(columns='ATD_Minutes')  # Features
y = cleaned_df['ATD_Minutes']  # Target variable

# Hold out 30% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random forest with default settings, fitted on the engineered features.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
predictions = model.predict(X_test)

# Error metrics on the held-out test split.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases, so it would make this cell fail there.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print("Mean Squared Error:", mse)
print("Root mean squared error:", rmse)
print("R2-score", r2)
print("Mean Absolute Error:", mae)

In [None]:
from sklearn.inspection import permutation_importance

# Actual vs. predicted departure minutes for the model with engineered features.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, predictions, color='blue', alpha=0.5)

# Diagonal reference line: points on it are perfect predictions.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, color='red', linewidth=2)

ax.set_title('Actual vs Predicted Values')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()

# Permutation importance on the test split: 10 shuffles per feature, fixed
# seed, using all available cores.
result = permutation_importance(
    model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)
# Order features by mean importance so the largest bar is drawn last (on top).
sorted_idx = np.argsort(result.importances_mean)

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(X.columns[sorted_idx], result.importances_mean[sorted_idx])
ax.set_xlabel('Permutation Importance')
ax.set_title('Feature Importance')
plt.show()

Saving the model.

In [None]:
import pickle

# Serialise the fitted model into the working directory.
# Only unpickle this file from a trusted source: loading a pickle can run
# arbitrary code.
with open('random_forest_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)


# Deep Learning

In [None]:
# Use the explicit %pip magic: it installs into the environment of the running
# kernel and does not depend on IPython's automagic setting being enabled.
%pip install fastai

In [None]:
# Reload the raw CSV for the deep-learning experiment and drop the columns it
# does not use.
# NOTE(review): unlike the Machine Learning section, rows are not filtered on
# Dep == 1 here — confirm that is intended.
dlf = pd.read_csv(
    '/kaggle/input/cln-flr-bgo-csv/CLN_FLR_BGO_21_24.csv',
    delimiter=';',
).drop(columns=['Arr', 'Flight In', 'Dep', 'Origin', 'STA', 'ATA', 'Delay Code / Time'])
dlf.head()

In [None]:
def time_to_minutes(time_str):
    """Convert an 'H:M' clock string to minutes since midnight.

    Args:
        time_str: A string with exactly two ':'-separated integer parts, or a
            missing value (anything ``pd.isna`` reports as missing).

    Returns:
        Hours * 60 + minutes as an int, or None when the input is missing.
    """
    if pd.isna(time_str):
        return None
    # Unpacking into two names requires exactly two parts.
    hour_part, minute_part = (int(piece) for piece in time_str.split(':'))
    return 60 * hour_part + minute_part

# Add minute-of-day columns for STD and ATD, keep only rows that have an ATD
# value, then parse 'Date' (day/month/year) on the rows that remain.
dlf = (
    dlf
    .assign(
        STD_Minutes=dlf['STD'].apply(time_to_minutes),
        ATD_Minutes=dlf['ATD'].apply(time_to_minutes),
    )
    .dropna(subset=['ATD_Minutes'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%d/%m/%Y'))
)

In [None]:
# Inspect the frame after the minute columns were added and 'Date' was parsed.
dlf.head()

In [None]:
from fastai.tabular.all import *

# Column roles for the tabular model.
y_names = 'ATD_Minutes'
continuous_cols = ['STD_Minutes']
categorical_cols = ['Flight Out', 'Destination', 'Aircraft Type', 'Aircraft Reg']

# Preprocessing steps handed to TabularPandas.
procs = [Categorify, FillMissing, Normalize]

# Seeded random split that holds out 20% of the row positions for validation.
splitter = RandomSplitter(valid_pct=0.2, seed=42)
splits = splitter(range_of(dlf))

to = TabularPandas(
    dlf,
    procs=procs,
    cat_names=categorical_cols,
    cont_names=continuous_cols,
    y_names=y_names,
    y_block=RegressionBlock(),
    splits=splits,
)

# Batches of 64 rows.
dls = to.dataloaders(bs=64)

In [None]:
# Build a tabular learner on the dataloaders, reporting `rmse` as its metric.
# NOTE(review): the name `rmse` is also assigned a float in the Machine
# Learning section; this cell relies on the fastai star import having rebound
# it to a metric — confirm before running cells out of order.
learn = tabular_learner(dls, metrics=rmse)

In [None]:
# Train with fastai's one-cycle schedule; 15 is the number of epochs.
learn.fit_one_cycle(15)

In [None]:
# Predictions and targets returned by the trained learner.
preds, targs = learn.get_preds()

mae = mean_absolute_error(targs, preds)
mse = mean_squared_error(targs, preds)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases. The value is stored as `rmse_value` so that the name
# `rmse`, which the learner cell passes to tabular_learner as its metric, is
# not replaced by a float (that would break re-running the learner cell).
rmse_value = np.sqrt(mse)
r2 = r2_score(targs, preds)

print("Mean Squared Error:", mse)
print("Root mean squared error:", rmse_value)
print("R2-score", r2)
print("Mean Absolute Error:", mae)

In [None]:
# Display the learner's built-in results view for a visual check of its predictions.
learn.show_results()

In [None]:
# Actual vs. predicted departure minutes for the neural network.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(targs, preds, color='blue', alpha=0.5)

# Diagonal reference line: points on it are perfect predictions.
target_range = [targs.min(), targs.max()]
ax.plot(target_range, target_range, color='red', linewidth=2)

ax.set_title('Actual vs Predicted Values')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()