# ML Models

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime

# Machine learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

### Load dataset

In [3]:
flights = pd.read_csv("/work/Dataset_Post_Analysis.csv", sep=",")

flights.head()

Unnamed: 0,Flight_ID,Aeronave,Capacity,Passengers,Bookings,STD-int,STA-int,DUR_HR,DepartureStation_E,ArrivalStation_E,...,Q_Perecederos,Q_Refrescos,Q_Sopas,Total_Quantity,WeekoftheYear,Month,DUR_INTERVAL,DayoftheWeek_E,Time_Interval_E,Week_Month_Label_E
0,ab954014077430bd842cfa305a55c0f8,XA-VBY,240,229.0,157.0,1697716000.0,1697726000.0,2.75,15.0,21.0,...,4.0,17.0,0.0,42.0,42,10,2.0,4.0,1.0,42.0
1,efd86c996035dacdca7a0ccb2560dda1,XA-VIX,186,186.0,109.0,1688346000.0,1688360000.0,4.0,34.0,17.0,...,1.0,6.0,2.0,15.0,27,7,3.0,1.0,0.0,22.0
2,dd0fad3248951d2f71d63e6279aeaa4b,XA-VBW,220,200.0,142.0,1687792000.0,1687795000.0,0.6667,18.0,14.0,...,0.0,0.0,0.0,0.0,26,6,0.0,1.0,2.0,21.0
3,d0987ee648eea254063bfe2b39571b67,XA-VAP,186,162.0,90.0,1676018000.0,1676023000.0,1.1667,22.0,0.0,...,3.0,7.0,0.0,22.0,6,2,1.0,0.0,1.0,57.0
4,3b5df8805161ea827d2f2e4298c38e06,XA-VBY,240,183.0,125.0,1694107000.0,1694110000.0,0.9167,6.0,13.0,...,0.0,0.0,0.0,2.0,36,9,0.0,4.0,2.0,34.0


### Split dataset into training and test

In [8]:
# PUT HERE THE ARRAY OF TARGETS
# targets = ['Total_Quantity', 'Passengers', 'Q_Alimentos Charter', 'Q_Bebidas Calientes', 'Q_Botanas', 'Q_Galletas', 'Q_Lacteos', 'Q_Licores', 'Q_Perecederos', 'Q_Refrescos', 'Q_Sopas']
targets = ['Passengers']
# PUT HERE THE ARRAY OF FEATURES
# remove_features = targets + ['Flight_ID', 'Month', ]
# features = [item for item in list(flights.columns) if item not in remove_features]
# features = ['Capacity', 'DUR_HR','DepartureStation_E', 'ArrivalStation_E']
features = ['Capacity', 'Bookings', 'DUR_HR','DepartureStation_E', 'ArrivalStation_E']

# Splitting the dataset into features (X) and target variable (y)
# X = flights.drop(columns=targets)  # Drop the target column to get the features
X = flights[features]  # Drop the target column to get the features
y = flights[targets]  # Extract the target column

# Splitting the flights dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'X_train:{X_train.shape}, X_test:{X_test.shape}, y_train:{y_train.shape}, y_test:{y_test.shape}')
print(targets)
print(features)

X_train:(95712, 5), X_test:(23929, 5), y_train:(95712, 1), y_test:(23929, 1)
['Passengers']
['Capacity', 'Bookings', 'DUR_HR', 'DepartureStation_E', 'ArrivalStation_E']


# Linear regression

In [None]:
# REGULARIZATION TO REMOVE FEATURES

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Define numerical and categorical features
# numeric_features = ['Capacity', 'Bookings', 'STD-int', 'STA-int', 'DUR_HR', 'Total_Quantity', 'WeekoftheYear', 'Month', 'DUR_INTERVAL', 'DepartureStation_E', 'ArrivalStation_E', 'Destination_E', 'Origin_E', 'DayoftheWeek_E', 'Time_Interval_E', 'Week_Month_Label_E']
# numeric_features = features
numeric_features = ['Capacity', 'Bookings', 'DUR_HR','DepartureStation_E', 'ArrivalStation_E']
categorical_features = []

# Create preprocessing pipeline
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create linear regression model
m_lr = make_pipeline(preprocessor, LinearRegression())

# Fit the model
m_lr.fit(X_train, y_train)

# Predict on the test set
y_pred = m_lr.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Absolute Error: 17.123724352849642
Mean Squared Error: 491.0663940686215
R-squared: 0.6258080360453142


In [None]:
print("Accuracy:", m_lr.score(X_test, y_test))

Accuracy: 0.2984071458462537


# Regularization

In [None]:
from sklearn.linear_model import Ridge

clf = Ridge(alpha=1.0)
clf.fit(X,y)

clf.score(X_test, y_test)

0.2985379079542738

# Decision Tree

In [10]:
from sklearn import tree

# Create linear regression model
m_dt = make_pipeline(preprocessor, tree.DecisionTreeRegressor())

# Fit the model
m_dt.fit(X_train, y_train)

# Predict on the test set
y_pred = m_dt.predict(X_test)

# Evaluate the model
dt_mae = mean_absolute_error(y_test, y_pred)
dt_mse = mean_squared_error(y_test, y_pred)
dt_r2 = r2_score(y_test, y_pred)
dt_score = m_dt.score(X_test, y_test)

print("Mean Absolute Error:", dt_mae)
print("Mean Squared Error:", dt_mse)
print("R-squared:", dt_r2)
print("Score:", dt_score)

Mean Absolute Error: 13.041707025210203
Mean Squared Error: 370.68821063598784
R-squared: 0.7175360577955872
Score: 0.7175360577955872


# Random Forest

In [11]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor(max_depth = 12, random_state = 10)
forest.fit(X_train.to_numpy(), y_train.to_numpy().squeeze())

# Predict on the test set
y_pred = forest.predict(X_test.to_numpy())

forest_mae = mean_absolute_error(y_test, y_pred)
forest_mse = mean_squared_error(y_test, y_pred)
forest_r2 = r2_score(y_test, y_pred)
forest_score = forest.score(X_test.to_numpy(), y_test.to_numpy().squeeze())

print("Mean Absolute Error:", forest_mae)
print("Mean Squared Error:", forest_mse)
print("R-squared:", forest_r2)
print("Score:", forest_score)

Mean Absolute Error: 11.925854630636184
Mean Squared Error: 269.5808151608379
R-squared: 0.7945797637794721
Score: 0.7945797637794721


# XG-Boost

In [12]:
!pip install xgboost==2.0.3

Collecting xgboost==2.0.3
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [13]:
import xgboost as xg
from sklearn.ensemble import GradientBoostingRegressor

xgb_r = xg.XGBRegressor(objective="reg:linear", n_estimators=15, seed=10)

xgb_r.fit(X_train, y_train)

#boost = GradientBoostingRegressor()
#boost.fit(X_train.to_numpy(), y_train.to_numpy().squeeze())

# Predict on the test set
y_pred = xgb_r.predict(X_test.to_numpy())

boost_mae = mean_absolute_error(y_test, y_pred)
boost_mse = mean_squared_error(y_test, y_pred)
boost_r2 = r2_score(y_test, y_pred)
#boost_score = boost.score(X_test.to_numpy(), y_test.to_numpy().squeeze())

print("Mean Absolute Error:", boost_mae)
print("Mean Squared Error:", boost_mse)
print("R-squared:", boost_r2)
#print("Score:", boost_score)

Mean Absolute Error: 12.633890965364605
Mean Squared Error: 288.55309959849296
R-squared: 0.7801229073132546


# Neural Networks

### FNN

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define the FNN model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer with 1 neuron for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# After training, you can use the model to make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = np.mean((y_test - y_pred)**2)
print("Mean Squared Error:", mse)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean Squared Error: 411.55954906959767


In [None]:
print("Score:", r2_score(y_test, y_pred))


Score: 0.6258080360453142


# Quantity

### Split dataset for Quantity and select features and targets

In [None]:
# PUT HERE THE ARRAY OF TARGETS
targets_q  = ['Total_Quantity', 'Q_Alimentos Charter', 'Q_Bebidas Calientes', 'Q_Botanas', 'Q_Galletas', 'Q_Lacteos', 'Q_Licores', 'Q_Perecederos', 'Q_Refrescos', 'Q_Sopas']
# targets_q = ['Passengers']
# PUT HERE THE ARRAY OF FEATURES
# remove_features = targets + ['Flight_ID', 'Month', ]
# features = [item for item in list(flights.columns) if item not in remove_features]
features_q = ['Aeronave', 'Passengers', 'Capacity', 'Bookings', 'DUR_HR','DepartureStation_E', 'ArrivalStation_E']

# Splitting the dataset into features (X) and target variable (y)
# X = flights.drop(columns=targets)  # Drop the target column to get the features
XQ = flights[features_q]  # Drop the target column to get the features
yQ = flights[targets_q]  # Extract the target column

XQ = XQ[0:5000]
yQ = yQ[0:5000]

# Splitting the flights dataset into 80% training and 20% testing

XQ_train, XQ_test, yQ_train, yQ_test = train_test_split(XQ, yQ, test_size=0.2, random_state=42)

print(f'XQ_train:{XQ_train.shape}, XQ_test:{XQ_test.shape}, yQ_train:{yQ_train.shape}, yQ_test:{yQ_test.shape}')

XQ_train:(4000, 7), XQ_test:(1000, 7), yQ_train:(4000, 10), yQ_test:(1000, 10)


In [None]:
print(XQ_train.columns)

Index(['Aeronave', 'Passengers', 'Capacity', 'Bookings', 'DUR_HR',
       'DepartureStation_E', 'ArrivalStation_E'],
      dtype='object')


### LSTM

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization

# Normalize numerical variables
numerical_cols = ["Passengers", "Dur_HR"]
XQ_categorical = ["DepartureStation_E", "ArrivalStation_E"]
scaler = StandardScaler()
XQ_numerical = scaler.fit_transform(XQ[numerical_cols])

# Combine categorical and numerical features
XQ_processed = np.concatenate((XQ_numerical, XQ_categorical), axis=1)

# Build the LSTM model
m_lstm = Sequential([
    LSTM(units=64, input_shape=(XQ_train.shape[1], 1), return_sequences=True),
    BatchNormalization(),
    Dropout(0.2),
    LSTM(units=32),
    BatchNormalization(),
    Dropout(0.2),
    Dense(units=1)
])

# Compile the m_lstm
m_lstm.compile(optimizer='adam', loss='mse')

# Train the m_lstm
m_lstm.fit(XQ_train, yQ_train, epochs=2, batch_size=32, validation_data=(XQ_test, yQ_test))

# Evaluate the m_lstm
mse_lstm = m_lstm.evaluate(XQ_test, yQ_test)
print("Mean Squared Error:", mse_lstm)
# print("Score:", r2_score(yQ_test, y_pred))

KeyError: "['Dur_HR'] not in index"

In [None]:
yQ_test.loc[1501]

Total_Quantity         38.0
Q_Alimentos Charter     0.0
Q_Bebidas Calientes     0.0
Q_Botanas              10.0
Q_Galletas              0.0
Q_Lacteos               0.0
Q_Licores               2.0
Q_Perecederos           8.0
Q_Refrescos            16.0
Q_Sopas                 2.0
Name: 1501, dtype: float64

In [None]:
XQ_train[:,0].shape

(4000, 1)

In [None]:
m_lstm.predict(yQ_test.loc[1501])



array([[-0.24740773],
       [ 8.4677925 ],
       [ 8.4677925 ],
       [ 0.39990467],
       [ 8.4677925 ],
       [ 8.4677925 ],
       [ 5.3037148 ],
       [ 0.80096835],
       [ 0.03534359],
       [ 5.3037148 ]], dtype=float32)

In [None]:
yQ_test.loc[1501]

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=25b1adb0-2b47-474f-a7b7-c06d723a95f4' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>