In [None]:
# timport các thư viện cần thiết để xử lsi dữ liệu, trực quan hóa, huấn luyện mô hình, API
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import mean_absolute_error, r2_score


In [None]:
# Kiểm tra và sử dụng GPU trong huấn luyện
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [None]:
# Đẩy dữ liệu file CSV vào bằng pandas. dưới dạng dataFrame
df_raw = pd.read_csv('Air_Traffic_Passenger_Statistics.csv')
# Tạo bản sao trên dữ liệu gốc, đảm bảo an toàn.
df = df_raw.copy()

In [None]:
# xem 3 cột đầu tiên.
df.head(3)

Unnamed: 0,index,Activity Period,Operating Airline,Operating Airline IATA Code,Published Airline,Published Airline IATA Code,GEO Summary,GEO Region,Activity Type Code,Price Category Code,Terminal,Boarding Area,Passenger Count,Adjusted Activity Type Code,Adjusted Passenger Count,Year,Month
0,0,200507,ATA Airlines,TZ,ATA Airlines,TZ,Domestic,US,Deplaned,Low Fare,Terminal 1,B,27271,Deplaned,27271,2005,July
1,1,200507,ATA Airlines,TZ,ATA Airlines,TZ,Domestic,US,Enplaned,Low Fare,Terminal 1,B,29131,Enplaned,29131,2005,July
2,2,200507,ATA Airlines,TZ,ATA Airlines,TZ,Domestic,US,Thru / Transit,Low Fare,Terminal 1,B,5415,Thru / Transit * 2,10830,2005,July


In [None]:
# chuyển Tháng sang năm và thêm vào dataframe
month_map = {
    "January": 1, "February": 2, "March": 3,
    "April": 4, "May": 5, "June": 6,
    "July": 7, "August": 8, "September": 9,
    "October": 10, "November": 11, "December": 12
}

df['Month_number'] = df['Month'].map(month_map)

In [None]:
# chuyển Month qua sin và cos để thể hiện tính chu kì. để mô hình hiểu tháng 12 gần tháng 1 chẳng hạn.
df['Month_sin'] = np.sin(2 * np.pi * df['Month_number'] / 12)
df['Month_cos'] = np.cos(2 * np.pi * df['Month_number'] / 12)

In [None]:
# danh sách tên các cột mà cần category nhằm mã hóa để huấn luyện mô hình
cat_cols = ["Operating Airline", "GEO Region", "Activity Type Code", "Terminal"]
# mã hóa catrgory từng cột. tạo cột mới thêm vào dataframe và lưu unique vào dictionaries
num_categories = {}
for col in cat_cols:
    df[col] = df[col].astype("category")
    df[col + "_code"] = df[col].cat.codes
    num_categories[col] = len(df[col].cat.categories)

print(num_categories)

num_airlines = num_categories["Operating Airline"]
num_regions = num_categories["GEO Region"]
num_activities = num_categories["Activity Type Code"]
num_terminals = num_categories["Terminal"]


{'Operating Airline': 77, 'GEO Region': 9, 'Activity Type Code': 3, 'Terminal': 5}


In [7]:
df.head()

Unnamed: 0,index,Activity Period,Operating Airline,Operating Airline IATA Code,Published Airline,Published Airline IATA Code,GEO Summary,GEO Region,Activity Type Code,Price Category Code,...,Adjusted Passenger Count,Year,Month,Month_number,Month_sin,Month_cos,Operating Airline_code,GEO Region_code,Activity Type Code_code,Terminal_code
0,0,200507,ATA Airlines,TZ,ATA Airlines,TZ,Domestic,US,Deplaned,Low Fare,...,27271,2005,July,7,-0.5,-0.866025,0,8,0,2
1,1,200507,ATA Airlines,TZ,ATA Airlines,TZ,Domestic,US,Enplaned,Low Fare,...,29131,2005,July,7,-0.5,-0.866025,0,8,1,2
2,2,200507,ATA Airlines,TZ,ATA Airlines,TZ,Domestic,US,Thru / Transit,Low Fare,...,10830,2005,July,7,-0.5,-0.866025,0,8,2,2
3,3,200507,Air Canada,AC,Air Canada,AC,International,Canada,Deplaned,Other,...,35156,2005,July,7,-0.5,-0.866025,4,2,0,2
4,4,200507,Air Canada,AC,Air Canada,AC,International,Canada,Enplaned,Other,...,34090,2005,July,7,-0.5,-0.866025,4,2,1,2


In [None]:
# đặc trưng đầu vào
FEATURE_INPUT = [
    "Year", "Month_sin", "Month_cos", "Adjusted Passenger Count",
    "Operating Airline_code", "GEO Region_code", "Activity Type Code_code", "Terminal_code"
]
# biến mục tiêu
TARGET = "Adjusted Passenger Count"


In [None]:
# tạo cửa sổ tuần tự cho mô hình học.

# chuẩn hóa  các cột có gía trị số
num_features = ["Year", "Month_sin", "Month_cos", "Adjusted Passenger Count"]

# dùng Standar Scaler để chuẩn hóa
scaler_X = StandardScaler()
X_num_scaled = scaler_X.fit_transform(df[num_features])

# các cột mã hóa category
cat_cols = ["Operating Airline", "GEO Region", "Activity Type Code", "Terminal"]

num_categories = {}
X_cat_codes = []
for col in cat_cols:
    df[col] = df[col].astype("category")
    codes = df[col].cat.codes.values
    X_cat_codes.append(codes.reshape(-1, 1)) # type: ignore
    num_categories[col] = len(df[col].cat.categories)

X_cat_codes = np.hstack(X_cat_codes)

# ghép X_cat và X_num lại thành X đầu vào
X_all = np.hstack([X_num_scaled, X_cat_codes])

# chuẩn hóa biến mục tiêu y
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(df["Passenger Count"].values.reshape(-1, 1)).flatten() # type: ignore


# tạo chuỗi tuần tự để huấn luyện mô hình.
def build_sequences(X, y, seq_len=24):
    X_seq, y_seq = [], []
    for i in range(len(X) - seq_len):
        X_seq.append(X[i:i+seq_len])
        y_seq.append(y[i+seq_len])
    return np.array(X_seq), np.array(y_seq)

X_seq, y_seq = build_sequences(X_all, y_scaled, seq_len=24)

# in cỡ của ma trận
print("X_seq:", X_seq.shape) 
print("y_seq:", y_seq.shape)  


X_seq: (14983, 24, 8)
y_seq: (14983,)


In [None]:
# cấu hình kiến trúc mạng nơ ron GRU
# 3 lớp, mỗi lớp có 256 nút, dropout 0.2 để tránh quá khớp, đầu ra là Linear với 1 node
class PassengerGRU(nn.Module):
    def __init__(self, num_airlines, num_regions, num_activities, num_terminals,
                 input_dim_numeric, hidden_size=256, num_layers=3, dropout=0.2):
        super().__init__()

        self.airline_embed = nn.Embedding(num_airlines, 32)
        self.region_embed = nn.Embedding(num_regions, 16)
        self.activity_embed = nn.Embedding(num_activities, 8)
        self.terminal_embed = nn.Embedding(num_terminals, 4)
        
        self.numeric_fc = nn.Linear(input_dim_numeric, 32)
        
        total_dim = 32 + 32 + 16 + 8 + 4
        
        self.gru = nn.GRU(total_dim, hidden_size, num_layers,
                          batch_first=True, dropout=dropout)
        
        self.fc_out = nn.Linear(hidden_size, 1)

    # lan truyền tiến
    def forward(self, x_num, x_airline, x_region, x_activity, x_terminal):
        num_feat = self.numeric_fc(x_num)
        airline_feat = self.airline_embed(x_airline)
        region_feat = self.region_embed(x_region)
        activity_feat = self.activity_embed(x_activity)
        terminal_feat = self.terminal_embed(x_terminal)

        x = torch.cat([num_feat, airline_feat, region_feat,
                       activity_feat, terminal_feat], dim=-1)
        
        out, _ = self.gru(x)  
        
        context = out[:, -1, :]  
        return self.fc_out(context).squeeze(-1)


In [None]:
# chia tập 80 20 để huấn luyện và kiểm tra
X_train, X_val, y_train, y_val = train_test_split(
    X_seq, y_seq, test_size=0.2, random_state=42, shuffle=True
)

# đưa vào cấu hình tensor 
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
X_val_tensor   = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor   = torch.tensor(y_val, dtype=torch.float32).to(device)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset   = TensorDataset(X_val_tensor, y_val_tensor)

# batch size = 64
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=64, shuffle=False)

# gọi model và dùng device tức GPU để huấn luyện
model = PassengerGRU(
    num_airlines, num_regions, num_activities, num_terminals, input_dim_numeric=4
).to(device)

In [None]:
best_val_loss = float("inf")
# dùng MSE loss
loss_fn = nn.MSELoss()
# dùng tối ưu ADAM với learning rate là 10^-3
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# huấn luyện 40 qua toàn bộ train và kiểm trên test
EPOCHS = 40

for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0.0

    train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Train]", leave=False)
    for xb, yb in train_bar:
        x_num = xb[:, :, :4].to(device)
        x_airline = xb[:, :, 4].long().to(device)
        x_region = xb[:, :, 5].long().to(device)
        x_activity = xb[:, :, 6].long().to(device)
        x_terminal = xb[:, :, 7].long().to(device)
        yb = yb.to(device)

        preds = model(x_num, x_airline, x_region, x_activity, x_terminal)
        loss = loss_fn(preds, yb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        train_bar.set_postfix({"batch_loss": f"{loss.item():.4f}"})

    avg_train_loss = total_train_loss / len(train_loader)


    model.eval()
    total_val_loss = 0.0
    all_preds, all_true = [], []

    val_bar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Val]", leave=False)
    with torch.no_grad():
        for xb, yb in val_bar:
            x_num = xb[:, :, :4].to(device)
            x_airline = xb[:, :, 4].long().to(device)
            x_region = xb[:, :, 5].long().to(device)
            x_activity = xb[:, :, 6].long().to(device)
            x_terminal = xb[:, :, 7].long().to(device)
            yb = yb.to(device)

            preds = model(x_num, x_airline, x_region, x_activity, x_terminal)
            loss = loss_fn(preds, yb)

            total_val_loss += loss.item()
            all_preds.append(preds.cpu().numpy())
            all_true.append(yb.cpu().numpy())

            val_bar.set_postfix({"batch_loss": f"{loss.item():.4f}"})

    avg_val_loss = total_val_loss / len(val_loader)

    all_preds = np.concatenate(all_preds)
    all_true = np.concatenate(all_true)
    rmse_scaled = np.sqrt(avg_val_loss)

    rmse_original = rmse_scaled * scaler_y.scale_[0]# type: ignore

    mae_original = mean_absolute_error(
        scaler_y.inverse_transform(all_true.reshape(-1, 1)),
        scaler_y.inverse_transform(all_preds.reshape(-1, 1))
    )

    r2 = r2_score(
        scaler_y.inverse_transform(all_true.reshape(-1, 1)),
        scaler_y.inverse_transform(all_preds.reshape(-1, 1))
    )

    # in các metrics đánh giá
    print(
        f"Epoch {epoch+1}/{EPOCHS} | "
        f"Train Loss: {avg_train_loss:.4f} | "
        f"Val Loss: {avg_val_loss:.4f} | "
        f"RMSE: {rmse_original:.2f} | "
        f"MAE: {mae_original:.2f} | "
        f"R²: {r2:.4f}"
    )
    # lưu mô hình tốt nhất.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "gru_passenger_best.pt")
        print(f"Saved new best model at epoch {epoch+1}, Val Loss: {avg_val_loss:.4f}")
    

                                                                                         

Epoch 1/40 | Train Loss: 0.0358 | Val Loss: 0.0642 | RMSE: 14772.68 | MAE: 5823.42 | R²: 0.9344
Saved new best model at epoch 1, Val Loss: 0.0642


                                                                                         

Epoch 2/40 | Train Loss: 0.0380 | Val Loss: 0.0636 | RMSE: 14705.41 | MAE: 5452.78 | R²: 0.9348
Saved new best model at epoch 2, Val Loss: 0.0636


                                                                                         

Epoch 3/40 | Train Loss: 0.0339 | Val Loss: 0.0619 | RMSE: 14506.50 | MAE: 5820.77 | R²: 0.9377
Saved new best model at epoch 3, Val Loss: 0.0619


                                                                                         

Epoch 4/40 | Train Loss: 0.0324 | Val Loss: 0.0674 | RMSE: 15134.54 | MAE: 5626.36 | R²: 0.9323


                                                                                         

Epoch 5/40 | Train Loss: 0.0327 | Val Loss: 0.0835 | RMSE: 16849.50 | MAE: 6079.38 | R²: 0.9145


                                                                                         

Epoch 6/40 | Train Loss: 0.0415 | Val Loss: 0.0535 | RMSE: 13491.34 | MAE: 5984.98 | R²: 0.9459
Saved new best model at epoch 6, Val Loss: 0.0535


                                                                                         

Epoch 7/40 | Train Loss: 0.0392 | Val Loss: 0.0591 | RMSE: 14177.40 | MAE: 5763.47 | R²: 0.9402


                                                                                         

Epoch 8/40 | Train Loss: 0.0291 | Val Loss: 0.0852 | RMSE: 17025.11 | MAE: 6448.79 | R²: 0.9130


                                                                                         

Epoch 9/40 | Train Loss: 0.0369 | Val Loss: 0.0655 | RMSE: 14923.93 | MAE: 5557.69 | R²: 0.9336


                                                                                          

Epoch 10/40 | Train Loss: 0.0249 | Val Loss: 0.0722 | RMSE: 15665.54 | MAE: 5313.21 | R²: 0.9271


                                                                                          

Epoch 11/40 | Train Loss: 0.0225 | Val Loss: 0.0737 | RMSE: 15830.77 | MAE: 5168.42 | R²: 0.9256


                                                                                          

Epoch 12/40 | Train Loss: 0.0217 | Val Loss: 0.0781 | RMSE: 16301.96 | MAE: 5338.18 | R²: 0.9204


                                                                                          

Epoch 13/40 | Train Loss: 0.0562 | Val Loss: 0.1112 | RMSE: 19445.55 | MAE: 7605.36 | R²: 0.8860


                                                                                          

Epoch 14/40 | Train Loss: 0.0585 | Val Loss: 0.0667 | RMSE: 15058.59 | MAE: 6113.15 | R²: 0.9325


                                                                                          

Epoch 15/40 | Train Loss: 0.0398 | Val Loss: 0.0619 | RMSE: 14511.08 | MAE: 5585.31 | R²: 0.9376


                                                                                          

Epoch 16/40 | Train Loss: 0.0255 | Val Loss: 0.0788 | RMSE: 16370.01 | MAE: 5792.00 | R²: 0.9203


                                                                                          

Epoch 17/40 | Train Loss: 0.0208 | Val Loss: 0.0541 | RMSE: 13561.15 | MAE: 4933.59 | R²: 0.9458


                                                                                          

Epoch 18/40 | Train Loss: 0.0208 | Val Loss: 0.0643 | RMSE: 14791.01 | MAE: 5164.53 | R²: 0.9352


                                                                                          

Epoch 19/40 | Train Loss: 0.0235 | Val Loss: 0.0648 | RMSE: 14843.87 | MAE: 6662.59 | R²: 0.9348


                                                                                          

Epoch 20/40 | Train Loss: 0.0196 | Val Loss: 0.0584 | RMSE: 14093.14 | MAE: 4761.51 | R²: 0.9413


                                                                                          

Epoch 21/40 | Train Loss: 0.0169 | Val Loss: 0.0596 | RMSE: 14238.60 | MAE: 4809.05 | R²: 0.9402


                                                                                          

Epoch 22/40 | Train Loss: 0.0159 | Val Loss: 0.0699 | RMSE: 15419.94 | MAE: 4616.24 | R²: 0.9296


                                                                                          

Epoch 23/40 | Train Loss: 0.0172 | Val Loss: 0.0702 | RMSE: 15451.49 | MAE: 4917.77 | R²: 0.9292


                                                                                          

Epoch 24/40 | Train Loss: 0.0151 | Val Loss: 0.0715 | RMSE: 15599.03 | MAE: 4866.14 | R²: 0.9279


                                                                                          

Epoch 25/40 | Train Loss: 0.0143 | Val Loss: 0.0651 | RMSE: 14874.79 | MAE: 4978.26 | R²: 0.9346


                                                                                          

Epoch 26/40 | Train Loss: 0.0171 | Val Loss: 0.0762 | RMSE: 16096.31 | MAE: 5257.68 | R²: 0.9230


                                                                                          

Epoch 27/40 | Train Loss: 0.0198 | Val Loss: 0.0737 | RMSE: 15836.19 | MAE: 5299.05 | R²: 0.9257


                                                                                          

Epoch 28/40 | Train Loss: 0.0161 | Val Loss: 0.0704 | RMSE: 15476.92 | MAE: 5080.60 | R²: 0.9291


                                                                                          

Epoch 29/40 | Train Loss: 0.0191 | Val Loss: 0.0642 | RMSE: 14780.63 | MAE: 5001.57 | R²: 0.9350


                                                                                          

Epoch 30/40 | Train Loss: 0.0241 | Val Loss: 0.0685 | RMSE: 15260.03 | MAE: 5271.54 | R²: 0.9306


                                                                                          

Epoch 31/40 | Train Loss: 0.0284 | Val Loss: 0.0673 | RMSE: 15132.80 | MAE: 5587.82 | R²: 0.9313


                                                                                          

Epoch 32/40 | Train Loss: 0.0326 | Val Loss: 0.0702 | RMSE: 15449.90 | MAE: 6514.54 | R²: 0.9292


                                                                                          

Epoch 33/40 | Train Loss: 0.0265 | Val Loss: 0.0626 | RMSE: 14585.29 | MAE: 5477.38 | R²: 0.9372


                                                                                          

Epoch 34/40 | Train Loss: 0.0264 | Val Loss: 0.0690 | RMSE: 15322.98 | MAE: 5486.78 | R²: 0.9306


                                                                                          

Epoch 35/40 | Train Loss: 0.0185 | Val Loss: 0.0594 | RMSE: 14211.70 | MAE: 4783.96 | R²: 0.9402


                                                                                          

Epoch 36/40 | Train Loss: 0.0195 | Val Loss: 0.0615 | RMSE: 14461.01 | MAE: 4972.62 | R²: 0.9380


                                                                                          

Epoch 37/40 | Train Loss: 0.0208 | Val Loss: 0.0576 | RMSE: 14001.18 | MAE: 5490.53 | R²: 0.9418


                                                                                          

Epoch 38/40 | Train Loss: 0.0267 | Val Loss: 0.0745 | RMSE: 15919.62 | MAE: 5577.15 | R²: 0.9250


                                                                                          

Epoch 39/40 | Train Loss: 0.0262 | Val Loss: 0.0504 | RMSE: 13089.48 | MAE: 4995.36 | R²: 0.9498
Saved new best model at epoch 39, Val Loss: 0.0504


                                                                                          

Epoch 40/40 | Train Loss: 0.0257 | Val Loss: 0.0555 | RMSE: 13737.84 | MAE: 4783.14 | R²: 0.9445




In [None]:
# lưu mean và std chuân quá lấy từ Standar Scaler
joblib.dump(scaler_X, "scaler_X.pkl")
joblib.dump(scaler_y, "scaler_y.pkl")

['scaler_y.pkl']