# XGBoost

In [13]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load and preprocess data
# Read the dataset from the project-local CSV file.
data = pd.read_csv("./Data/Beijing.csv")
# Forward-fill missing values from the preceding row. DataFrame.ffill() is the
# supported spelling; fillna(method='ffill') is deprecated and emits a
# FutureWarning on recent pandas releases.
data = data.ffill()
# Replace the 'wd' column with its integer category codes.
data['wd'] = data['wd'].astype('category').cat.codes

# Feature engineering: add the TEMP x WSPM product as an interaction column,
# then pick the model inputs and the PM2.5 target.
data['TEMP_WSPM'] = data['TEMP'] * data['WSPM']
X = data[[
    'year', 'month', 'hour',
    'TEMP', 'PRES', 'RAIN', 'WSPM',
    'PM10', 'SO2', 'NO2',
    'TEMP_WSPM',
]]
y = data['PM2.5']

# Keep a reproducible 30% sample of the rows; the same index labels are used
# for X and y so features and targets stay aligned.
sampled_indices = data.sample(frac=0.3, random_state=42).index
X, y = X.loc[sampled_indices], y.loc[sampled_indices]

# Shuffled 70% / 15% / 15% split: first carve off 30% as a hold-out set,
# then halve that hold-out into validation and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test,
    test_size=0.5,
    random_state=42,
    shuffle=True,
)

# Build the XGBoost regressor. No device or tree-method override is passed,
# so the library defaults apply for those settings.
model = xgb.XGBRegressor(
    random_state=42,       # fixed seed for reproducible row/column subsampling
    n_estimators=500,      # number of boosting rounds
    learning_rate=0.05,    # shrinkage applied to each round
    max_depth=6,           # maximum depth of each tree
    subsample=0.8,         # fraction of rows sampled per tree
    colsample_bytree=0.8,  # fraction of columns sampled per tree
)

# Fit on the training split; the validation split is only monitored, and its
# metric is logged every 10 rounds.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=10,
)

# Evaluate model on the held-out test split
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# MAPE is undefined where the true value is zero (division by zero would turn
# the whole mean into inf/nan), so only non-zero targets contribute to it.
y_true = np.asarray(y_test, dtype=float)
nonzero = y_true != 0
if nonzero.any():
    mape = np.mean(
        np.abs((y_true[nonzero] - np.asarray(y_pred)[nonzero]) / y_true[nonzero])
    ) * 100
else:
    # No usable targets at all: report NaN instead of raising or warning.
    mape = float("nan")

r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"MAPE: {mape:.2f}")
print(f"R2: {r2}")


  data = data.fillna(method='ffill')


[0]	validation_0-rmse:78.31265
[10]	validation_0-rmse:54.71882
[20]	validation_0-rmse:42.51139
[30]	validation_0-rmse:35.18561
[40]	validation_0-rmse:31.52476
[50]	validation_0-rmse:29.70322
[60]	validation_0-rmse:28.89862
[70]	validation_0-rmse:28.15364
[80]	validation_0-rmse:27.71232
[90]	validation_0-rmse:27.36113
[100]	validation_0-rmse:27.13001
[110]	validation_0-rmse:26.94603
[120]	validation_0-rmse:26.72149
[130]	validation_0-rmse:26.61334
[140]	validation_0-rmse:26.47257
[150]	validation_0-rmse:26.36110
[160]	validation_0-rmse:26.26798
[170]	validation_0-rmse:26.14803
[180]	validation_0-rmse:26.07114
[190]	validation_0-rmse:26.01803
[200]	validation_0-rmse:25.93136
[210]	validation_0-rmse:25.88121
[220]	validation_0-rmse:25.83121
[230]	validation_0-rmse:25.78642
[240]	validation_0-rmse:25.74845
[250]	validation_0-rmse:25.72412
[260]	validation_0-rmse:25.69843
[270]	validation_0-rmse:25.64865
[280]	validation_0-rmse:25.60134
[290]	validation_0-rmse:25.54766
[300]	validation_0-rm

# GRU

In [14]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Pick the compute device: CUDA when PyTorch reports it as available,
# otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Load and preprocess data
data = pd.read_csv("./Data/Beijing.csv")
# Forward-fill missing values. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling, which emits a FutureWarning.
data = data.ffill()
# Replace the 'wd' column with its integer category codes.
data['wd'] = data['wd'].astype('category').cat.codes

# Feature engineering
data['TEMP_WSPM'] = data['TEMP'] * data['WSPM']  # Interaction feature
data['TEMP_SQ'] = data['TEMP'] ** 2  # Polynomial feature
data['RAIN_LOG'] = np.log1p(data['RAIN'])  # Log-transform

# Select features and target
features = ['year', 'month', 'hour', 'TEMP', 'PRES', 'RAIN', 'WSPM', 'PM10', 'SO2', 'NO2', 'TEMP_WSPM', 'TEMP_SQ', 'RAIN_LOG']
X = data[features].values
y = data['PM2.5'].values

# Normalize features
# Fit the scalers on the leading 70% of rows only. The split further down
# assigns the first 70% of the sequences to training, so fitting on the whole
# array would leak min/max statistics from the validation and test portions
# into the training inputs. Every row is still transformed; values in the
# later portions may therefore fall slightly outside [0, 1].
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
scaler_fit_rows = int(0.7 * len(X))
scaler_X.fit(X[:scaler_fit_rows])
scaler_y.fit(y[:scaler_fit_rows].reshape(-1, 1))
X = scaler_X.transform(X)
y = scaler_y.transform(y.reshape(-1, 1))

# Convert to sequences
def create_sequences(data, target, seq_length):
    """Pair every sliding window of ``data`` with the target right after it.

    Args:
        data: Sliceable sequence of input rows.
        target: Indexable sequence of targets, aligned row-for-row with ``data``.
        seq_length: Number of consecutive rows in each window.

    Returns:
        A list of ``(window, next_target)`` tuples, where ``window`` is
        ``data[i:i + seq_length]`` and ``next_target`` is
        ``target[i + seq_length]``. The list is empty when ``data`` has no
        more than ``seq_length`` rows.
    """
    window_count = len(data) - seq_length
    return [
        (data[start:start + seq_length], target[start + seq_length])
        for start in range(window_count)
    ]

# Each input window spans 24 consecutive rows
# (presumably 24 hours of hourly records -- confirm against the CSV).
SEQ_LENGTH = 24
sequences = create_sequences(X, y, SEQ_LENGTH)

# Train-test split: positional 70% / 15% / remainder, order preserved
# (no shuffling at this stage).
train_size = int(0.7 * len(sequences))
val_size = int(0.15 * len(sequences))
test_size = len(sequences) - train_size - val_size

train_data, val_data, test_data = (
    sequences[:train_size],
    sequences[train_size:train_size + val_size],
    sequences[train_size + val_size:],
)

# Create PyTorch Dataset and DataLoader
class TimeSeriesDataset(Dataset):
    """Dataset view over a list of ``(window, target)`` pairs.

    Each item is converted to a pair of float32 tensors on access.
    """

    def __init__(self, sequences):
        # Keep a reference to the caller's list; nothing is copied up front.
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        window, label = self.sequences[idx]
        window_tensor = torch.tensor(window, dtype=torch.float32)
        label_tensor = torch.tensor(label, dtype=torch.float32)
        return window_tensor, label_tensor

# Batches of 64 for every split; only the training loader reshuffles its
# samples, validation and test keep their original order.
train_loader = DataLoader(
    dataset=TimeSeriesDataset(train_data),
    batch_size=64,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=TimeSeriesDataset(val_data),
    batch_size=64,
    shuffle=False,
)
test_loader = DataLoader(
    dataset=TimeSeriesDataset(test_data),
    batch_size=64,
    shuffle=False,
)

# Define GRU Model
class GRUModel(nn.Module):
    """Stacked GRU followed by a linear layer applied to the final time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each GRU layer's hidden state.
        output_size: Number of values produced by the linear layer.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, time, features).
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the linear projection of the GRU output at the last time step."""
        sequence_out, _ = self.gru(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

# Model, loss function, optimizer
INPUT_SIZE = len(features)  # one input per selected feature column
HIDDEN_SIZE, OUTPUT_SIZE, NUM_LAYERS = 64, 1, 2

model = GRUModel(
    input_size=INPUT_SIZE,
    hidden_size=HIDDEN_SIZE,
    output_size=OUTPUT_SIZE,
    num_layers=NUM_LAYERS,
)
model = model.to(device)  # move the parameters onto the selected device

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    """Train ``model`` and print the mean train/validation loss after each epoch.

    Batches are moved to the device that holds the model's parameters, so the
    function does not depend on any module-level device variable.

    Args:
        model: The ``nn.Module`` to optimize in place.
        train_loader: Sized iterable of ``(inputs, targets)`` training batches.
        val_loader: Sized iterable of ``(inputs, targets)`` validation batches.
        criterion: Callable returning a scalar loss tensor from
            ``(outputs, targets)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        A list with one ``(mean_train_loss, mean_val_loss)`` tuple per epoch.
        A mean is NaN when the corresponding loader yields no batches.
    """
    # Follow the model's own device; fall back to CPU for parameter-less models.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    history = []
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(run_device), batch_y.to(run_device)

            # Forward pass
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation loss: evaluation mode, no gradient tracking
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(run_device), batch_y.to(run_device)
                outputs = model(batch_x)
                val_loss += criterion(outputs, batch_y).item()

        # Per-batch averages; an empty loader yields NaN rather than a
        # ZeroDivisionError.
        train_batches, val_batches = len(train_loader), len(val_loader)
        mean_train = train_loss / train_batches if train_batches else float("nan")
        mean_val = val_loss / val_batches if val_batches else float("nan")

        print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {mean_train:.4f}, Val Loss: {mean_val:.4f}")
        history.append((mean_train, mean_val))

    return history

# Train the model for 20 epochs
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20)

# Evaluate on test set: accumulate the per-batch loss and collect the
# per-batch outputs and targets as NumPy arrays.
model.eval()
test_loss = 0
predictions, actuals = [], []
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        outputs = model(batch_x)
        test_loss += criterion(outputs, batch_y).item()
        predictions.append(outputs.cpu().numpy())
        actuals.append(batch_y.cpu().numpy())

print(f"Test Loss: {test_loss/len(test_loader):.4f}")

# Stack the batches into single columns and undo the target scaling so the
# metrics below are expressed in the target's original units.
predictions = scaler_y.inverse_transform(np.concatenate(predictions).reshape(-1, 1))
actuals = scaler_y.inverse_transform(np.concatenate(actuals).reshape(-1, 1))

# Compute evaluation metrics
rmse = np.sqrt(mean_squared_error(actuals, predictions))
mae = mean_absolute_error(actuals, predictions)
r2 = r2_score(actuals, predictions)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")


ModuleNotFoundError: No module named 'torch'