In [44]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from torch.utils.data import DataLoader, TensorDataset

In [45]:
# Load training, validation, and test data
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_val = pd.read_csv('X_val.csv')
y_val = pd.read_csv('y_val.csv')
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

In [46]:
# Convert to numpy arrays
X_train_np = X_train.values
y_train_np = y_train.values.flatten()
X_val_np = X_val.values
y_val_np = y_val.values.flatten()
X_test_np = X_test.values
y_test_np = y_test.values.flatten()

In [47]:
# Feature selection using PCA to reduce features
n_components = 10 
pca = PCA(n_components=n_components)
X_train_reduced = pca.fit_transform(X_train_np)
X_val_reduced = pca.transform(X_val_np)
X_test_reduced = pca.transform(X_test_np)

In [48]:
# Reshape data to fit CNN input format (batch_size, channels, height, width)
X_train_reduced = X_train_reduced.reshape((X_train_reduced.shape[0], 1, 1, n_components))
X_val_reduced = X_val_reduced.reshape((X_val_reduced.shape[0], 1, 1, n_components))
X_test_reduced = X_test_reduced.reshape((X_test_reduced.shape[0], 1, 1, n_components))


In [49]:
# Define CNN model
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(1, 3))
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(1, 3))
        self.fc1 = nn.Linear(32 * 1 * (n_components - 4), 128)
        self.fc2 = nn.Linear(128, 1)
        self.dropout = nn.Dropout(p=0.5)
    
    def forward(self, x):
        x = self.conv1(x)
        x = nn.ReLU()(x)
        x = self.dropout(x)
        x = self.conv2(x)
        x = nn.ReLU()(x)
        x = x.view(x.size(0), -1) 
        x = self.fc1(x)
        x = nn.ReLU()(x)
        x = self.fc2(x)
        return x

In [50]:
# Initialize model, loss function, and optimizer
model = CNN()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00005, weight_decay=0.01)


In [51]:
# Convert data to PyTorch tensors with float32 dtype
X_train_tensor = torch.tensor(X_train_reduced, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_np, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_reduced, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_np, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_reduced, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_np, dtype=torch.float32)


In [52]:
# Create DataLoader for batch processing
batch_size = 1000 
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)


In [53]:
# Lists to log training loss and accuracy
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

In [54]:
# Training loop with batch processing
epochs = 200
for epoch in range(epochs):
    model.train()
    running_loss = 0.0  # Reset running loss for each epoch
    
    # Process each batch
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        
        # Forward pass
        predictions = model(batch_x).squeeze()  # Remove singleton dimension
        loss = criterion(predictions, batch_y)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()# Accumulate batch loss

    # Calculate the average loss for the epoch
    epoch_loss = (running_loss / len(train_loader))
    
    # Calculate accuracy and loss for validation set
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():  # No need to track gradients during evaluation
        y_val_pred = model(X_val_tensor).squeeze()
        val_loss = criterion(y_val_pred, y_val_tensor).item()
        val_mse = mean_squared_error(y_val_np, y_val_pred.numpy())
        val_r2 = r2_score(y_val_np, y_val_pred.numpy())  # R² score
        val_accuracy = val_r2 * 100  # Convert to percentage

        # Store logs for this epoch
        train_losses.append(epoch_loss)
        val_losses.append(val_loss)
        train_accuracies.append(val_accuracy)
        val_accuracies.append(val_accuracy)

    # Print results for each epoch
    print(f'Epoch [{epoch + 1}/{epochs}], Loss: {epoch_loss}')

Epoch [1/200], Loss: 0.16955502033233644
Epoch [2/200], Loss: 0.11873148500919342
Epoch [3/200], Loss: 0.07863275721669197
Epoch [4/200], Loss: 0.051488227993249897
Epoch [5/200], Loss: 0.039107536226511
Epoch [6/200], Loss: 0.0356776487827301
Epoch [7/200], Loss: 0.03436060011386871
Epoch [8/200], Loss: 0.0334552913159132
Epoch [9/200], Loss: 0.032321655601263044
Epoch [10/200], Loss: 0.031614232361316684
Epoch [11/200], Loss: 0.030620966926217078
Epoch [12/200], Loss: 0.029832707569003105
Epoch [13/200], Loss: 0.02877737008035183
Epoch [14/200], Loss: 0.027532709315419197
Epoch [15/200], Loss: 0.026804945468902587
Epoch [16/200], Loss: 0.02587109848856926
Epoch [17/200], Loss: 0.02503103069961071
Epoch [18/200], Loss: 0.0241388301551342
Epoch [19/200], Loss: 0.023387585058808325
Epoch [20/200], Loss: 0.02236979678273201
Epoch [21/200], Loss: 0.021804993748664857
Epoch [22/200], Loss: 0.02126714825630188
Epoch [23/200], Loss: 0.02040700040757656
Epoch [24/200], Loss: 0.019933243691921

In [55]:
# Save trained model and logs
torch.save({
    'model_state_dict': model.state_dict(),
    'train_losses': train_losses,
    'train_accuracies': train_accuracies,
    'val_losses': val_losses,
    'val_accuracies': val_accuracies
}, 'cnn_model.pth')

In [56]:
# Reshape the test data to match the CNN input format
X_test_reduced = X_test_reduced.reshape((X_test_reduced.shape[0], 1, 1, n_components))

# Convert the reshaped data to tensor
X_test_tensor = torch.tensor(X_test_reduced, dtype=torch.float32)

# Proceed with the test
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    y_test_pred = model(X_test_tensor).flatten().numpy()

    # Calculate MSE, MAE, RMSE, R² score, and MAPE
    test_mse = mean_squared_error(y_test.values, y_test_pred)
    test_r2 = r2_score(y_test.values, y_test_pred)
    test_mae = mean_absolute_error(y_test.values, y_test_pred)
    test_rmse = mean_squared_error(y_test.values, y_test_pred, squared=False)
    test_mape = np.mean(np.abs((y_test.values - y_test_pred) / y_test.values)) * 100

    # Calculate accuracy as R² score in percentage
    test_accuracy = test_r2 * 100

    print(f'Mean Squared Error on test data: {test_mse}')
    print(f'R² score (Accuracy) on test data: {test_r2}')
    print(f'Accuracy: {test_accuracy}%')
    print(f'Mean Absolute Error (MAE) on test data: {test_mae}')
    print(f'Root Mean Squared Error (RMSE) on test data: {test_rmse}')
    print(f'Mean Absolute Percentage Error (MAPE) on test data: {test_mape}%')




Mean Squared Error on test data: 0.0053165791610501585
R² score (Accuracy) on test data: 0.8644319494102973
Accuracy: 86.44319494102973%
Mean Absolute Error (MAE) on test data: 0.06011723784290252
Root Mean Squared Error (RMSE) on test data: 0.07291487612997884
Mean Absolute Percentage Error (MAPE) on test data: 81.37127959813154%
