In [18]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.parallel
import pandas as pd
#from ray import tune
#from ray.tune import grid_search, uniform
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import argparse
import scipy.io
import os


In [19]:
import boto3
import io
import scipy.io

# Location of the MATLAB data file in S3
bucket_name = 'sagemaker-studio-12bcc0mjktib'
file_key = '2LVSI_scaled_extra.mat'

# Create a Boto3 S3 client
s3_client = boto3.client('s3')

# Fetch the object and read its whole body into memory as bytes
response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
file_content = response['Body'].read()

# Parse the .mat payload from an in-memory buffer (no temporary file needed)
mat_data = scipy.io.loadmat(io.BytesIO(file_content))

# Fail early with a clear message when an expected variable is absent.
# dict.get() would return None here and the problem would only surface in the
# slicing below as "'NoneType' object is not subscriptable".
missing_keys = [key for key in ('input_scaled', 'output_scaled') if key not in mat_data]
if missing_keys:
    raise KeyError(f"{file_key} is missing expected variable(s): {missing_keys}")
input_scaled = mat_data['input_scaled']
output_scaled = mat_data['output_scaled']

# Keep every row but drop the last 6 columns of each array.
# NOTE(review): the reason these 6 trailing columns are excluded is not visible
# in this notebook - confirm against the data description.
input_subset = input_scaled[:, :-6]
output_subset = output_scaled[:, :-6]

# Verify the shape of the resulting subsets
print(input_subset.shape)
print(output_subset.shape)


(3201204, 31)
(3201204, 25)


In [20]:
class MLP(nn.Module):
    """Fully connected network whose hidden width halves from layer to layer.

    Hidden layer ``i`` has ``num_neurons // 2**i`` units. Each hidden layer is
    followed by batch normalisation, dropout and ReLU, in that order. A final
    linear layer maps to ``output_size`` values that are passed through a
    sigmoid.

    Args:
        input_size: Number of input features.
        output_size: Number of output values.
        dropout: Dropout probability used after every hidden layer.
        num_layers: Number of hidden layers.
        num_neurons: Width of the first hidden layer.
    """

    def __init__(self, input_size, output_size, dropout=0.005, num_layers=3, num_neurons=2048):
        super().__init__()
        self.num_layers = num_layers
        self.layers = nn.ModuleList()
        self.bn_layers = nn.ModuleList()
        self.activation_fn = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        # Chain the hidden layers: each one consumes the previous layer's width.
        fan_in = input_size
        for depth in range(num_layers):
            fan_out = num_neurons // (2 ** depth)
            self.layers.append(nn.Linear(fan_in, fan_out))
            self.bn_layers.append(nn.BatchNorm1d(fan_out))
            fan_in = fan_out

        # The head reads the width of the last hidden layer.
        self.output_layer = nn.Linear(num_neurons // (2 ** (num_layers - 1)), output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run ``x`` through all hidden layers and the sigmoid output head."""
        hidden = x
        for linear, batch_norm in zip(self.layers, self.bn_layers):
            hidden = self.activation_fn(self.dropout(batch_norm(linear(hidden))))
        return self.sigmoid(self.output_layer(hidden))

In [21]:
# Pick the compute device: CUDA when available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Wrap the NumPy subsets as torch tensors
output_data = torch.Tensor(output_subset)
input_data = torch.Tensor(input_subset)

# First split: test_size=0.7 holds out 70% of the samples, so 30% remain for
# training.
train_data, test_data, train_target, test_target = train_test_split(
    input_data, output_data, random_state=42, test_size=0.7
)

# Second split: the held-out 70% is divided in half, giving validation and
# test sets of 35% of all samples each. This rebinds test_data / test_target.
val_data, test_data, val_target, test_target = train_test_split(
    test_data, test_target, random_state=42, test_size=0.5
)

# Pair inputs with targets for each split
train_dataset = TensorDataset(train_data, train_target)
val_dataset = TensorDataset(val_data, val_target)
test_dataset = TensorDataset(test_data, test_target)

# Batch the splits; only the training loader shuffles its samples
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

cuda


In [27]:
# Layer dimensions are taken from the data tensors: one input unit per input
# column and one output unit per output column.
input_size = input_data.shape[1]
output_size = output_data.shape[1]
print(input_data.shape[0])
# Create an instance of the MLP model on the selected device
model = MLP(input_size, output_size).to(device)
#model = MLP1(input_size, output_size).to(device)

# Initial bookkeeping values
test_accuracies = []
best_test_accuracy = 0.0
best_val_loss = float('inf')

# Read the dropout probability from the model that is actually trained.
# Building a second MLP just to inspect its dropout layer allocates a whole
# extra network, consumes random numbers, and would report the wrong value if
# `model` were ever created with a non-default dropout.
get_dropout = model.dropout
dropout_probability = get_dropout.p

# Print the dropout probability
print(dropout_probability)


# Training hyperparameters and labels used later in the saved model's filename
lr = 0.006
string_loss = "MSE"
string_optimizer = "ADAM"
num_epochs = 1000
# Per-epoch loss histories filled in by the training loop
train_losses = []
val_losses = []

3201204
Dropout value: 0002


In [28]:
# Loss function: mean squared error, averaged over all elements
criterion = nn.MSELoss(reduction='mean')

# Optimizer: Adam over all model parameters with learning rate `lr`.
# Alternative optimizers kept for reference:
#optimizer = optim.RMSprop(model.parameters(), lr)
#optimizer = optim.Adamax(model.parameters(), lr)
#optimizer =  optim.SGD(model.parameters(), lr)
optimizer = optim.Adam(params=model.parameters(), lr=lr)

def msle_loss(output, target):
    """Mean squared logarithmic error between ``output`` and ``target``.

    Returns the mean over all elements of
    ``(log(1 + output) - log(1 + target)) ** 2``.

    ``log1p`` is used instead of ``log(x + 1)`` so that values close to zero
    keep their precision, and the squared error is computed directly rather
    than through the notebook-level ``criterion`` so the result stays an MSLE
    even if ``criterion`` is rebound to another loss.

    Args:
        output: Predicted values (tensor).
        target: Reference values (tensor broadcastable with ``output``).

    Returns:
        Scalar tensor holding the loss.

    Note:
        Take care of negative values: entries <= -1 make the logarithm
        undefined and produce NaN or infinite losses.
    """
    log_output = torch.log1p(output)
    log_target = torch.log1p(target)
    return nn.functional.mse_loss(log_output, log_target)

# Snapshot of the best weights seen so far; stays None until some epoch's
# validation loss improves on best_val_loss.
best_model_state = None

for epoch in range(num_epochs):
    train_loss = 0.0
    val_loss = 0.0

    # Training: one optimizer step per mini-batch
    model.train()
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        #loss =  msle_loss(outputs, labels)
        loss =  criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Average of the per-batch losses for this epoch
    train_loss /= len(train_loader)
    train_losses.append(train_loss)

    # Validation: eval mode and no gradient tracking
    model.eval()
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            #loss =  msle_loss(outputs, labels)
            loss =  criterion(outputs, labels)
            val_loss += loss.item()

    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")

    # Check if the current validation loss is better than the previous best
    if val_loss < best_val_loss:
        # Update the best validation loss
        best_val_loss = val_loss
        # state_dict() hands back references to the live parameter and buffer
        # tensors, which later optimizer steps overwrite in place. Clone them
        # so the snapshot really keeps this epoch's weights.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

# Load the best model state for evaluation
if best_model_state is not None:
    model.load_state_dict(best_model_state)
else:
    # No epoch improved on best_val_loss (e.g. num_epochs == 0 or NaN losses)
    print("No improved validation loss was recorded; keeping the current model weights.")

Epoch [1/15], Train Loss: 0.000372, Val Loss: 0.000058
Epoch [2/15], Train Loss: 0.000096, Val Loss: 0.000035
Epoch [3/15], Train Loss: 0.000067, Val Loss: 0.000029
Epoch [4/15], Train Loss: 0.000055, Val Loss: 0.000028
Epoch [5/15], Train Loss: 0.000047, Val Loss: 0.000023
Epoch [6/15], Train Loss: 0.000041, Val Loss: 0.000015
Epoch [7/15], Train Loss: 0.000038, Val Loss: 0.000016
Epoch [8/15], Train Loss: 0.000035, Val Loss: 0.000017
Epoch [9/15], Train Loss: 0.000031, Val Loss: 0.000013
Epoch [10/15], Train Loss: 0.000029, Val Loss: 0.000016
Epoch [11/15], Train Loss: 0.000027, Val Loss: 0.000016
Epoch [12/15], Train Loss: 0.000026, Val Loss: 0.000016
Epoch [13/15], Train Loss: 0.000024, Val Loss: 0.000009
Epoch [14/15], Train Loss: 0.000022, Val Loss: 0.000010
Epoch [15/15], Train Loss: 0.000021, Val Loss: 0.000010


In [29]:
# RMSE (root mean squared error) on the test set.
# It is the square root of the mean squared difference between predictions and
# targets; squaring makes large errors count more than they do in the MAE.
model.eval()
squared_error_sum = 0.0
element_count = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        # Accumulate the raw sum and the element count instead of per-batch
        # means, so a smaller final batch is not over-weighted.
        squared_error_sum += torch.sum((outputs - labels) ** 2).item()
        element_count += labels.numel()

# Mean squared error over every element of the test set
test_loss = squared_error_sum / element_count
rmse = test_loss ** 0.5  # Calculate the square root of the MSE
print(f"Test RMSE: {rmse:.4f}")
#print(f"Test MSE: {test_loss:.4f}")

Test RMSE: 0.0032


In [30]:
# MAE (mean absolute error) on the test set.
# It is the average absolute difference between the predicted values and the
# original values.
model.eval()
abs_error_sum = 0.0
element_count = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        # Accumulate the raw sum and the element count instead of per-batch
        # means, so a smaller final batch is not over-weighted.
        abs_error_sum += torch.sum(torch.abs(outputs - labels)).item()
        element_count += labels.numel()

# Mean absolute error over every element of the test set
test_mae = abs_error_sum / element_count
print(f"Test MAE: {test_mae:.4f}")

Test MAE: 0.0014


In [31]:
# R^2 score on the test set.
# An R^2 of 1 means the predictions match the actual values exactly.
# This part gathers the model outputs and the matching targets for the whole
# test loader as NumPy arrays.
model.eval()
predictions, targets = [], []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)

        # Move each batch back to the CPU before converting it to NumPy
        predictions += [outputs.cpu().numpy()]
        targets += [labels.cpu().numpy()]

# Stack the per-batch arrays along the sample axis
predictions = np.concatenate(predictions)
targets = np.concatenate(targets)

# Adjusted R2 score
def adjusted_r2_score(r2, n_samples, n_features):
    """Return the adjusted R^2 for a given R^2, sample count and feature count.

    Computes ``1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)``.

    Args:
        r2: Plain R^2 score.
        n_samples: Number of samples the score was computed on.
        n_features: Number of input features of the model.

    Returns:
        The adjusted R^2 value.

    Raises:
        ZeroDivisionError: If ``n_samples - n_features - 1`` is zero.
    """
    numerator = (1 - r2) * (n_samples - 1)
    denominator = n_samples - n_features - 1
    return 1 - numerator / denominator

# Sample count comes from the stacked test targets. The feature count is read
# from the full input tensor rather than from `inputs`, which is only a loop
# variable left over from the last evaluation batch.
n_samples = targets.shape[0]
n_features = input_data.shape[1]

# Calculate R2 score
r2 = r2_score(targets, predictions)

# Calculate Adjusted R2 score
adj_r2 = adjusted_r2_score(r2, n_samples, n_features)

# Print both R2 and Adjusted R2 scores
print(f"R^2 Score: {r2:.6f}")
print(f"Adjusted R^2 Score: {adj_r2:.6f}")

# Build a filename that records the score and the training configuration
string = "best_model_r_square_{0}_epoch_{1}_loss_{2}_optimizer_{3}_lr_{4}_dropout_{5}_data_size_{6}.pt".format(
round(r2,5), num_epochs, string_loss, string_optimizer, lr, dropout_probability, input_data.shape[0])
print(string)
# Specify the S3 bucket name
bucket_name = 'sagemaker-studio-12bcc0mjktib'

# Create a Boto3 S3 client (only needed for the optional upload below)
s3_client = boto3.client('s3')

# Save the model weights under that filename (a relative path)
torch.save(model.state_dict(), string)
#s3_client.upload_file(string, bucket_name, os.path.basename(string))

R^2 Score: 0.997369
Adjusted R^2 Score: 0.997368
best_model_r_square_0.99737_epoch_15_loss_MSE_optimizer_ADAM_lr_0.0004_dropout_0002_data_size_3201204.pt
