In [1]:
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
class FTIRDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label = self.data.iloc[idx, 2]  # assuming label is in the third column
        features = torch.tensor(self.data.iloc[idx, :2].values, dtype=torch.float32)
        return features, label

In [None]:
# Load the combined dataset from disk.
# NOTE(review): the directory name is spelled 'processsed' (three s's) —
# confirm this matches the folder on disk.
full_df = pd.read_csv('../data/processsed/combined_dataset.csv')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
train_data, test_data = train_test_split(full_df, test_size=0.2, random_state=42)

# Wrap each split so rows can be served through a DataLoader.
train_dataset, test_dataset = (
    FTIRDataset(split) for split in (train_data, test_data)
)

# One row per batch: every row is handled as its own sample.
batch_size = 1

# Only the training loader reshuffles between epochs.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Define the RNN model
class FTIRRNN(nn.Module):
    """LSTM followed by a linear head applied to the final time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        output_size: Number of values produced by the linear head.
        num_layers: Number of stacked LSTM layers (default 1).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        # batch_first=True: inputs and outputs are laid out (batch, seq, feature).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run ``x`` through the LSTM and project the last step's output.

        Args:
            x: Batched input of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        sequence_output, _state = self.lstm(x)
        # Keep only the final time step of every sequence.
        final_step = sequence_output[:, -1, :]
        return self.fc(final_step)

In [None]:
# Source spectra files; only their count is used below (one class per file).
files = [
    'Blood_and_RNA_1mg_and_Lys_1mg.CSV',
    'Lys_1_mg_Blood_Raw.CSV',
    'Lys_10_ng_Blood_Raw.CSV',
    'RNA_1_mg_and_Blood_Raw.CSV',
    'RNA_10_ng_and_Blood_Raw.CSV',
]

# Model dimensions.
input_size = 2             # two features per row: wavenumber and transmittance
hidden_size = 64           # LSTM hidden-state width
output_size = len(files)   # one output logit per file / class

# Build the network.
model = FTIRRNN(input_size=input_size, hidden_size=hidden_size, output_size=output_size)

# Multi-class objective and Adam optimiser.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Materialise the training split as tensors: float32 features and int64
# class codes derived from the categorical encoding of the 'label' column.
X_train_tensor = torch.tensor(
    train_data.loc[:, ['wavenumber', 'transmittance']].values,
    dtype=torch.float32,
)
y_train_tensor = torch.tensor(
    train_data['label'].astype('category').cat.codes.values,
    dtype=torch.long,
)

In [None]:
num_epochs = 50

# Each row (wavenumber, transmittance) is its own sample with its own label,
# so feed the batch-first LSTM a (num_samples, seq_len=1, num_features)
# tensor. unsqueeze(1) inserts the length-1 sequence axis; unsqueeze(0) would
# instead create ONE sequence of length num_samples, yielding a single
# prediction that cannot be matched against num_samples labels.
train_inputs = X_train_tensor.unsqueeze(1)

for epoch in range(num_epochs):
    # --- optimisation step (full-batch) ---
    model.train()
    optimizer.zero_grad()
    outputs = model(train_inputs)            # (num_samples, num_classes) logits
    # CrossEntropyLoss expects class-index targets of shape (num_samples,).
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    # --- monitoring pass ---
    # NOTE: this re-scores the TRAINING data (no held-out tensors exist in
    # this cell), so the figure printed below is training accuracy.
    model.eval()
    with torch.no_grad():
        test_outputs = model(train_inputs)
        _, predicted_labels = torch.max(test_outputs, 1)

    # Fraction of rows whose arg-max class equals the encoded label.
    accuracy = (predicted_labels == y_train_tensor).sum().item() / len(y_train_tensor)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}')