In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import time
from torch.utils.data import DataLoader, TensorDataset, random_split

# =====================
# 1️⃣ Load Encoded Data (Modified for CNN)
# =====================
def load_encoded_data(one_hot_csv, train_ratio=0.8, input_size=7098):
    one_hot_data = pd.read_csv(one_hot_csv, header=None).values.astype(np.float32)
    num_samples = one_hot_data.shape[0]

    # Reshape for CNN: (batch, channels, length)
    one_hot_data = one_hot_data.reshape((num_samples, 1, input_size))  # Adding channel dimension

    X = torch.tensor(one_hot_data, dtype=torch.float32)
    dataset = TensorDataset(X, X)

    train_size = max(1, int(train_ratio * len(dataset)))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

    print(f"Total samples: {len(dataset)}, Train: {train_size}, Test: {test_size}")
    return train_dataset, test_dataset

# =====================
# 2️⃣ CNN Autoencoder (Fixed Dimensions)
# =====================
class CNNEncoder(nn.Module):
    def __init__(self, input_length=7098, latent_dim=200):
        super(CNNEncoder, self).__init__()

        # Calculate the output size after convolutions
        self.conv1 = nn.Conv1d(1, 64, kernel_size=5, stride=2, padding=2)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=5, stride=2, padding=2)
        self.conv3 = nn.Conv1d(128, 256, kernel_size=5, stride=2, padding=2)
        self.conv4 = nn.Conv1d(256, 512, kernel_size=5, stride=2, padding=2)

        # Calculate the flattened size
        self.flattened_size = self._get_conv_output(input_length)
        self.fc = nn.Linear(self.flattened_size, latent_dim)

        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(64)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(256)
        self.bn4 = nn.BatchNorm1d(512)

    def _get_conv_output(self, length):
        # Helper function to calculate the output size after convolutions
        with torch.no_grad():
            dummy = torch.zeros(1, 1, length)
            dummy = self.conv1(dummy)
            dummy = self.conv2(dummy)
            dummy = self.conv3(dummy)
            dummy = self.conv4(dummy)
            return dummy.numel()  # Total elements after flattening

    def forward(self, x):
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.conv2(x)))
        x = self.relu(self.bn3(self.conv3(x)))
        x = self.relu(self.bn4(self.conv4(x)))
        x = x.view(x.size(0), -1)  # Flatten
        x = self.fc(x)
        return x

class CNNDecoder(nn.Module):
    def __init__(self, output_length=7098, latent_dim=200):
        super(CNNDecoder, self).__init__()
        self.output_length = output_length

        # Calculate the unflattened size (must match encoder's last conv output)
        self.conv_output_size = 512 * ((output_length // 16) + 1)  # Adjusted calculation

        self.fc = nn.Linear(latent_dim, self.conv_output_size)

        self.deconv1 = nn.ConvTranspose1d(512, 256, kernel_size=5, stride=2, padding=2, output_padding=1)
        self.deconv2 = nn.ConvTranspose1d(256, 128, kernel_size=5, stride=2, padding=2, output_padding=1)
        self.deconv3 = nn.ConvTranspose1d(128, 64, kernel_size=5, stride=2, padding=2, output_padding=1)
        self.deconv4 = nn.ConvTranspose1d(64, 1, kernel_size=5, stride=2, padding=2, output_padding=1)

        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(256)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(64)

    def forward(self, x):
        x = self.fc(x)
        x = x.view(x.size(0), 512, -1)  # Reshape to match encoder's last conv output

        x = self.relu(self.bn1(self.deconv1(x)))
        x = self.relu(self.bn2(self.deconv2(x)))
        x = self.relu(self.bn3(self.deconv3(x)))
        x = self.deconv4(x)

        # Ensure output has the correct length
        if x.size(2) > self.output_length:
            x = x[:, :, :self.output_length]
        elif x.size(2) < self.output_length:
            padding = torch.zeros(x.size(0), 1, self.output_length - x.size(2), device=x.device)
            x = torch.cat([x, padding], dim=2)
        return x

class CNNAutoencoder(nn.Module):
    def __init__(self, input_length=7098, latent_dim=200):
        super(CNNAutoencoder, self).__init__()
        self.encoder = CNNEncoder(input_length, latent_dim)
        self.decoder = CNNDecoder(input_length, latent_dim)

    def forward(self, x):
        encoded = self.encoder(x)
        reconstructed = self.decoder(encoded)
        return reconstructed

# =====================
# 3️⃣ Training (Same as before)
# =====================
def train_model(model, train_dataset, epochs=10, batch_size=4, learning_rate=0.15):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-5)
    criterion = nn.MSELoss()

    model.train()
    start_time = time.time()

    for epoch in range(epochs):
        epoch_start_time = time.time()
        total_loss = 0.0
        for X, _ in train_loader:
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, X)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.6f} - Time: {time.time() - epoch_start_time:.2f}s")

    end_time = time.time()
    accuracy = (1-loss)*100
    print(f"Total Loss: {loss:.2f}")
    print(f"Accuracy: {accuracy:.2f} %")
    print(f"Total Training Time: {end_time - start_time:.2f} seconds")

# =====================
# 4️⃣ Run Training with CNN
# =====================
one_hot_csv = "basepaper_encoded.csv"
train_dataset, test_dataset = load_encoded_data(one_hot_csv, input_size=7098)
model = CNNAutoencoder(input_length=7098)

train_model(model, train_dataset, epochs=10, batch_size=4)

Total samples: 189, Train: 151, Test: 38
Epoch 1/10 - Loss: 31.854653 - Time: 48.53s
Epoch 2/10 - Loss: 4.351855 - Time: 44.44s
Epoch 3/10 - Loss: 2.729854 - Time: 43.99s
Epoch 4/10 - Loss: 1.198484 - Time: 44.65s
Epoch 5/10 - Loss: 1.090384 - Time: 44.48s
Epoch 6/10 - Loss: 1.063669 - Time: 44.02s
Epoch 7/10 - Loss: 0.963861 - Time: 44.48s
Epoch 8/10 - Loss: 0.962477 - Time: 43.78s
Epoch 9/10 - Loss: 0.890112 - Time: 44.39s
Epoch 10/10 - Loss: 0.933416 - Time: 45.12s
Total Loss: 0.02
Accuracy: 98.35 %
Total Training Time: 447.88 seconds


In [1]:
!ls

basepaper_encoded.csv  sample_data
