<a href="https://colab.research.google.com/github/Rosie-Chenyr/Advanced-H1B-LCA-Data-Analysis-Project/blob/main/class04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# RNN with GPU
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time

In [None]:
# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Sample sentences
sentences = [
    "i loved this movie",
    "the acting was terrible",
    "great performances by the cast",
    "i fell asleep during the film",
    "this film is a masterpiece",
    "the special effects were amazing",
    "worst movie i have seen",
    "the soundtrack was beautiful"
]

In [None]:
# Create a simple vocabulary
all_words = []
for sentence in sentences:
    all_words.extend(sentence.split())

vocab = sorted(set(all_words))
word2idx = {word: i for i, word in enumerate(vocab)}
idx2word = {i: word for i, word in enumerate(vocab)}
vocab_size = len(vocab)

print(f"Vocabulary size: {vocab_size}")
print(f"Vocabulary: {vocab}")

Vocabulary size: 28
Vocabulary: ['a', 'acting', 'amazing', 'asleep', 'beautiful', 'by', 'cast', 'during', 'effects', 'fell', 'film', 'great', 'have', 'i', 'is', 'loved', 'masterpiece', 'movie', 'performances', 'seen', 'soundtrack', 'special', 'terrible', 'the', 'this', 'was', 'were', 'worst']


In [None]:
# Convert sentences to sequences
def sentence_to_indices(sentence, word2idx):
  return [word2idx[word] for word in sentence.split()]

sequences = [sentence_to_indices(sentence, word2idx) for sentence in sentences]
print(sequences)
max_length = max(len(seq) for seq in sequences)

[[13, 15, 24, 17], [23, 1, 25, 22], [11, 18, 5, 23, 6], [13, 9, 3, 7, 23, 10], [24, 10, 14, 0, 16], [23, 21, 8, 26, 2], [27, 17, 13, 12, 19], [23, 20, 25, 4]]


In [None]:
# Pad sequences
def pad_sequence(seq, max_length):
  return seq + [0] * (max_length - len(seq)) # Using 0 as padding

padded_sequences = [pad_sequence(seq, max_length) for seq in sequences]
print(padded_sequences)
X = torch.LongTensor(padded_sequences).to(device)
print(X)

[[13, 15, 24, 17, 0, 0], [23, 1, 25, 22, 0, 0], [11, 18, 5, 23, 6, 0], [13, 9, 3, 7, 23, 10], [24, 10, 14, 0, 16, 0], [23, 21, 8, 26, 2, 0], [27, 17, 13, 12, 19, 0], [23, 20, 25, 4, 0, 0]]
tensor([[13, 15, 24, 17,  0,  0],
        [23,  1, 25, 22,  0,  0],
        [11, 18,  5, 23,  6,  0],
        [13,  9,  3,  7, 23, 10],
        [24, 10, 14,  0, 16,  0],
        [23, 21,  8, 26,  2,  0],
        [27, 17, 13, 12, 19,  0],
        [23, 20, 25,  4,  0,  0]], device='cuda:0')


In [None]:
# Convert to one-hot encoding
def to_one_hot(X, vocab_size):
    one_hot = torch.zeros(X.size(0), X.size(1), vocab_size, device=device)
    for i in range(X.size(0)):
        for j in range(X.size(1)):
            one_hot[i, j, X[i, j]] = 1
    return one_hot

X_one_hot = to_one_hot(X, vocab_size)
print(X_one_hot)

tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 1., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 1., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0

In [None]:
# Define RNN autoencoder
class RNNAutoencoder(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(RNNAutoencoder, self).__init__()
        self.hidden_size = hidden_size

        # Encoder
        self.encoder = nn.RNN(vocab_size, hidden_size, batch_first=True)

        # Decoder
        self.decoder = nn.RNN(hidden_size, hidden_size, batch_first=True)

        # Output layer
        self.output_layer = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        batch_size = x.size(0)
        seq_len = x.size(1)

        # Encoder
        _, hidden = self.encoder(x)

        # Create decoder inpyt (repeat hidden state for each time step)
        decoder_input = hidden.permute(1, 0, 2).repeat(1, seq_len, 1)

        # Decode
        outputs, _ = self.decoder(decoder_input)

        # Project to vocabulary space
        outputs = self.output_layer(outputs)

        return outputs, hidden.squeeze(0)

In [None]:
# Initialize the model and move to GPU
hidden_size = 10 # Size of the hidden/encoded representation
model = RNNAutoencoder(vocab_size, hidden_size).to(device)

In [None]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Training loop
num_epochs = 1000
for epoch in range(num_epochs):
    # Make sure model is in training mode
    model.train()

    # Forward pass
    outputs, encoded = model(X_one_hot)

    # Reshape for cross entropy loss
    outputs = outputs.view(-1, vocab_size)
    targets = X.view(-1)

    # Calculate loss
    loss = criterion(outputs, targets)

    # Backward pass and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print progress
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [100/1000], Loss: 0.0098
Epoch [200/1000], Loss: 0.0080
Epoch [300/1000], Loss: 0.0066
Epoch [400/1000], Loss: 0.0056
Epoch [500/1000], Loss: 0.0047
Epoch [600/1000], Loss: 0.0041
Epoch [700/1000], Loss: 0.0035
Epoch [800/1000], Loss: 0.0031
Epoch [900/1000], Loss: 0.0027
Epoch [1000/1000], Loss: 0.0024


In [None]:
# Test the autoencoder
model.eval()  # Important: set to evaluation mode
with torch.no_grad():
    outputs, encoded_data = model(X_one_hot)

    # Find the most likely word at each position
    _, predicted_indices = torch.max(outputs, dim=2)

    # Move data back to CPU for processing
    predicted_indices = predicted_indices.cpu()
    encoded_data = encoded_data.cpu()

    # Print original and reconstructed texts
    print("\nOriginal vs Reconstructed:")
    for i in range(len(sentences)):
        original = sentences[i]

        reconstructed_words = []
        for j in range(len(predicted_indices[i])):
            idx = predicted_indices[i][j].item()
            if idx in idx2word:
                reconstructed_words.append(idx2word[idx])

        reconstructed = ' '.join(reconstructed_words)

        print(f"Original: {original}")
        print(f"Reconstructed: {reconstructed}")
        print()

    # Print encoded representations
    print("\nEncoded representations (10-dimensional):")
    for i, sentence in enumerate(sentences):
        print(f"{sentence}: {encoded_data[i].numpy()}")


Original vs Reconstructed:
Original: i loved this movie
Reconstructed: i loved this movie a a

Original: the acting was terrible
Reconstructed: the acting was terrible a a

Original: great performances by the cast
Reconstructed: great performances by the cast a

Original: i fell asleep during the film
Reconstructed: i fell asleep during the film

Original: this film is a masterpiece
Reconstructed: this film is a masterpiece a

Original: the special effects were amazing
Reconstructed: the special effects were amazing a

Original: worst movie i have seen
Reconstructed: worst movie i have seen a

Original: the soundtrack was beautiful
Reconstructed: the soundtrack was beautiful a a


Encoded representations (10-dimensional):
i loved this movie: [-0.47268057  0.97325027 -0.99186295 -0.76696897  0.8042304   0.96708065
  0.89488065  0.11419757  0.96373004 -0.492016  ]
the acting was terrible: [-0.95551366  0.86767507 -0.9756296  -0.9938029  -0.09841057  0.7844355
 -0.51801544  0.97147757  0