<a href="https://colab.research.google.com/github/Rosie-Chenyr/Advanced-H1B-LCA-Data-Analysis-Project/blob/joy/class03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Autoencoder with GPU
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Pick the compute device: CUDA when PyTorch reports a usable GPU, otherwise the CPU
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Toy corpus: eight short, lower-case movie-review sentences.
# Their order defines the row order of every matrix/tensor built from them.
sentences = [
    "i loved this movie",
    "the acting was terrible",
    "great performances by the cast",
    "i fell asleep during the film",
    "this film is a masterpiece",
    "the special effects were amazing",
    "worst movie i have seen",
    "the soundtrack was beautiful",
]

In [None]:
# Build a binary bag-of-words matrix: one row per sentence, one column per
# vocabulary term, holding 1 where the term occurs in the sentence and 0 otherwise.
# NOTE: CountVectorizer's default token pattern only keeps tokens of two or more
# characters, so one-letter words such as "i" and "a" never enter the vocabulary.
vectorizer = CountVectorizer(binary=True)
bow_sparse = vectorizer.fit_transform(sentences)
X = bow_sparse.toarray()
vocab_size = X.shape[1]  # number of columns == number of distinct vocabulary terms
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 26


In [None]:
# Convert the bag-of-words matrix to a float32 tensor on the selected device.
# torch.tensor(...) with an explicit dtype/device replaces the legacy
# torch.FloatTensor(...) constructor followed by a separate .to(device) copy.
X_tensor = torch.tensor(X, dtype=torch.float32, device=device)

In [None]:
# Define a simple autoencoder
class TextAutoencoder(nn.Module):
    """Autoencoder with one linear encoder layer and one linear decoder layer.

    The encoder maps an ``input_dim``-sized vector to ``encoding_dim`` values
    (passed through ReLU); the decoder maps that code back to ``input_dim``
    values (passed through a sigmoid, so every output lies in [0, 1]).
    """

    def __init__(self, input_dim, encoding_dim):
        """Create the encoder and decoder layers.

        Args:
            input_dim: Size of each input (and reconstructed) vector.
            encoding_dim: Size of the compressed representation.
        """
        super().__init__()
        self.encoder = nn.Linear(input_dim, encoding_dim)
        self.decoder = nn.Linear(encoding_dim, input_dim)

    def forward(self, x):
        """Encode ``x`` and reconstruct it.

        Returns:
            A ``(decoded, encoded)`` tuple: the sigmoid-activated
            reconstruction and the ReLU-activated code it was built from.
        """
        code = self.encoder(x).relu()
        reconstruction = self.decoder(code).sigmoid()
        return reconstruction, code

In [None]:
# Initialize the model and move it to the device
SEED = 42  # fixed seed so the random weight initialisation is the same on every run
torch.manual_seed(SEED)  # seed before the layers are created; makes this cell idempotent
input_dim = vocab_size  # one input unit per vocabulary term
encoding_dim = 5 # Compressed representation size
model = TextAutoencoder(input_dim, encoding_dim).to(device)

In [None]:
# Loss and optimizer.
# Binary cross-entropy compares the model's sigmoid outputs (values in [0, 1])
# element-wise against the 0/1 bag-of-words targets.
LEARNING_RATE = 0.01  # Adam step size
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Training loop (full batch: every sentence is fed through the model at each step,
# on whichever device the model and X_tensor were placed)
num_epochs = 100
log_every = 20  # print the loss once every `log_every` epochs

# Explicitly enter training mode so that re-running this cell after the
# evaluation cell (which calls model.eval()) does not train in eval mode.
model.train()
for epoch in range(num_epochs):
    # Forward pass: the input is also the reconstruction target
    reconstructed, encoded = model(X_tensor)
    loss = criterion(reconstructed, X_tensor)

    # Backward pass and optimize (clear stale gradients first; they accumulate otherwise)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print progress
    if (epoch + 1) % log_every == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [20/100], Loss: 0.5352
Epoch [40/100], Loss: 0.3684
Epoch [60/100], Loss: 0.2792
Epoch [80/100], Loss: 0.2077
Epoch [100/100], Loss: 0.1502


In [None]:
# Test the autoencoder
model.eval()
with torch.no_grad():
    # Inference only: no gradients need to be tracked for this forward pass
    reconstructed, encoded_data = model(X_tensor)

# Move data back to CPU for processing
reconstructed = reconstructed.cpu()
encoded_data = encoded_data.cpu()
X_tensor_cpu = X_tensor.cpu()

# Fetch the vocabulary once; it does not change between lookups
feature_names = vectorizer.get_feature_names_out()

# Print original and reconstructed text
print("\nOriginal vs Reconstructed:")
for i, sentence in enumerate(sentences):
    print(f"\nOriginal: {sentence}")

    # Vocabulary columns that are set in the original bag-of-words row
    original_indices = X_tensor_cpu[i].nonzero().flatten().tolist()

    # Get reconstructed words (top N where N is number of words in original);
    # topk returns them ordered from highest to lowest reconstruction score
    n_words = len(original_indices)
    _, top_indices = torch.topk(reconstructed[i], n_words)
    reconstructed_words = [feature_names[idx] for idx in top_indices.tolist()]

    print(f"Reconstructed words: {', '.join(reconstructed_words)}")

# Print encoded representation (width is read from the data rather than hard-coded)
print(f"\nEncoded representations ({encoded_data.shape[1]}-dimensional):")
for sentence, code in zip(sentences, encoded_data):
    print(f"{sentence}: {code.numpy()}")


Original vs Reconstructed:

Original: i loved this movie
Reconstructed words: movie, this, loved

Original: the acting was terrible
Reconstructed words: the, was, terrible, acting

Original: great performances by the cast
Reconstructed words: the, great, cast, performances, by

Original: i fell asleep during the film
Reconstructed words: film, this, the, masterpiece, fell

Original: this film is a masterpiece
Reconstructed words: film, this, is, masterpiece

Original: the special effects were amazing
Reconstructed words: the, special, were, amazing, effects

Original: worst movie i have seen
Reconstructed words: movie, worst, have, the

Original: the soundtrack was beautiful
Reconstructed words: the, was, beautiful, soundtrack

Encoded representations (5-dimensional):
i loved this movie: [0.0000000e+00 2.7190549e+00 0.0000000e+00 8.0972910e-04 1.9842306e+00]
the acting was terrible: [0.        2.334205  3.3679435 3.349973  0.       ]
great performances by the cast: [0.         0.16954