In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import GloVe

In [None]:
# 1. Prepare your dataset
# Toy corpus: each row is [sentence, sentiment label], with one example
# for each of the three labels ("negative", "neutral", "positive").
data = [
    ["I have found the movie very boring", "negative"],
    ["I think it was OK", "neutral"],
    ["Did you see it? It was amazing!!", "positive"],
]

In [None]:
# Tokenization
# Build the tokenizer once, then pair every sentence's token list with its
# original (string) label.
tokenizer = get_tokenizer("basic_english")
tokenized_data = [
    (tokenizer(text), sentiment)
    for text, sentiment in data
]

In [None]:
# Encoding labels
# Fit the encoder on the string labels and keep the encoded result; the
# fitted encoder is reused later for its `classes_`.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(
    [sentiment for _text, sentiment in data]
)

In [None]:
# Split data (this step is trivial here because of the small size of the dataset)
# A fixed random_state makes the partition identical on every
# Restart Kernel -> Run All, so downstream outputs are reproducible.
train_data, test_data, train_labels, test_labels = train_test_split(
    tokenized_data, labels, test_size=0.1, random_state=42
)

In [None]:
# 2. Load GloVe embeddings
def load_glove_embeddings(dim=50):
    """Build and return the GloVe "6B" vectors object.

    Args:
        dim: value forwarded to ``GloVe`` as its ``dim`` argument.

    Returns:
        The ``GloVe`` instance created with ``name="6B"`` and the given ``dim``.
    """
    return GloVe(name="6B", dim=dim)


# Load the vectors once with the default dim (50); the dataset below reuses
# this object for every token lookup.
glove = load_glove_embeddings()

In [None]:
# 3. Create a Dataset and DataLoader
class SentimentDataset(Dataset):
    """Map-style dataset that turns tokenized sentences into stacked vectors.

    Args:
        data: indexable collection of ``(tokens, raw_label)`` pairs. Only the
            tokens are used; the raw label is ignored in favour of ``labels``.
        labels: indexable collection of labels aligned with ``data`` by index.
        glove: object supporting ``glove[token]`` lookup that returns one
            tensor per token.
    """

    def __init__(self, data, labels, glove):
        self.data = data
        self.labels = labels
        self.glove = glove

    def __len__(self):
        """Return the number of samples, taken from ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(embeddings, label)`` for the sample at ``idx``.

        ``embeddings`` stacks the per-token vectors along a new leading
        dimension, one entry per token. Every token is looked up as-is: no
        filtering of unknown tokens is done here, so the result depends on
        what ``glove[token]`` returns for them.

        Raises:
            RuntimeError: from ``torch.stack`` when the sample has no tokens.
        """
        # The pair's second element is the raw label; the label actually
        # returned comes from ``self.labels``.
        tokens, _ = self.data[idx]
        label = self.labels[idx]
        token_vectors = [self.glove[token] for token in tokens]
        return torch.stack(token_vectors), label

In [None]:
# Wrap the training split in the dataset, then serve it one shuffled sample
# at a time.
train_dataset = SentimentDataset(
    data=train_data,
    labels=train_labels,
    glove=glove,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=1,
    shuffle=True,
)

In [None]:
# Display the training split: (tokens, label) pairs selected by the split above.
train_data

In [None]:
# 4. Build the Model
class LSTMClassifier(nn.Module):
    """Single-layer LSTM whose final hidden state feeds a linear classifier.

    Args:
        embedding_dim: size of each input vector fed to the LSTM.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of values produced by the final linear layer.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Run ``x`` through the LSTM and classify from the final hidden state."""
        _sequence_out, (final_hidden, _final_cell) = self.lstm(x)
        # The LSTM is built with one layer and one direction, so the leading
        # axis of the hidden state has size 1; index it away before the
        # linear layer.
        return self.fc(final_hidden[0])

In [None]:
# Model instance
# The input size is read from the loaded vectors rather than hard-coded, so
# it stays correct if the embeddings are loaded with a different dimension.
model = LSTMClassifier(
    embedding_dim=glove.dim,
    hidden_dim=128,
    output_dim=len(label_encoder.classes_),
)

In [None]:
# 5. Training
# Loss and optimizer
# Cross-entropy over the model outputs; Adam over all model parameters with
# its default settings.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters())

In [None]:
# Training loop
NUM_EPOCHS = 10  # Assuming 10 epochs for simplicity

model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    num_batches = 0
    for embeddings, label in train_loader:
        # Forward pass
        outputs = model(embeddings)
        # The DataLoader already yields the labels as a batch; convert that
        # batch to long directly instead of wrapping it in a new list, which
        # only works while the batch holds exactly one label.
        targets = torch.as_tensor(label, dtype=torch.long)
        loss = criterion(outputs, targets)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch rather than whichever shuffled
    # batch happened to come last; max(..., 1) keeps an empty loader from
    # dividing by zero.
    mean_loss = running_loss / max(num_batches, 1)
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {mean_loss:.4f}")

# Note: For a real project, you would also implement validation during training and evaluation on a test set.

In [None]:
# Sanity check: print the shape of every embeddings batch the loader yields.
for embeddings, label in train_loader:
    print(embeddings.shape)