In [2]:
import torch
import torch.nn as nn
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

# Pick the GPU only when one is actually usable. `is_available` has to be
# *called*: the bare function object is always truthy, which would select
# 'cuda' even on a CPU-only machine.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Load the dataset. Later cells read its 'review' (text) and 'sentiment'
# (label) columns.
# NOTE(review): the file is named "test.csv" but is used for both the train
# and the test split below -- confirm this is the intended file.
df = pd.read_csv("test.csv")

In [4]:
def tokenize(texts):
    """Lower-case every text and split it on whitespace.

    Args:
        texts: iterable of strings.

    Returns:
        A list with one list of lower-cased tokens per input text.
    """
    token_lists = []
    for raw_text in texts:
        token_lists.append(raw_text.lower().split())
    return token_lists

In [5]:
# Whitespace-tokenize every review text (one token list per row of `df`).
tokenized_reviews = tokenize(df['review'])

In [6]:
def build_vocab(tokenized_texts):
    """Map every distinct token to an integer id.

    Ids are assigned in order of first appearance, starting at 2; ids 0 and 1
    are reserved for the special '<PAD>' and '<UNK>' entries.

    Args:
        tokenized_texts: iterable of token sequences.

    Returns:
        dict mapping token -> id, including '<PAD>' (0) and '<UNK>' (1).
    """
    # Counter keeps keys in first-seen order, which fixes the id assignment.
    token_counts = Counter(
        token for tokens in tokenized_texts for token in tokens
    )
    word_to_id = dict(zip(token_counts, range(2, len(token_counts) + 2)))
    word_to_id['<PAD>'] = 0
    word_to_id['<UNK>'] = 1
    return word_to_id
    

In [7]:
# Token -> id mapping over every tokenized review.
# NOTE(review): the vocabulary is built from all reviews, before the
# train/test split performed in a later cell, so held-out tokens are never
# mapped to '<UNK>' -- confirm this is intended.
vocab = build_vocab(tokenized_reviews)

In [8]:
def encode_texts(tokenized_texts , vocab):
    """Replace every token with its vocabulary id.

    Tokens that are missing from `vocab` are mapped to the id stored under
    '<UNK>'.

    Args:
        tokenized_texts: iterable of token sequences.
        vocab: dict mapping token -> id; must contain '<UNK>' whenever there
            is at least one token to encode.

    Returns:
        A list with one list of ids per input sequence.
    """
    encoded = []
    for tokens in tokenized_texts:
        ids = []
        for token in tokens:
            ids.append(vocab.get(token , vocab['<UNK>']))
        encoded.append(ids)
    return encoded

In [9]:
# Convert every tokenized review into a list of vocabulary ids.
encoded_reviews = encode_texts(tokenized_reviews , vocab)

In [10]:
# Turn the 'sentiment' column into integer class ids; `label_encoder` is kept
# so the ids can be mapped back to the original labels later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['sentiment'])

In [11]:
# Hold out 10% of the samples for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(encoded_reviews, encoded_labels, test_size=0.1, random_state=42)

In [12]:
def pad_sequences(sequences, maxlen=None, padding='post', value=0):
    """Pad or truncate variable-length sequences into one 2-D array.

    Args:
        sequences: list of sequences (e.g. lists of token ids).
        maxlen: width of the result. Defaults to the length of the longest
            sequence (0 when `sequences` is empty).
        padding: 'post' puts the fill values after the sequence; any other
            value puts them before it.
        value: fill value used for the padded positions.

    Returns:
        np.ndarray of shape (len(sequences), maxlen). Sequences longer than
        `maxlen` keep their first `maxlen` items.
    """
    num_samples = len(sequences)
    if maxlen is None:
        # `default=0` keeps an empty batch from raising.
        maxlen = max((len(s) for s in sequences), default=0)

    padded_sequences = np.full((num_samples, maxlen), value)
    for i, seq in enumerate(sequences):
        seq = seq[:maxlen]  # Truncate (no-op for short sequences)
        seq_len = len(seq)
        if seq_len == 0:
            # Nothing to copy; the row stays all-padding. Skipping also avoids
            # the `-0:` slice, which would select the whole row.
            continue
        if padding == 'post':
            padded_sequences[i, :seq_len] = seq
        else:  # pre-padding
            padded_sequences[i, maxlen - seq_len:] = seq
    return padded_sequences

In [13]:
# Maximum number of tokens kept per review (longer reviews are truncated,
# shorter ones are padded). Self-attention memory and time grow with the
# square of this value: at 5000 tokens, 8 heads and batch size 32 the attention
# weights alone need roughly 25 GB per layer, so the cap is kept at 512.
# It must not exceed the `max_len` of the Embeddings module (default 5000).
max_seq_len = 512
X_train_padded = pad_sequences(X_train, maxlen=max_seq_len, padding='post', value=vocab['<PAD>'])
X_test_padded = pad_sequences(X_test, maxlen=max_seq_len, padding='post', value=vocab['<PAD>'])

# Convert to PyTorch tensors: token ids must be integer (long) for
# nn.Embedding; labels are float because the training cell uses
# BCEWithLogitsLoss.
X_train_tensor = torch.tensor(X_train_padded, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_padded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoaders; only the training data is shuffled.
train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)
batch_size = 32
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)


In [14]:
class Embeddings(nn.Module):
    """Token embedding plus a learned absolute position embedding.

    Args:
        vocab_size: number of rows in the token embedding table.
        embed_dim: size of each embedding vector.
        max_len: longest sequence length that can be embedded.
    """

    def __init__(self, vocab_size , embed_dim , max_len = 5000):
        super().__init__()
        self.embed = nn.Embedding(vocab_size , embed_dim)
        self.pos_embed = nn.Embedding(max_len , embed_dim)

    def forward(self , x):
        """Embed token ids `x` (sequence along the last axis).

        Returns:
            Tensor of shape `x.shape + (embed_dim,)`: token embedding plus the
            embedding of each token's position.

        Raises:
            IndexError: if the sequence is longer than `max_len`.
        """
        T = x.shape[-1]
        max_len = self.pos_embed.num_embeddings
        if T > max_len:
            raise IndexError(
                f"sequence length {T} exceeds the positional table size {max_len}"
            )
        tok_embed = self.embed(x)
        # Build the position ids on the input's device; a default (CPU) arange
        # would clash with the embedding weights once the model is on a GPU.
        positions = torch.arange(T, device=x.device)
        pos_embed = self.pos_embed(positions)
        return tok_embed + pos_embed

In [15]:
class Transformer(nn.Module):
    """Transformer-encoder text classifier that returns raw logits.

    Pipeline: token + position embeddings -> TransformerEncoder -> mean over
    the non-padding tokens -> linear head.

    Args:
        vocab_size: number of token ids.
        embed_dim: model width (embedding size).
        nhead: number of attention heads.
        num_layers: number of encoder layers.
        dropout: dropout used inside every encoder layer.
        num_outputs: size of the classification head. The default of 1 (one
            logit per sample) matches a BCEWithLogitsLoss objective with
            float labels of shape (batch,).
        pad_idx: token id used for padding; those positions are hidden from
            attention and left out of the pooled average.
    """

    def __init__(self,vocab_size , embed_dim , nhead , num_layers, dropout = 0.5,
                 num_outputs = 1, pad_idx = 0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding_layer = Embeddings(vocab_size , embed_dim)
        # batch_first=True: inputs are (batch, seq, embed). With the default
        # (seq, batch, embed) layout attention would run across the batch.
        encoder_layer = nn.TransformerEncoderLayer(
            embed_dim, nhead , dropout = dropout, batch_first = True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer , num_layers)
        self.classifier = nn.Linear(embed_dim, num_outputs)

    def forward(self, x):
        """Classify a batch of token-id sequences.

        Args:
            x: integer tensor of token ids, expected shape (batch, seq).

        Returns:
            Logits of shape (batch, num_outputs). No softmax/sigmoid is
            applied: logit-based losses expect unnormalised scores.
        """
        pad_mask = x == self.pad_idx  # True where the token is padding
        # Always keep position 0 visible so that a sequence made only of
        # padding does not produce a fully masked (NaN) attention row.
        pad_mask[:, 0] = False

        hidden = self.embedding_layer(x)
        hidden = self.encoder(hidden, src_key_padding_mask = pad_mask)

        # Mean-pool over the real tokens only. The mask is cut to the encoder
        # output length as a safeguard, in case the encoder returns a tensor
        # trimmed to the longest unpadded sequence.
        keep = (~pad_mask)[:, :hidden.size(1)].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * keep).sum(dim = 1) / keep.sum(dim = 1)
        return self.classifier(pooled)
        
        

In [16]:
# Model and training hyperparameters.
# Number of token ids, including the '<PAD>' and '<UNK>' entries.
vocab_size = len(vocab)
# Width of the token/position embeddings and of the encoder.
embed_dim = 512
# Attention heads per encoder layer.
nhead = 8
# Number of stacked encoder layers.
num_layers = 6
# Full passes over the training data.
num_epochs = 10
# Batches per epoch; only used for progress printing in the training cell.
num_iter = len(train_loader)

In [17]:
# Build the classifier with the hyperparameters defined above.
# NOTE(review): the model is not moved to `device` in this cell -- confirm
# whether a `.to(device)` was intended.
model = Transformer(vocab_size , embed_dim , nhead , num_layers)



In [18]:
# Binary objective on raw logits: the model must return one unnormalised score
# per sample, and the labels are floats in {0, 1}.
loss_function = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Batches are sent to wherever the model's parameters currently live, so the
# loop works unchanged on CPU and on GPU.
model_device = next(model.parameters()).device
# Print the loss every `log_every` iterations instead of on every step.
log_every = 50

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for i, ( X_batch, y_batch ) in enumerate(train_loader):
        # Move data to the same device as model
        X_batch, y_batch = X_batch.to(model_device), y_batch.to(model_device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        # reshape(-1) flattens (batch, 1) logits to (batch,). A bare squeeze()
        # would also drop the batch axis when a batch holds a single sample.
        loss = loss_function(outputs.reshape(-1), y_batch.float())
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        epoch_loss += batch_loss
        if i % log_every == 0 or i == num_iter - 1:
            print(f"[{epoch}/{num_epochs}] : [{i}/{num_iter}] : {batch_loss}")
    print(f"[{epoch}/{num_epochs}] : mean loss : {epoch_loss / max(num_iter, 1)}")
    
    


KeyboardInterrupt: 