In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Gather the full path of every file found anywhere beneath the Kaggle input directory.
files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]

# Echo each discovered path, one per line.
for path in files:
    print(path)
    

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip
/kaggle/input/word2vec-nlp-tutorial/sampleSubmission.csv
/kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip
/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip


In [3]:
# Look the archives up by file name instead of by position in `files`:
# os.walk() makes no promise about the order in which files are listed, so
# indices such as files[3] can point at a different archive in another session.
input_paths = {os.path.basename(path): path for path in files}

train_data = pd.read_csv(input_paths["labeledTrainData.tsv.zip"], delimiter='\t')
test_data = pd.read_csv(input_paths["testData.tsv.zip"], delimiter='\t')

# Preview the first rows of each frame, labelled so the two outputs can be told apart.
datasets = [train_data, test_data]
titles = ['Train', 'Test']
for dataset, title in zip(datasets, titles):
    print(f"{title} data")
    display(dataset.head())

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [4]:
### IMPORTS AND CUDA LOAD
import pandas as pd
import torch
from torch.nn import Module, Embedding, GRU, Linear
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from bs4 import BeautifulSoup

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using {device} device")


Using cpu device


In [5]:
# Preprocess review function
# NOTE: As it seems this function makes predictions worse!
def preprocess(review):
    """Strip HTML markup from a review and normalise its case and whitespace.

    Args:
        review: Raw review text, possibly containing HTML tags such as ``<br />``.

    Returns:
        The lower-cased visible text with words separated by single spaces.
    """
    # Join the extracted text fragments with a space. The default separator is
    # the empty string, which glues together the words on either side of a tag:
    # "great.<br /><br />However" would come out as "great.However".
    text = BeautifulSoup(review, "html.parser").get_text(separator=" ")
    # split()/join collapses any run of whitespace into a single space.
    return " ".join(text.lower().split())

In [6]:
# Dataset class
class ReviewDataset(Dataset):
    """Map-style dataset that encodes whitespace-tokenised reviews as fixed-length id tensors.

    Args:
        reviews: Sequence of review strings (e.g. a list or a pandas Series).
        labels: Sequence of labels aligned with ``reviews``, or ``None`` for
            unlabeled data.
        vocab: Dict mapping a word to its integer id. Words missing from it are
            encoded as 0, the same id that is used for padding.
        max_len: Number of token ids per sample. Longer reviews are truncated,
            shorter ones are right-padded with 0. Defaults to 64.
    """

    def __init__(self, reviews, labels, vocab, max_len=64):
        self.reviews = reviews
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    @staticmethod
    def _item_at(values, idx):
        """Return the ``idx``-th element of ``values`` by position."""
        # pandas objects index by *label* with []; on a Series whose index is not
        # 0..n-1 (after filtering, shuffling or splitting) that raises KeyError or
        # returns a different row than the one the sampler asked for.
        if hasattr(values, "iloc"):
            return values.iloc[idx]
        return values[idx]

    def __getitem__(self, idx):
        """Return ``(token_ids, label)``, or only ``token_ids`` when there are no labels."""
        review = self._item_at(self.reviews, idx)
        tokens = [self.vocab.get(word, 0) for word in review.split()][:self.max_len]
        # After truncation the pad count is never negative.
        padded_tokens = tokens + [0] * (self.max_len - len(tokens))

        if self.labels is not None:
            label = self._item_at(self.labels, idx)
            return torch.tensor(padded_tokens), torch.tensor(label)
        return torch.tensor(padded_tokens)


In [7]:
# Build vocabulary, datasets and dataloaders
# Sort the distinct tokens so every word receives the same id in every session.
# Iterating a set of strings follows hash order, which Python randomises per
# process, so unsorted ids change between kernel restarts and previously saved
# weights can no longer be matched to a rebuilt vocabulary.
unique_words = set(" ".join(train_data["review"]).split())
# Keep id 0 reserved for padding even if the literal token "<pad>" occurs in the
# corpus; this also guarantees len(vocab) == largest id + 1.
unique_words.discard("<pad>")
vocab = {word: idx + 1 for idx, word in enumerate(sorted(unique_words))}
vocab["<pad>"] = 0

# The test set has no labels, so its dataset yields token tensors only.
train_dataset = ReviewDataset(train_data["review"], train_data["sentiment"], vocab)
test_dataset = ReviewDataset(test_data["review"], None, vocab)

batch_size = 64
# Only the training data is shuffled; test order must match test_data's rows.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [8]:
# Our model
# Tried with different RNN, GRU AND LSTM
# RNN performed noticeably worse (or I think it did, and something else was the problem :d)
class SentimentClassifier(Module):
    """Embedding -> single-layer GRU -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the GRU hidden state.
        output_dim: Number of values produced per input sequence.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        # Lookup table turning token ids into dense vectors.
        self.embedding = Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

        # One GRU layer.
        # Swapping in a plain RNN made the model perform worse;
        # an LSTM performed approximately the same as the GRU.
        self.gru = GRU(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        # Projects the final hidden state to the output values.
        self.fc = Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the linear head applied to the GRU's final hidden state for token ids ``x``."""
        token_vectors = self.embedding(x)
        # The GRU returns (per-step outputs, final hidden state); only the latter is used.
        final_hidden = self.gru(token_vectors)[1]
        # Remove the leading layer dimension (size 1 for this single-layer GRU).
        return self.fc(final_hidden.squeeze(dim=0))


In [9]:
# Model hyperparameters
output_dim = 1  # BINARY CLASSIFICATION (sentiment analysis): a single output value per review
hidden_dim = 64  # Tried different sizes, 64 performed on average the best + it needs less time to train
embed_dim = 64  # size of each word embedding
vocab_size = len(vocab)  # number of entries in the vocabulary, including "<pad>"

In [10]:
# Initialize model, loss, and optimizer
# Seed torch's global RNG first so the randomly initialised weights (and anything
# else that later draws from the global generator) are repeatable between runs.
torch.manual_seed(42)
model = SentimentClassifier(vocab_size, embed_dim, hidden_dim, output_dim).to(device)
# The model emits raw logits; this loss applies the sigmoid internally.
criterion = torch.nn.BCEWithLogitsLoss()
# Adam with its default settings.
optimizer = Adam(model.parameters())

In [None]:
# TRAINING LOOP
epochs = 6
# Put the model in training mode explicitly, so re-running this cell after the
# prediction cell (which calls model.eval()) still trains in the right mode.
model.train()
for epoch in range(epochs):
    for batch_idx, (reviews, labels) in enumerate(train_loader):
        reviews = reviews.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(reviews)
        # Drop only the trailing logit dimension: (batch, 1) -> (batch,).
        # A bare squeeze() would also remove the batch dimension when a batch
        # holds a single sample, and the loss then rejects the shape mismatch
        # between the 0-d output and the 1-d labels.
        loss = criterion(outputs.squeeze(-1), labels.float())
        loss.backward()
        optimizer.step()

        # Report the current batch loss every 100 batches.
        if batch_idx % 100 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx}/{len(train_loader)}, Loss: {loss.item():.4f}")


Epoch 1/6, Batch 0/391, Loss: 0.6934
Epoch 1/6, Batch 100/391, Loss: 0.6884
Epoch 1/6, Batch 200/391, Loss: 0.6861
Epoch 1/6, Batch 300/391, Loss: 0.6928
Epoch 2/6, Batch 0/391, Loss: 0.6546
Epoch 2/6, Batch 100/391, Loss: 0.6403
Epoch 2/6, Batch 200/391, Loss: 0.6028
Epoch 2/6, Batch 300/391, Loss: 0.5174
Epoch 3/6, Batch 0/391, Loss: 0.4873
Epoch 3/6, Batch 100/391, Loss: 0.5325
Epoch 3/6, Batch 200/391, Loss: 0.3852
Epoch 3/6, Batch 300/391, Loss: 0.4081
Epoch 4/6, Batch 0/391, Loss: 0.2883


In [None]:
# PREDICTION LOOP
model.eval()
predictions = []
# Inference only: no gradients are needed.
with torch.no_grad():
    for reviews in test_loader:
        reviews = reviews.to(device)
        outputs = model(reviews)
        # Sigmoid turns logits into probabilities; rounding thresholds them at 0.5.
        # squeeze(-1) keeps the batch dimension, so a final batch with a single
        # review still yields a 1-d array. A bare squeeze() would produce a 0-d
        # array there, which extend() cannot iterate over.
        preds = torch.round(torch.sigmoid(outputs)).squeeze(-1).cpu().numpy()
        predictions.extend(preds)

In [None]:
# Write the predictions next to their review ids
test_data["sentiment"] = predictions
test_data.loc[:, ["id", "sentiment"]].to_csv("data.csv", index=False)
print("Predictions saved to 'data.csv'")
# Write the trained parameters (state dict only, not the whole module)
torch.save(obj=model.state_dict(), f="weights.w")
print("Saved current model weights to 'weights.w'")