In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon-reviews/amazon_review_polarity_csv.tgz
/kaggle/input/amazon-reviews/train.csv
/kaggle/input/amazon-reviews/test.csv


In [1]:
import numpy as np
import pandas as pd
import random
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from collections import Counter


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [3]:
train_df = pd.read_csv(
    "/kaggle/input/amazon-reviews/train.csv",
    header=None
)

test_df = pd.read_csv(
    "/kaggle/input/amazon-reviews/test.csv",
    header=None
)


In [4]:
train_df.columns = ['sentiment', 'title', 'review_text']
test_df.columns  = ['sentiment', 'title', 'review_text']


In [5]:
train_df

Unnamed: 0,sentiment,title,review_text
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
...,...,...,...
3599995,1,Don't do it!!,The high chair looks great when it first comes...
3599996,1,"Looks nice, low functionality",I have used this highchair for 2 kids now and ...
3599997,1,"compact, but hard to clean","We have a small house, and really wanted two o..."
3599998,1,what is it saying?,not sure what this book is supposed to be. It ...


In [6]:
test_df

Unnamed: 0,sentiment,title,review_text
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...
...,...,...,...
399995,1,Unbelievable- In a Bad Way,We bought this Thomas for our son who is a hug...
399996,1,"Almost Great, Until it Broke...",My son recieved this as a birthday gift 2 mont...
399997,1,Disappointed !!!,"I bought this toy for my son who loves the ""Th..."
399998,2,Classic Jessica Mitford,This is a compilation of a wide range of Mitfo...


In [7]:
train_df['sentiment'] = train_df['sentiment'].map({1: 0, 2: 1})
test_df['sentiment']  = test_df['sentiment'].map({1: 0, 2: 1})


In [8]:
train_df['review_text'] = train_df['title'] + " " + train_df['review_text']
test_df['review_text']  = test_df['title'] + " " + test_df['review_text']

train_df = train_df[['review_text', 'sentiment']]
test_df  = test_df[['review_text', 'sentiment']]


In [9]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-zA-Z ]", "", text)
    return text

train_df['review_text'] = train_df['review_text'].apply(clean_text)
test_df['review_text']  = test_df['review_text'].apply(clean_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['review_text'] = train_df['review_text'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['review_text']  = test_df['review_text'].apply(clean_text)


In [12]:
train_text, val_text, train_y, val_y = train_test_split(
    train_df.review_text,
    train_df.sentiment,
    test_size=0.125,   # 10% of total data
    random_state=42
)

test_text = test_df.review_text
test_y    = test_df.sentiment


In [15]:
def build_vocab(texts, max_vocab=20000):
    counter = Counter()
    for text in texts:
        counter.update(text.split())
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, _ in counter.most_common(max_vocab - 2):
        vocab[word] = len(vocab)
    return vocab

vocab = build_vocab(train_text)


In [16]:
def encode(text, vocab, max_len=100):
    tokens = [vocab.get(w, vocab['<UNK>']) for w in text.split()]
    return tokens[:max_len] + [0] * max(0, max_len - len(tokens))


In [22]:
class ReviewDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.texts[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long)
        )


In [23]:
MAX_LEN = 100
BATCH_SIZE = 64

X_train = [encode(t, vocab, MAX_LEN) for t in train_text]
X_val   = [encode(t, vocab, MAX_LEN) for t in val_text]
X_test  = [encode(t, vocab, MAX_LEN) for t in test_text]

train_ds = ReviewDataset(X_train, train_y.values)
val_ds   = ReviewDataset(X_val, val_y.values)
test_ds  = ReviewDataset(X_test, test_y.values)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE)


In [24]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embed_dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            bidirectional=True,
            batch_first=True
        )
        self.lstm_dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x):
        x = self.embedding(x)
        x = self.embed_dropout(x)
        _, (h, _) = self.lstm(x)
        h = torch.cat((h[-2], h[-1]), dim=1)
        h = self.lstm_dropout(h)
        return torch.sigmoid(self.fc(h))


In [25]:
def evaluate(model, loader):
    model.eval()
    preds, labels = [], []

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            out = model(x).squeeze()
            preds.extend((out > 0.5).int().cpu().tolist())
            labels.extend(y.cpu().tolist())

    return f1_score(labels, preds, average='macro')


In [26]:
def train_model(model, train_loader, val_loader, epochs=5):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.BCELoss()

    for epoch in range(epochs):
        model.train()
        losses = []

        for x, y in train_loader:
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            out = model(x).squeeze()
            loss = criterion(out, y.float())
            loss.backward()
            optimizer.step()

            losses.append(loss.item())

        val_f1 = evaluate(model, val_loader)
        print(
            f"Epoch {epoch+1} | "
            f"Train Loss: {np.mean(losses):.4f} | "
            f"Val Macro F1: {val_f1:.4f}"
        )


In [27]:
for dropout in [0.2, 0.4, 0.6]:
    print(f"\n===== DROPOUT = {dropout} =====")

    model = BiLSTM(
        vocab_size=len(vocab),
        embed_dim=100,
        hidden_dim=64,
        dropout=dropout
    ).to(device)

    train_model(model, train_loader, val_loader, epochs=5)

    test_f1 = evaluate(model, test_loader)
    print(f"Test Macro F1: {test_f1:.4f}")



===== DROPOUT = 0.2 =====
Epoch 1 | Train Loss: 0.1963 | Val Macro F1: 0.9403
Epoch 2 | Train Loss: 0.1577 | Val Macro F1: 0.9441
Epoch 3 | Train Loss: 0.1493 | Val Macro F1: 0.9454
Epoch 4 | Train Loss: 0.1443 | Val Macro F1: 0.9462
Epoch 5 | Train Loss: 0.1411 | Val Macro F1: 0.9470
Test Macro F1: 0.9465

===== DROPOUT = 0.4 =====
Epoch 1 | Train Loss: 0.2191 | Val Macro F1: 0.9357
Epoch 2 | Train Loss: 0.1729 | Val Macro F1: 0.9403
Epoch 3 | Train Loss: 0.1643 | Val Macro F1: 0.9418
Epoch 4 | Train Loss: 0.1596 | Val Macro F1: 0.9429
Epoch 5 | Train Loss: 0.1565 | Val Macro F1: 0.9432
Test Macro F1: 0.9431

===== DROPOUT = 0.6 =====
Epoch 1 | Train Loss: 0.2526 | Val Macro F1: 0.9234
Epoch 2 | Train Loss: 0.1970 | Val Macro F1: 0.9337
Epoch 3 | Train Loss: 0.1866 | Val Macro F1: 0.9336
Epoch 4 | Train Loss: 0.1818 | Val Macro F1: 0.9348
Epoch 5 | Train Loss: 0.1789 | Val Macro F1: 0.9350
Test Macro F1: 0.9344


In [28]:
def add_spelling_noise(text, prob=0.1):
    chars = list(text)
    for i in range(len(chars)):
        if random.random() < prob:
            chars[i] = random.choice("abcdefghijklmnopqrstuvwxyz")
    return ''.join(chars)

synonyms = {
    "good": "nice",
    "bad": "poor",
    "great": "excellent",
    "love": "like",
    "hate": "dislike"
}

def synonym_replace(text):
    return " ".join([synonyms.get(w, w) for w in text.split()])


In [29]:
noisy_text = test_text.apply(add_spelling_noise).apply(synonym_replace)
X_noisy = [encode(t, vocab, MAX_LEN) for t in noisy_text]

noisy_ds = ReviewDataset(X_noisy, test_y.values)
noisy_loader = DataLoader(noisy_ds, batch_size=BATCH_SIZE)

noisy_f1 = evaluate(model, noisy_loader)
print("Noisy Test Macro F1:", noisy_f1)


Noisy Test Macro F1: 0.8497805789485513
