In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import contractions

In [2]:
# Load the raw training data. Because `names` is supplied, pandas treats the
# file as header-less and reads its first row as data.
df = pd.read_csv(
    "train.csv",
    header=None,
    names=["rating", "title", "review"],
)

In [3]:
# Number of missing values in each column.
df.isnull().sum()

rating      0
title     207
review      0
dtype: int64

In [4]:
# Replace missing titles with an empty string.
df = df.assign(title=df["title"].fillna(""))

In [5]:
# Character count of each review.
df["review_length"] = df["review"].map(len)

In [7]:
# Stratified down-sampling: for every rating, draw rows so that the sample keeps
# that rating's distribution over review-length quintiles.

# Quintile edges of the review length. The first edge is lowered to 0 and the
# last one raised by 1 so the shortest and longest reviews fall inside a bin.
bins = df['review_length'].quantile([0, 0.2, 0.4, 0.6, 0.8, 1.0]).to_numpy()
bins[0] = 0
bins[-1] += 1
# Identical quantiles (many reviews of the same length) would make pd.cut raise
# on duplicate edges, so collapse them and size the labels to match.
bins = np.unique(bins)
labels = np.arange(len(bins) - 1)
df['length_bin'] = pd.cut(
    df['review_length'], bins=bins, labels=labels, include_lowest=True
)

# Stratified sampling: group by class and bin. observed=True keeps only the
# (rating, bin) combinations that actually occur and states explicitly the
# behaviour pandas otherwise warns about for categorical groupers.
grouped = df.groupby(['rating', 'length_bin'], observed=True)

# Target number of rows per class. Per-bin counts are truncated to integers, so
# a class can end up a few rows short of this target.
sample_size_per_class = 5000
final_samples = []

for label in df['rating'].unique():
    group = df[df['rating'] == label]
    bin_counts = group['length_bin'].value_counts(normalize=True)

    for bin_label, frac in bin_counts.items():
        n = int(sample_size_per_class * frac)
        if n == 0:
            # value_counts on a categorical also lists bins with no rows; such
            # a bin has no group, so get_group would raise KeyError.
            continue
        subset = grouped.get_group((label, bin_label))
        # Never request more rows than the stratum holds.
        sampled = subset.sample(n=min(n, len(subset)), random_state=42)
        final_samples.append(sampled)

# Combine the strata, then shuffle so rows are not ordered by class and bin.
sampled_df = pd.concat(final_samples).sample(frac=1, random_state=45).reset_index(drop=True)

  grouped = df.groupby(['rating', 'length_bin'])


In [8]:
# Continue with the stratified sample only. This is a plain rebinding, not a
# copy: `df` and `sampled_df` now refer to the same DataFrame object.
df = sampled_df

In [9]:
def preprocess(title, review):
    """Merge a review's title and body into one normalized, tokenizable string.

    Steps: join as ``"<title> : <review>"``, lowercase, expand contractions,
    spell out ``&``, replace every character outside ``a-z 0-9 ! ? . , : '``
    with a space, collapse runs of the same punctuation mark, put spaces
    around ``! ? . ,`` and finally squeeze repeated spaces.

    Args:
        title: Review title (may be an empty string).
        review: Review body.

    Returns:
        The cleaned text, with single spaces between tokens and no leading or
        trailing whitespace.
    """
    text = title + " : " + review
    text = text.lower()
    text = contractions.fix(text)
    # Pad the replacement so "rock&roll" becomes "rock and roll" rather than
    # the single unknown word "rockandroll".
    text = re.sub(r'&', r' and ', text)
    # Substitute a space (not the empty string) for unsupported characters:
    # deleting hyphens, slashes or newlines would glue neighbouring words together.
    text = re.sub(r"[^a-z0-9!?.,:' ]+", " ", text)
    # Collapse runs of the same punctuation mark ("!!!" -> "!").
    text = re.sub(r'([!?.,])\1{1,}', r'\1', text)
    # Separate punctuation from words for better tokenization later.
    text = re.sub(r'([!?.,])', r' \1 ', text)
    # Squeeze the extra spaces introduced by the substitutions above.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

In [10]:
# Build the cleaned model input from each row's title and review.
df["text"] = df[["title", "review"]].apply(
    lambda pair: preprocess(pair["title"], pair["review"]),
    axis=1,
)

In [11]:
# Drop the raw text and the length helper columns in place.
df.drop(columns=["title", "review", "review_length", "length_bin"], inplace=True)

In [12]:
# Shift every rating down by one. This cell is not idempotent: each re-run
# subtracts one more.
df["rating"] = df["rating"] - 1

In [13]:
# Load pre-trained GloVe vectors into two lookup tables:
#   vocab_map: word -> integer id
#   embeds:    id   -> embedding vector
dim = 100
embed_file = f"glove.6B/glove.6B.{dim}d.txt"
embeds = {}
vocab_map = {}

# Ids 0 and 1 are reserved and both map to all-zero vectors:
# 0 = out-of-vocabulary, 1 = padding.
embeds[0] = np.zeros(dim)
embeds[1] = np.zeros(dim)

# Each line holds a token followed by its whitespace-separated components;
# tokens are numbered from 2 in file order.
with open(embed_file, "r", encoding='utf-8') as glove_file:
    for idx, line in enumerate(glove_file, start=2):
        fields = line.split()
        vocab_map[fields[0]] = idx
        embeds[idx] = np.asarray(fields[1:], "float32")


In [14]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets
from torchvision.transforms import ToTensor

In [15]:
# Stack the id -> vector table into one float32 matrix. Row i holds the vector
# for id i because dicts preserve insertion order and `embeds` was filled in id
# order. Converting through a single NumPy array avoids building one torch
# tensor per vocabulary entry only to have NumPy unpack each of them again.
embed_tensor = torch.from_numpy(np.asarray(list(embeds.values()), dtype=np.float32))

In [35]:
# Length of the longest cleaned text, counted in whitespace-separated tokens.
max_text_len = max(map(lambda text: len(text.split()), df["text"]))

def pad_and_tokenize(text, pad_len):
    """Convert ``text`` into a fixed-length array of vocabulary ids.

    The text is split on whitespace and each word is looked up in
    ``vocab_map``; unknown words get id 0. Unused trailing positions hold the
    padding id 1. Texts with more than ``pad_len`` words are truncated instead
    of raising ``IndexError``.

    Args:
        text: String to tokenize on whitespace.
        pad_len: Length of the returned array.

    Returns:
        1-D integer NumPy array of length ``pad_len``.
    """
    oov_idx, pad_idx = 0, 1
    padded = np.full(pad_len, pad_idx, dtype=int)
    ids = [vocab_map.get(word, oov_idx) for word in text.split()[:pad_len]]
    if ids:
        padded[:len(ids)] = ids
    return padded
        
# Encode every cleaned text as a padded id tensor of length max_text_len.
df["token_id"] = df["text"].apply(
    lambda text: torch.tensor(pad_and_tokenize(text, max_text_len))
)


In [36]:
# Pair the stacked token-id tensors with float32 targets shaped (N, 1).
train_data = TensorDataset(
    torch.stack(list(df["token_id"])),
    torch.from_numpy(df["rating"].to_numpy(dtype="float32")).unsqueeze(1),
)
# No shuffling is requested, so batches follow the dataset order.
training_loader = DataLoader(train_data, batch_size=54)

## LSTM

In [42]:
class SentimentNN(nn.Module):
    """Binary classifier: pre-trained embeddings -> LSTM -> linear -> sigmoid.

    Args:
        input_size: Width of the embedding vectors (LSTM input size).
        hidden_size: Size of the LSTM hidden state.
        embed_tensor: 2-D float tensor of pre-trained embeddings; row
            ``PAD_IDX`` is the padding vector.
    """

    # Token id used to right-pad sequences.
    PAD_IDX = 1

    def __init__(self, input_size, hidden_size, embed_tensor):
        super().__init__()
        self.s_i = input_size
        self.s_h = hidden_size
        self.embed_layer = nn.Embedding.from_pretrained(embed_tensor, padding_idx=self.PAD_IDX)
        self.lstm = nn.LSTM(self.s_i, self.s_h, batch_first=True)
        self.dense = nn.Linear(self.s_h, 1)

    def forward(self, x):
        """Return one probability per sequence.

        Args:
            x: Integer tensor of token ids, shape (batch, seq_len), padded on
                the right with ``PAD_IDX``.

        Returns:
            Float tensor of shape (batch, 1) holding sigmoid outputs.
        """
        vec = self.embed_layer(x)
        # Number of real (non-padding) tokens per sequence. Reading the LSTM
        # output at the last time step would return the state after running
        # over all the trailing padding, so the result would depend on how much
        # padding a sequence carries. Clamp to 1 because packing rejects
        # zero-length sequences.
        lengths = (x != self.PAD_IDX).sum(dim=1).clamp(min=1)
        packed = nn.utils.rnn.pack_padded_sequence(
            vec, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        # For packed input, hn holds the hidden state at each sequence's last
        # real token, returned in the original batch order.
        _, (hn, _) = self.lstm(packed)
        last_hidden = hn[-1]
        logits = self.dense(last_hidden)
        y_hat = torch.sigmoid(logits)
        return y_hat

def train(model, training_loader, lr, threshold=0.5, epochs=1):
    """Train ``model`` with Adam and binary cross-entropy.

    Args:
        model: Module mapping a batch of inputs to probabilities in [0, 1]
            with the same shape as the targets (required by ``nn.BCELoss``).
        training_loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        lr: Adam learning rate.
        threshold: Unused; kept so existing calls stay valid.
        epochs: Number of passes over ``training_loader``.

    Returns:
        List with the mean batch loss of each epoch (``nan`` for an epoch
        that yielded no batches).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.BCELoss()

    epoch_losses = []
    for _ in range(epochs):
        total_loss, n_batches = 0.0, 0
        for X_batch, y_batch in training_loader:
            # Tensor.to returns a new tensor instead of moving in place, so the
            # result has to be rebound for the batch to reach `device`.
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            # Clear the gradients of the previous step; otherwise backward()
            # keeps adding to them and every update uses the running sum.
            optimizer.zero_grad()
            y_hat = model(X_batch)

            loss = loss_fn(y_hat, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            n_batches += 1
        epoch_losses.append(total_loss / n_batches if n_batches else float("nan"))
    return epoch_losses

In [43]:
# LSTM classifier over `dim`-wide GloVe vectors with a 128-unit hidden state.
lstm_sentiment = SentimentNN(
    input_size=dim,
    hidden_size=128,
    embed_tensor=embed_tensor,
)

In [44]:
# Run training over the loader with learning rate 0.01.
train(model=lstm_sentiment, training_loader=training_loader, lr=0.01)

In [130]:
# Sanity check: print the shape of the first batch and the dtype of its embeddings.
embed_layer = nn.Embedding.from_pretrained(embed_tensor, padding_idx=1)
first_batch = next(iter(training_loader), None)
if first_batch is not None:
    x, y = first_batch
    print(x.shape)
    print(embed_layer(x).dtype)

torch.Size([54, 1056])
torch.float32


In [92]:
# Dimensions of the embedding matrix.
embed_tensor.size()

torch.Size([400002, 100])

In [105]:
# Scratch tensor with a singleton middle axis.
a = torch.rand(4, 1, 5)
a.size()

torch.Size([4, 1, 5])

In [114]:
# Selecting the last index along axis 1 removes that axis.
a[:, -1].size()

torch.Size([4, 5])

In [28]:

# Distinct rating values, in order of first appearance.
pd.unique(df["rating"])

array([1, 0])

In [34]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu
