# Sentiment Analysis

## Imports & Configs

In [2]:
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch.utils.data import Dataset , DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
import nltk
nltk.download('punkt')

# setup device agnostic code
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


device(type='cpu')

## Load Dataset (IMDB from huggingface)

In [3]:
from datasets import load_dataset
dataset = load_dataset("imdb")

train_texts = dataset['train']['text']
train_labels = dataset['train']['label']
test_texts  = dataset['test']['text']
test_labels  = dataset['test']['label']





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [18]:
print(f'Train size: {len(train_texts)}')
print(f'Test size: {len(test_texts)}')

Train size: 25000
Test size: 25000


ClassLabel(names=['neg', 'pos'])

## Preprocessing

In [5]:
# text cleaning utility
def clean_text(string):
  string = string.lower()
  string = re.sub(r"https\S+","",string)
  string = re.sub(r"@\w+", "", string)
  string = re.sub(r"[^a-z0-9\s']"," ",string)
  string = re.sub(r"\s+"," ",string).strip()

  return string

In [6]:
# applying utility on train-test data
train_texts = [clean_text(t) for t in train_texts]
test_texts = [clean_text(t) for t in test_texts]

## Tokenization & Vocabulary

In [15]:
from collections import Counter

def build_vocab(texts,min_freq = 2, max_size = 200000):
  counter = Counter()
  for t in texts:
    counter.update(t.split())
  most_common = [w for w, c in counter.most_common(max_size) if c >=min_freq]
  itos = ["<PAD>","<OOV>"] + most_common
  stoi = {w:i for i , w in enumerate(itos)}
  return stoi , itos

stoi , itos = build_vocab(train_texts)
vocab_size = len(stoi)
print("Vocab Size: ", vocab_size)

Vocab Size:  51826


In [17]:
def texts_to_sequence(texts,stoi):
  seqs = []
  for t in texts:
    seq = [stoi.get(w,stoi["<OOV>"]) for w in t.split()]
    seqs.append(torch.tensor(seq, dtype= torch.long))
  return seqs

train_seq = texts_to_sequence(train_texts,stoi)
test_seq = texts_to_sequence(test_texts , stoi)

## Dataset & DataLoader

In [19]:
class TextDataset(Dataset):
  def __init__(self,seqs, labels):
    self.seqs = seqs
    self.labels = labels
  def __len__(self):
    return len(self.seqs)
  def __getitem__(self, idx):
    return self.seqs[idx], torch.tensor(self.labels[idx], dtype=torch.float)

def collate_fn(batch):
  seqs , labels = zip(*batch)
  seqs_padded = pad_sequence(seqs, batch_first=True, padding_value= 0)
  lengths = torch.tensor([len(s) for s in seqs])
  labels = torch.stack(labels)
  return seqs_padded, lengths, labels


In [20]:
train_ds = TextDataset(train_seq,train_labels)
test_ds = TextDataset(test_seq, test_labels)

train_loader = DataLoader(train_ds,
                          batch_size=64,
                          shuffle=True,
                          collate_fn= collate_fn)

test_loader = DataLoader(test_ds,
                          batch_size=64,
                          collate_fn= collate_fn)