In [2]:
# !pip install torchtext
# !pip install transformers

# Importing Libraries

In [3]:
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import torch.nn.functional as f
from transformers import BertTokenizer

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms as T

from tqdm.notebook import trange, tqdm

import polars as pl

# Hyperparameters

In [4]:
learning_rate = 1e-4
epochs = 20
batch_size = 32
max_len = 128

# Importing Dataset

In [5]:
training_data = pl.read_csv('./data/Latest_train.csv')

In [6]:
dataset = pl.read_csv('./data/final_dataset.csv')

In [7]:
print(dataset)

shape: (3_999_998, 2)
┌───────────┬─────────────────────────────────┐
│ sentiment ┆ review                          │
│ ---       ┆ ---                             │
│ i64       ┆ str                             │
╞═══════════╪═════════════════════════════════╡
│ 1         ┆ I'm reading a lot of reviews s… │
│ 1         ┆ This soundtrack is my favorite… │
│ 1         ┆ I truly like this soundtrack a… │
│ 1         ┆ If you've played the game, you… │
│ 1         ┆ I am quite sure any of you act… │
│ …         ┆ …                               │
│ 0         ┆ We bought this Thomas for our … │
│ 0         ┆ My son recieved this as a birt… │
│ 0         ┆ I bought this toy for my son w… │
│ 1         ┆ This is a compilation of a wid… │
│ 0         ┆ This DVD will be a disappointm… │
└───────────┴─────────────────────────────────┘


In [8]:
len(dataset)

3999998

### using tokenizer

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
tokens = tokenizer(
            dataset[10, 1], 
            padding='max_length', 
            truncation=True, 
            max_length=128,  
            return_tensors='pt'
        )

print(tokens['input_ids'].squeeze(0))

tensor([  101,  2023,  2001,  1037,  2307,  2338,  1010,  1045,  2074,  2071,
         2025,  2404,  2009,  2091,  1010,  1998,  2071,  2025,  3191,  2009,
         3435,  2438,  1012,  2879,  2054,  1037,  2338,  1996,  9792,  1998,
         4332,  1999,  2023,  2074,  7906,  2017, 16986,  1998,  5782,  2000,
         2113,  2054,  2003,  2183,  2000,  4148,  2279,  1012,  2023,  2338,
         3084,  2017,  2991,  1999,  2293,  1998,  2064,  3684,  2017,  2039,
         1010,  2009,  2064,  2036,  2191,  2017,  2061,  4963,  2100,  1012,
         2023,  2338,  2064,  2191,  2017,  2175, 16215, 22494,  2195,  1997,
         2115,  6699,  1012,  2023,  2003,  1037,  4248,  3191,  7472,  1012,
         2009,  2003,  2242,  2008,  2017,  2097,  2215,  2000,  2203,  2115,
         2154,  2125,  2007,  2065,  2017,  3191,  2012,  2305,  1012,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

### Converting the Datasets to Pytorch Datasets

In [11]:
config = {
    'train' : './data/Latest_train.csv',
    'test' : './data/Latest_test.csv'
}

class ReviewDataset(Dataset):
    def __init__(self, train = True):
        super().__init__()
        self.data = pl.read_csv(config['train'] if train else config['test'])
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        
        
    def __getitem__(self, index):
        text = self.data[index, 1]
        label = self.data[index, 0]
        
        tokens = self.tokenizer(
            text, 
            padding='max_length', 
            truncation=True, 
            max_length=128,
            return_tensors='pt'
        )
        
        return (tokens['input_ids'].squeeze(0), label)
    
    def __len__(self):
        return len(self.data)

In [12]:
data_train = ReviewDataset(train=True)
print(data_train[10])
print(len(data_train))

(tensor([  101,  2023,  2001,  1037,  2307,  2338,  1010,  1045,  2074,  2071,
         2025,  2404,  2009,  2091,  1010,  1998,  2071,  2025,  3191,  2009,
         3435,  2438,  1012,  2879,  2054,  1037,  2338,  1996,  9792,  1998,
         4332,  1999,  2023,  2074,  7906,  2017, 16986,  1998,  5782,  2000,
         2113,  2054,  2003,  2183,  2000,  4148,  2279,  1012,  2023,  2338,
         3084,  2017,  2991,  1999,  2293,  1998,  2064,  3684,  2017,  2039,
         1010,  2009,  2064,  2036,  2191,  2017,  2061,  4963,  2100,  1012,
         2023,  2338,  2064,  2191,  2017,  2175, 16215, 22494,  2195,  1997,
         2115,  6699,  1012,  2023,  2003,  1037,  4248,  3191,  7472,  1012,
         2009,  2003,  2242,  2008,  2017,  2097,  2215,  2000,  2203,  2115,
         2154,  2125,  2007,  2065,  2017,  3191,  2012,  2305,  1012,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0]

In [13]:
data_test = ReviewDataset(train=False)
print(data_test[10])
print(len(data_test))

(tensor([  101,  2065,  2017,  2215,  2000,  4952,  2000,  3449,  3804,  1010,
         2059,  2009,  2003,  2488,  2065,  2017,  2031,  3229,  2000,  2010,
         6457,  1010,  2023,  2003,  2025,  2032,  1010,  2009,  2003,  1037,
        21025,  7382,  6799,  1010,  2200,  2092, 23339,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0]

In [14]:
dataset_train = DataLoader(data_train, batch_size=16, shuffle=True)
dataset_test = DataLoader(data_test, batch_size=16, shuffle=True)

In [15]:
training_data[0, 1].split()

["I'm",
 'reading',
 'a',
 'lot',
 'of',
 'reviews',
 'saying',
 'that',
 'this',
 'is',
 'the',
 'best',
 "'game",
 "soundtrack'",
 'and',
 'I',
 'figured',
 'that',
 "I'd",
 'write',
 'a',
 'review',
 'to',
 'disagree',
 'a',
 'bit.',
 'This',
 'in',
 'my',
 'opinino',
 'is',
 'Yasunori',
 "Mitsuda's",
 'ultimate',
 'masterpiece.',
 'The',
 'music',
 'is',
 'timeless',
 'and',
 "I'm",
 'been',
 'listening',
 'to',
 'it',
 'for',
 'years',
 'now',
 'and',
 'its',
 'beauty',
 'simply',
 'refuses',
 'to',
 'fade.The',
 'price',
 'tag',
 'on',
 'this',
 'is',
 'pretty',
 'staggering',
 'I',
 'must',
 'say,',
 'but',
 'if',
 'you',
 'are',
 'going',
 'to',
 'buy',
 'any',
 'cd',
 'for',
 'this',
 'much',
 'money,',
 'this',
 'is',
 'the',
 'only',
 'one',
 'that',
 'I',
 'feel',
 'would',
 'be',
 'worth',
 'every',
 'penny.']

# Building Vocab

In [16]:
CONTEXT_SIZE = 2

test_sentence = training_data[0, 1].split()

ngrams = [
    (
        [test_sentence[i - j - 1] for j in range(CONTEXT_SIZE)],
        test_sentence[i]
    )
    for i in range(CONTEXT_SIZE, len(test_sentence))
]
# Print the first 3, just so you can see what they look like.
print(ngrams[:3])

vocab = set(test_sentence)
print(len(vocab))
word_to_ix = {word: i for i, word in enumerate(vocab)}

[(['reading', "I'm"], 'a'), (['a', 'reading'], 'lot'), (['lot', 'a'], 'of')]
70


In [34]:
tokenizer = get_tokenizer('basic_english')

In [36]:
def yield_tokens(data_iter):
    for i in range(len(data_iter)):
        yield tokenizer(data_iter[i, 1])
    
vocab = build_vocab_from_iterator(
    yield_tokens(training_data),
    min_freq = 2
)

In [37]:
print(len(vocab))

489763
