In [3]:
import torch
from torch import nn, optim

import pandas as pd
from os import cpu_count
from tqdm import tqdm

## Setting Default Device

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device.type

'cuda'

## Downloading the Dataset

In [5]:
# We have to unzip the dataset: 'reviews_Cell_Phones_and_Accessories_5.json.gz'
import gzip
from pathlib import Path
import shutil

# Setting the path of the zip file
zip_path = Path("/content/reviews_Cell_Phones_and_Accessories_5.json.gz")
dest_path = Path("/content/reviews_Cell_Phones_and_Accessories_5.json")

if not dest_path.is_file():
    with gzip.open(zip_path, "rb") as zip_ref:
        print(f"[INFO] Unzipping dataset `{zip_path}` to `{dest_path}`...")
        with open(dest_path, "wb") as un_zip_ref:
            shutil.copyfileobj(zip_ref, un_zip_ref)

    print(f"[INFO] Dataset succesfully downloaded to `{dest_path}`...")
else:
    print(f"[INFO] Dataset `{dest_path}` alerady exists...")

[INFO] Dataset `/content/reviews_Cell_Phones_and_Accessories_5.json` alerady exists...


## Seeing the Dataset

In [6]:
df = pd.read_json(dest_path, lines=True)

df.head(3)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"


## Preprocess the Dataset

In [7]:
ps = pd.Series(df["reviewText"])

ps[0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

In [24]:
ps = ps[: len(ps) // 2]

len(ps)

48609

In [26]:
zero_length_revies_index = [i for i in range(len(ps)) if len(ps[i]) == 0]

non_zero_ps = ps.drop(zero_length_revies_index)
non_zero_ps

0        They look good and stick good! I just don't li...
1        These stickers work like the review says they ...
2        These are awesome and make my phone look so st...
3        Item arrived in great time and was in perfect ...
4        awesome! stays on, and looks great. can be use...
                               ...                        
48604    This charger works but it doesn't charge the b...
48605    This charger works very well once you understa...
48606    This is a handy device as you can use it to ch...
48607    I love this charger... it charges almost every...
48608    Not sure what the additional 2 buttons are for...
Name: reviewText, Length: 48579, dtype: object

In [27]:
from string import punctuation


def tokenize(sentence):
    # Creating the lookup table to replace all punctuation with spaces (' ')
    translation_dict = {i: ord(" ") for i in [ord(x) for x in punctuation]}

    # Replacing all punctuation with spaces
    unpuctuate_sentence = sentence.translate(translation_dict)

    # Creating the tokanization of the sentence
    tokenized_list = [word.lower() for word in unpuctuate_sentence.split(" ") if (len(word) > 1 or word == 'I')]

    return tokenized_list

In [28]:
tokenized_ps = non_zero_ps.apply(tokenize)

tokenized_ps

0        [they, look, good, and, stick, good, i, just, ...
1        [these, stickers, work, like, the, review, say...
2        [these, are, awesome, and, make, my, phone, lo...
3        [item, arrived, in, great, time, and, was, in,...
4        [awesome, stays, on, and, looks, great, can, b...
                               ...                        
48604    [this, charger, works, but, it, doesn, charge,...
48605    [this, charger, works, very, well, once, you, ...
48606    [this, is, handy, device, as, you, can, use, i...
48607    [i, love, this, charger, it, charges, almost, ...
48608    [not, sure, what, the, additional, buttons, ar...
Name: reviewText, Length: 48579, dtype: object

## Creating the Vocabulary

In [29]:
# vocab = set()
# for tokenized_sentence in tokenized_ps:
#     for word in tokenized_sentence:
#         vocab.add(word)

# vocab = list(vocab)

# The same code as above can be written (for performance increment)
vocab = list(set(word for tokenized_sentence in tokenized_ps for word in tokenized_sentence))
vocab_idx = {vocab[i] : i for i in range(len(vocab))}

vocab_idx

{'mimics': 0,
 '16did': 1,
 'cdma': 2,
 'tradtional': 3,
 'immediatly': 4,
 'mishaps': 5,
 'administrators': 6,
 'san': 7,
 'sosteni': 8,
 'multifunctional': 9,
 'originial': 10,
 'trusty': 11,
 'motoq': 12,
 'gus': 13,
 'smidgen': 14,
 'swirly': 15,
 'origins': 16,
 'fetish': 17,
 'fitmentcons': 18,
 'naught': 19,
 'audiothings': 20,
 'bicyclist': 21,
 'interlocutor': 22,
 'cheapera': 23,
 'muzac': 24,
 'aobut': 25,
 'downsides': 26,
 'goers': 27,
 'principal': 28,
 'subside': 29,
 'globalsat': 30,
 'labeling': 31,
 'vase': 32,
 'echo': 33,
 'facebooking': 34,
 'qwest': 35,
 'thumps': 36,
 'chews': 37,
 'excellentpowergen': 38,
 'cleanest': 39,
 'royally': 40,
 'razrs': 41,
 'amaizing': 42,
 'soldiers': 43,
 'qualitywe': 44,
 'telemarketers': 45,
 'compressing': 46,
 'admire': 47,
 'robustly': 48,
 'kickbacks': 49,
 '6700mah': 50,
 'snake': 51,
 'utilizar': 52,
 'knock': 53,
 'system': 54,
 'sudo': 55,
 'comprehension': 56,
 'impartial': 57,
 'cumbersomegoogle': 58,
 'routing': 59,
 '

## Creating One-Hot Encoding

In [31]:
def one_hot_encoding(vocab:list, word: str) -> torch.Tensor:
    important_index = vocab.index(word)

    return torch.tensor([1 if i == important_index else 0 for i in range(len(vocab))])

## CBOW Model

In [57]:
WINDOW_SIZE = 10
EMB_SIZE = 30

In [58]:
# Converting a tokenize sentence in training samples
def generate_samples(tokenized_sentence: list, window_size: int) -> list:
    samples = []

    for i in range(len(tokenized_sentence) - window_size):
        if len(tokenized_sentence[i+1: i+1+window_size]) == window_size:
            samples.append((" ".join(tokenized_sentence[i+1: i+1+window_size]), tokenized_sentence[i]))

    return samples


# Converting the entire reviews dataset into training samples
def generate_training_data(tokenized_ps: pd.core.series.Series, window_size: int) -> list:
    training_samples = []

    for tokenized_sentence in tokenized_ps:
        training_samples += generate_samples(tokenized_sentence, window_size)

    return training_samples

In [59]:
training_samples = generate_training_data(tokenized_ps, WINDOW_SIZE)

training_samples[90:99]

[('included free screen protector i never received one though its', 'and'),
 ('free screen protector i never received one though its not', 'included'),
 ('screen protector i never received one though its not big', 'free'),
 ('protector i never received one though its not big deal', 'screen'),
 ('i never received one though its not big deal it', 'protector'),
 ('never received one though its not big deal it would', 'i'),
 ('received one though its not big deal it would ve', 'never'),
 ('one though its not big deal it would ve been', 'received'),
 ('though its not big deal it would ve been nice', 'one')]

## Creating the Model

In [107]:
class CBOW(nn.Module):
    def __init__(self, window_size: int, vocab: str, hidden_units: int) -> None:
        super().__init__()

        self.vocab = vocab

        self.projection_layer = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=window_size*len(vocab), out_features=hidden_units)
        )

        self.embedding_layer = nn.Linear(in_features=hidden_units, out_features=hidden_units)


    def forward(self, contex_sentence: str): # x: shape=(1, window_size, vocab_size)
        print()
        print(torch.tensor([torch.LongTensor(one_hot_encoding(self.vocab, word)).to(device) for word in contex_sentence.split()]))
        x = torch.tensor([torch.LongTensor(one_hot_encoding(self.vocab, word)).to(device) for word in contex_sentence.split()]).unsqueeze(dim=0).to(device)

        return self.embedding_layer(self.projection_layer(x))

## Initializing the Model

In [108]:
cbow_model = CBOW(WINDOW_SIZE, vocab, EMB_SIZE).to(device)

cbow_model

CBOW(
  (projection_layer): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=420380, out_features=30, bias=True)
  )
  (embedding_layer): Linear(in_features=30, out_features=30, bias=True)
)

## Setting Loss Function and Optimizer

In [109]:
loss_fn = nn.CrossEntropyLoss()
opt = optim.Adam(params=cbow_model.parameters(), lr=1e-3)

## Setting Accuracy Metric

In [110]:
def accuracy_fn(logit: torch.Tensor, labels: torch.Tensor) -> float:
    pred_label_index = torch.softmax(logit, dim=1).argmax(dim=1).item()

    return (labels[0][pred_label_index].item() == 1) * 100

## Creating the Training Loop

In [111]:
def fit(model, epochs, training_samples, loss_fn, accuracy_fn, opt):
    history = []

    print("Starting Process...")

    for epoch in range(1, epochs + 1):
        epoch_loss, epoch_acc = 0, 0

        model.train()
        for sample in tqdm(training_samples):
            y_label = torch.LongTensor(one_hot_encoding(model.vocab, sample[1])).unsqueeze(dim=0)
            logit = model(sample[0])
            # loss = loss_fn(logit, y_label)

            # epoch_loss += loss.item()
            # epoch_acc += accuracy_fn(logit, y_label)

            # opt.zero_grad()
            # loss.backward()
            # opt.step()

        epoch_loss /= len(training_samples)
        epoch_acc /= len(training_samples)

        print({"epoch": epoch, "loss": epoch_loss, "acc(%)": epoch_acc})
        history.append({"epoch": epoch, "loss": epoch_loss, "acc(%)": epoch_acc})

    print("Process Successfully Completed...")

    return history

## Training the Model

In [112]:
history = fit(cbow_model, 5, training_samples, loss_fn, accuracy_fn, opt)

Starting Process...


  0%|          | 0/3975842 [00:00<?, ?it/s]







TypeError: ignored