## Importing Libraries

In [1]:
from modelling import *
from trainer import *
from dataloader import *
from modelling import SentimentClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from transformers.generation import GenerationMixin
import torch
import tensorflow as tf
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import nn, optim

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from collections import defaultdict
from textwrap import wrap

In [3]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
path = 'C:/Users/personal/Documents/Womens Clothing E-Commerce Reviews.csv'

In [5]:
df = pd.read_csv(path)

In [6]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses


#### make the numbers of ratings review to a 0-based index i.e. convert 1-5 scale to 0-4 scale

In [7]:
def map_scores(Rating):
    Rating = int(Rating)
    if Rating == 1:
        return 0
    elif Rating == 2:
        return 1
    elif Rating == 3:
        return 2
    elif Rating == 4:
        return 3
    elif Rating == 5:
        return 4

In [8]:
df['Rating'] = df.Rating.apply(map_scores)
class_names = ['1-star', '2-star', '3-star', '4-star', '5-star']

In [9]:
reviews = df['Review Text'].to_list()
targets = df['Rating'].to_list()

#### Define Transformer and Tokenizer

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#### Encoding on a sample text

In [11]:
sample_text = "NLP is a branch of artificial intelligence"
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [12]:
encoding = tokenizer.encode_plus(
    sample_text,
    max_length=32,
    truncation=True,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding=True,
    return_attention_mask=True,
    return_tensors='pt')

In [13]:
# viewing results of the output from above
encoding.keys()
encoding['input_ids'][0]
encoding['attention_mask'][0]
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

['[CLS]',
 'nl',
 '##p',
 'is',
 'a',
 'branch',
 'of',
 'artificial',
 'intelligence',
 '[SEP]']

#### Utility

In [14]:
df_train, df_test = train_test_split(df, test_size=0.3, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df, test_size=0.5, random_state=RANDOM_SEED)
print(df_train.shape, df_val.shape, df_test.shape)

(16440, 11) (11743, 11) (11743, 11)


In [15]:
# define create data loader 
train_data_loader = create_data_loader(df_train, tokenizer)

val_data_loader = create_data_loader(df_val, tokenizer, include_raw_text=True)

test_data_loader = create_data_loader(df_test, tokenizer, include_raw_text=True)

In [16]:
#for i in train_data_loader:
    #print(i.keys())
    # dict_keys(['input_ids','attention_mask','targets','review_text'])

In [17]:
data = next(iter(train_data_loader))

In [18]:
#checking the shape of the tensor data
data['input_ids'].shape

torch.Size([16, 129])

#### Model Utility Class

In [19]:
model = SentimentClassifier(n_classes=len(class_names))
model = model.to(device)

All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


In [20]:
# quickcheck
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
F.softmax(model(input_ids,attention_mask), dim=1)

tensor([[0.1248, 0.3182, 0.1873, 0.2224, 0.1474],
        [0.1251, 0.3603, 0.1441, 0.2815, 0.0891],
        [0.1465, 0.2205, 0.1685, 0.3576, 0.1069],
        [0.1528, 0.3093, 0.1688, 0.2740, 0.0951],
        [0.1684, 0.2200, 0.1427, 0.4059, 0.0630],
        [0.1610, 0.3622, 0.1325, 0.2572, 0.0870],
        [0.1221, 0.2522, 0.2066, 0.3121, 0.1070],
        [0.1227, 0.3156, 0.2003, 0.3069, 0.0544],
        [0.1091, 0.2409, 0.2063, 0.3315, 0.1122],
        [0.1652, 0.3090, 0.1995, 0.2232, 0.1031],
        [0.1516, 0.3906, 0.1743, 0.2098, 0.0736],
        [0.1415, 0.2081, 0.2560, 0.3022, 0.0922],
        [0.1699, 0.3883, 0.1613, 0.1592, 0.1214],
        [0.1613, 0.2566, 0.1549, 0.3472, 0.0800],
        [0.1937, 0.3501, 0.1774, 0.2065, 0.0722],
        [0.0994, 0.3834, 0.1626, 0.2705, 0.0841]], grad_fn=<SoftmaxBackward0>)

#### Training

In [21]:
EPOCH = 1

optimizer = optim.Adam(model.parameters(), lr=0.001)

total_steps = len(train_data_loader) * EPOCH

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps= 0, num_training_steps=total_steps)

loss_fn = nn.CrossEntropyLoss().to(device)

#### Training Loop

In [22]:
%%time
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCH):
    print(f'Epoch {epoch + 1} / {EPOCH}')
    train_acc, train_loss = train_model(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train))
    val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val))

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc == best_accuracy:
        torch.save(model.state_dict(), 'best_model.bin')
        best_accuracy = val_acc

Epoch 1 / 1


  nn.utils.clip_grad_norm(model.parameters(), max_norm= 1.0)


#### Model evaluation on test_data_loader

In [None]:
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(model, test_data_loader)

#### Testing on actual review

In [None]:
review_text = "I could share photos seamlessly with this app".

encoded_review = tokenizer.encode_plus(
    review_text,                                   
    max_length=32,
    truncation=True,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding=True,
    return_attention_mask=True,
    return_tensors='pt')

input_ids = encoded_review['input_ids'].to(device)

attention_mask = encoded_review['attention_mask'].to(device)

output = model(input_ids, attention_mask)

_, prediction = torch.max(output, dim=1)

print(f'Review text: {review_text}')
print(f'Sentiment: {class_names[prediction]}')