### Load Data

In [1]:
# TODO: remove before production. The two lines below install a global "ignore"
# filter, so every Python warning (deprecations included) is silenced from here on.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os

# Project root used to locate data files. Set the MY_MLOPS_PROJECT_PATH
# environment variable to point the notebook at another checkout; the original
# location stays as the fallback so existing setups keep working unchanged.
PROJECT_PATH = os.environ.get("MY_MLOPS_PROJECT_PATH", "/Users/vinay/Projects/my-mlops/")

In [19]:
import pandas as pd
from datasets import Dataset
import os

# Resolve the sample CSV under <project>/data, load it, and preview the first rows.
csv_path = os.path.join(PROJECT_PATH, "data", "IMDB-Dataset_100.csv")
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [20]:
# Fixed seed so the shuffle behind the 70/30 split picks the same train/test rows
# on every Restart & Run All; without it each run trains and evaluates on
# different examples and results cannot be compared.
SPLIT_SEED = 42

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 70
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 30
    })
})

In [21]:
# Class balance check: how many reviews carry each sentiment value.
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts

sentiment
negative    58
positive    42
Name: count, dtype: int64

In [22]:
# Sentiment string -> integer class id, plus the reverse lookup derived from it
# so the two mappings cannot drift apart.
label2id = {'negative': 0, 'positive': 1}
id2label = {class_id: name for name, class_id in label2id.items()}

def _sentiment_to_label(example):
    """Return the integer class id for one example's sentiment string."""
    return {'label': label2id[example['sentiment']]}


# Adds an integer 'label' field to every example, next to the 'sentiment' text.
dataset = dataset.map(_sentiment_to_label)

Map: 100%|██████████| 70/70 [00:00<00:00, 12606.32 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 9262.36 examples/s]


In [23]:
# Show the first training record to confirm the review text, the sentiment
# string and the integer label are all present.
train_split = dataset['train']
train_split[0]

{'review': "I sure would like to see a resurrection of a up dated Seahunt series with the tech they have today it would bring back the kid excitement in me.I grew up on black and white TV and Seahunt with Gunsmoke were my hero's every week.You have my vote for a comeback of a new sea hunt.We need a change of pace in TV and this would work for a world of under water adventure.Oh by the way thank you for an outlet like this to view many viewpoints about TV and the many movies.So any ole way I believe I've got what I wanna say.Would be nice to read some more plus points about sea hunt.If my rhymes would be 10 lines would you let me submit,or leave me out to be in doubt and have me to quit,If this is so then I must go so lets do it.",
 'sentiment': 'positive',
 'label': 1}

### Data Tokenization

In [24]:
from transformers import AutoTokenizer
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Identifier of the pretrained checkpoint the tokenizer is loaded from.
model_ckpt = "huawei-noah/TinyBERT_General_4L_312D"
# use_fast=True asks for the fast tokenizer implementation for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

In [25]:
def tokenize(batch, max_length=300):
    """Tokenize the ``review`` texts of a batch with the module-level tokenizer.

    Args:
        batch: Mapping with a ``'review'`` entry holding the text(s) to encode.
        max_length: Upper bound on tokens per review, enforced through
            truncation. Defaults to 300, the value that was previously
            hard-coded, so existing single-argument calls behave the same.

    Returns:
        The tokenizer's output for the batch, passed through unchanged.
    """
    # padding=True pads to the longest sequence within this call, so the padded
    # width depends on which reviews are passed in together.
    return tokenizer(batch['review'], padding=True, truncation=True, max_length=max_length)

# batched=True passes tokenize lists of reviews rather than single examples;
# batch_size=None sends each split through as one batch, so the padding=True in
# tokenize pads every row of a split to that split's longest (truncated) review.
dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map: 100%|██████████| 70/70 [00:00<00:00, 3309.08 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 4131.51 examples/s]


In [29]:
import evaluate
import numpy as np

# Load the "accuracy" metric through the evaluate library; compute_matrics
# calls its .compute() with predicted and reference labels.
accuracy = evaluate.load("accuracy")

def compute_matrics(eval_pred):
    """Score predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one row
            of per-class scores per example; ``labels`` holds the true class ids.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row.

    Note:
        The function name keeps its original spelling so existing references
        to it continue to work.
    """
    scores, true_labels = eval_pred
    # Pick the highest-scoring class along axis 1 for every example.
    predicted_labels = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=predicted_labels, references=true_labels)
    return result