### 1. Reading the dataset

In [1]:
import pandas as pd
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### 2. Data Processsing and Tokenization 

In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [3]:
def process_data(row):

    text = row['review']
    
    text = str(text)
    
    # cleaning the data of white sapces 
    text = ' '.join(text.split())
    
    # tokenization
    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=256)
    # max_lenth is the usd to insure uniform input length greater the better
    
    
    label = 0
    if row['sentiment'] == 'positive':
        label += 1

        #adding metadata
    encodings['label'] = label
    encodings['text'] = text

    return encodings

In [4]:
# Checking if function is working perfectly

print(process_data({
    'review': 'this is a sample review of a movie.',
    'sentiment': 'positive'
}))

{'input_ids': [101, 2023, 2003, 1037, 7099, 3319, 1997, 1037, 3185, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [5]:
# forming batches and passing it to process_data function

processed_data = []

for i in range(len(df[:1000])):
    processed_data.append(process_data(df.iloc[i]))

### 3. Generating Processed dataset

In [6]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [7]:
#splitting the data frame into training and testing sets
from sklearn.model_selection import train_test_split

new_df = pd.DataFrame(processed_data)

train_df, valid_df = train_test_split(
    new_df,
    test_size=0.2,
    random_state=2022
)

In [8]:
# using pyarrow we need not to define epoch and batch sizes

import pyarrow as pa
from datasets import Dataset

train_hg = Dataset(pa.Table.from_pandas(train_df))
valid_hg = Dataset(pa.Table.from_pandas(valid_df))

### 4. Creating a model

In [9]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./result", 
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hg,
    eval_dataset=valid_hg,
    tokenizer=tokenizer
)



### 5. Trainning and Evaluating the model

In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.366552
2,No log,0.444749
3,No log,0.500334


TrainOutput(global_step=300, training_loss=0.24271324157714844, metrics={'train_runtime': 371.8792, 'train_samples_per_second': 6.454, 'train_steps_per_second': 0.807, 'total_flos': 315733266432000.0, 'train_loss': 0.24271324157714844, 'epoch': 3.0})

In [12]:
trainer.evaluate()

{'eval_loss': 0.5003339648246765,
 'eval_runtime': 5.4387,
 'eval_samples_per_second': 36.774,
 'eval_steps_per_second': 4.597,
 'epoch': 3.0}

In [24]:
# Saving the trained model

model.save_pretrained('./model/')

### 6. Loading the model

In [30]:
import torch
from transformers import AutoModelForSequenceClassification

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') this doesnt work with apple silicon M1/M2

device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')

new_model = AutoModelForSequenceClassification.from_pretrained('./model/').to(device)

In [31]:
from transformers import AutoTokenizer

new_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

### 7. Get predictions

In [45]:
import torch
import numpy as np

def get_prediction(text):
    encoding = new_tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

    outputs = new_model(**encoding)

    logits = outputs.logits
    device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
    sigmoid = torch.nn.Sigmoid()
#     print(sigmoid)
    probs = sigmoid(logits.squeeze().cpu())
    probs = probs.detach().numpy()
    label = np.argmax(probs, axis=-1)
    
    if label == 1:
        return {
            'sentiment': 'Positive',
            'probability': probs[1]
        }
    else:
        return {
            'sentiment': 'Negative',
            'probability': probs[0]
        }


In [46]:
get_prediction('I liked the movie and it made me happy.')

{'sentiment': 'Positive', 'probability': 0.90790665}

In [47]:
get_prediction('I loved the movie and it made me really very happy.')

{'sentiment': 'Positive', 'probability': 0.9132571}

In [51]:
get_prediction('The movie was not that great i didnt like it but i loved the characters the way they were represented and loved the acting')

{'sentiment': 'Positive', 'probability': 0.9352621}