## Comparision of BERT pre-trained model for sentiment analysis with and without finetuning

In [1]:
from transformers import pipeline, BertTokenizer,BertModel,BertForSequenceClassification

In [2]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sentiment_classifier = pipeline(task='text-classification',model=model,tokenizer=tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##### using pretrained model on clear postive and negative sentiment statements

In [3]:
print(sentiment_classifier("This is worst movie"))
print(sentiment_classifier("This is very good movie"))

[{'label': 'LABEL_1', 'score': 0.45943698287010193}]
[{'label': 'LABEL_1', 'score': 0.47597670555114746}]


##### even for very clear sentiments, we can see very low scores and both still belong to same classification as per label

In [4]:
# !pip install -qq datasets

##### Will use imbd reviews dataset to finetune the model for sentiment analysis

In [5]:
from datasets import load_dataset

In [6]:
dataset = load_dataset('imdb')

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [8]:
import pandas as pd
pd.DataFrame({"text":dataset["train"]["text"],"label":dataset["train"]["label"]}).head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


###### label 0 for negative sentiment
###### label 1 for positive sentiment

In [10]:
sentences = ["this is to test tokenization","This is another example. Bit lengthy than before one"]

In [11]:
encoded_out = tokenizer(sentences,padding=True,truncation=True)

In [12]:
encoded_out

{'input_ids': [[101, 2023, 2003, 2000, 3231, 19204, 3989, 102, 0, 0, 0, 0], [101, 2023, 2003, 2178, 2742, 1012, 2978, 12401, 2084, 2077, 2028, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

##### sample of tokenized output on two inputs, 
##### fields in output are
##### input_ids = > each corresponding to token of input
##### token_type_ids => varies if case seperator occurs in input
##### attention_mask => specifies data which model should attend/consider, in case of varying lenghts, with padding enables, makes all input tokens of same length with added paddings, where padded is masked as 0


In [13]:
tokenizer.decode(encoded_out["input_ids"][1])

'[CLS] this is another example. bit lengthy than before one [SEP]'

In [14]:
tokenizer.decode(encoded_out["input_ids"][0])

'[CLS] this is to test tokenization [SEP] [PAD] [PAD] [PAD] [PAD]'

function to get process dataset to create above fields in original dataset

In [15]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [16]:
tokenized_dataset = dataset.map(tokenize_function,batched=True)

In [17]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

##### Preparing for training/finetuning 

In [18]:
from transformers import Trainer,TrainingArguments,DataCollatorForTokenClassification

In [19]:
args = TrainingArguments("test-classifier",
                         eval_strategy='epoch',
                         learning_rate=2e-5,
                         per_device_train_batch_size=16,
                         per_device_eval_batch_size=16,
                         num_train_epochs=3,
                         weight_decay=0.01,
                        )
data_collator = DataCollatorForTokenClassification(tokenizer)

In [20]:
# !pip install -qq evaluate
# !pip install -qq seqeval
import numpy as np
import evaluate
import numpy as np
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [21]:
# to reduce training resources/time, selcting randomly sampled dataset
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(1000))

In [22]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [23]:
trainer.train()

#### Using the finetune model specifing the path of above configured directory (test-classifier)

In [24]:
model = BertForSequenceClassification.from_pretrained('test-classifier/checkpoint-189')

In [25]:
sentiment_classifier = pipeline(task='text-classification',model=model,tokenizer=tokenizer)

In [26]:
print(sentiment_classifier("This is worst movie"))
print(sentiment_classifier('This is movie is worth watchiing.'))
print(sentiment_classifier('This movie of that level where i get to sleep immediately'))


[{'label': 'LABEL_0', 'score': 0.8362260460853577}]
[{'label': 'LABEL_1', 'score': 0.8860337734222412}]
[{'label': 'LABEL_1', 'score': 0.8876245617866516}]


#### Now that model is able to classify clear negative(label 0) and positive(label 1) statement with clear scores.
#### But sarcastic statement is still mixing up 

#### Will retrain model with few more data with addtional set with sarcastice reviews/statements as well

In [27]:
# !pip install -qq openpyxl

In [28]:
import pandas as pd
sarcastic_df = pd.read_excel('sarcastic_dataset.xlsx')

In [29]:
from datasets import Dataset
sarcastic_df = Dataset.from_pandas(sarcastic_df)
sarcastic_tokenized = sarcastic_df.map(tokenize_function)

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [30]:
sarcastic_tokenized

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 60
})

In [31]:
tiny_train_dataset = sarcastic_tokenized.shuffle(seed=42).train_test_split(test_size=0.2,seed=42)["train"]
tiny_eval_dataset = sarcastic_tokenized.shuffle(seed=42).train_test_split(test_size=0.2,seed=42)["test"]

In [32]:
sarcastic_tokenized.shuffle(seed=42).train_test_split(test_size=0.2,seed=42)["train"]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 48
})

In [33]:
from transformers import Trainer,TrainingArguments,DataCollatorForTokenClassification
args = TrainingArguments("sarcastic-classifier",
                         eval_strategy='epoch',
                         learning_rate=2e-5,
                         per_device_train_batch_size=16,
                         per_device_eval_batch_size=16,
                         num_train_epochs=3,
                         weight_decay=0.01,
                        )
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tiny_train_dataset,
    eval_dataset=tiny_eval_dataset,
    compute_metrics=compute_metrics,
)

In [34]:
trainer.train()

In [35]:
retrained_model = BertForSequenceClassification.from_pretrained('sarcastic-classifier/checkpoint-9')

In [36]:
sentiment_classifier = pipeline(task='text-classification',model=retrained_model,tokenizer=tokenizer)

In [37]:
print(sentiment_classifier("This is worst movie"))
print(sentiment_classifier('This is movie is worth watchiing.'))
print(sentiment_classifier('This movie of that level where i get to sleep immediately'))

[{'label': 'LABEL_0', 'score': 0.8150907158851624}]
[{'label': 'LABEL_1', 'score': 0.5845213532447815}]
[{'label': 'LABEL_0', 'score': 0.7711818218231201}]


#### now with more finetuning, i can see model is able to detect/classify sarcastic statement aswell and also score of sceond statement is reduced as per it's sentiment indicators