#**2b_BERT candidates replacement Twitter/Amazon**

This notebook is used for BERT that will decide whether to replace the slang word.
- https://www.kaggle.com/code/tientd95/bert-model-for-anwsering-toeic-reading-test/notebook
- https://github.com/graykode/toeicbert/tree/master

## Libraries

In [1]:
!pip install datasets transformers==4.28.0
!pip install --upgrade accelerate
!pip install cchardet
!pip install -U pytorch-pretrained-bert;

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from da

In [2]:
import pandas as pd
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from tqdm import tqdm
import pickle
import copy

## Parameters for the notebook

In [3]:
pd.set_option('display.max_colwidth', None)
# TO RUN LOCALLY OR ON COLAB
# is_local = True
is_local = False

# TO USE SMALL SLANG OR BIGGER ONE
use_small_slang = True
# use_small_slang = False

# TO SET UP BERT CLASS TO USE PROGRESSIVE APPROACH FOR SLANG REPLACEMENT
use_bert_progressively = False
# use_bert_progressively = True
'''
use_bert_progressively

    If set to True then BERT class will incorporate its previous decision on
    slang replacement for the next candidate replacement.
    That is if we have a text "I luv u" and BERT replaces "luv" with "love" then
    to evaluate whether 'u' should be replaced as well BERT will use "I love u" and not "I luv u" as a context.
'''

def get_file_name(dataset_name):
    file_prefix = f'{dataset_name}_after_bert_small' if use_small_slang else f'{dataset_name}_after_bert_big'
    file_suffix = "_progressive_local" if use_bert_progressively and is_local else "_progressive_colab" if use_bert_progressively else "_local" if is_local else "_colab"
    file_name = file_prefix + file_suffix
    return file_name

## Reading in cleaned data

In [4]:
if is_local == False:
    from google.colab import drive
    drive.mount('/content/drive/')
    path_data = "/content/drive/My Drive/NLP_PROJECT/data/"
else:
    path_data = "D:/Google Drive/NLP_PROJECT/data/"

# datasets processed for bert
if use_small_slang:
    with open(path_data + 'BERT_data/amazon_for_bert_small.pkl', 'rb') as handle:
        dict_amazon_for_bert = pickle.load(handle)
    with open(path_data + 'BERT_data/twitter_for_bert_small_400k.pkl', 'rb') as handle:
        dict_twitter_for_bert = pickle.load(handle)
else:
    with open(path_data + 'BERT_data/amazon_for_bert_big.pkl', 'rb') as handle:
        dict_amazon_for_bert = pickle.load(handle)
    with open(path_data + 'BERT_data/twitter_for_bert_big.pkl', 'rb') as handle:
        dict_twitter_for_bert = pickle.load(handle)

Mounted at /content/drive/


## Limit the number of rows for the data set
We are solely interested in texts with replacement words in it. Therefore, we remove texts without any slang/chat word candidates in it. As the data set is still way too large for our computation power to process this amount of data, we reduce the dataset even further, while still considering its underlying structure and distribution of sentiments (binary target values for Twitter or scores for Amazon dataset).

In [5]:
# keep only tweets/reviews that have some candidates detected
print("Length of amazon before filtering: ", len(dict_amazon_for_bert))
dict_amazon_for_bert = {key: value for key, value in dict_amazon_for_bert.items() if len(value['candidates']) > 0}
print("Length of amazon after filtering: ", len(dict_amazon_for_bert))

print("Length of twitter before filtering: ", len(dict_twitter_for_bert))
dict_twitter_for_bert = {key: value for key, value in dict_twitter_for_bert.items() if len(value['candidates']) > 0}
print("Length of twitter after filtering: ", len(dict_twitter_for_bert))

Length of amazon before filtering:  393568
Length of amazon after filtering:  128077
Length of twitter before filtering:  400000
Length of twitter after filtering:  75312


In [6]:
# def limit_dict_size(dictionary, limit):
#     return {k: dictionary[k] for k in list(dictionary.keys())[:limit]}

def check_dataset_distribution(dataset, proportion):
    df = pd.DataFrame.from_dict(dataset, orient='index')
    unique_counts = df['true_sentiment'].value_counts()
    print("Distribution of values: \n", unique_counts)
    proportion
    print("\nTotal count: ", round(sum(unique_counts * proportion), 0))
    max_counts = list((round(unique_counts * proportion, 0)))
    print("\nModified distribution of values: \n", max_counts)
    return max_counts

def limit_dataset(dictionary, m_counts, use_amazon):
    copied_dict = {}
    if use_amazon:
      sentiment_counts = {5: m_counts[0], 4: m_counts[1], 3: m_counts[2], 2: m_counts[3], 1: m_counts[4]}
    else:
      sentiment_counts = {1: m_counts[0], 0: m_counts[1]}
    for key, value in dictionary.items():
        value_sentiment = value['true_sentiment']
        if value_sentiment in sentiment_counts and sentiment_counts[value_sentiment] > 0:
            copied_dict[key] = value
            sentiment_counts[value_sentiment] -= 1
        if all(count == 0 for count in sentiment_counts.values()):
            break
    return copied_dict

def process_dataset(dataset, max_distribution, use_amazon=False):
    max_counts = check_dataset_distribution(dataset, max_distribution)
    print("\n-------------------------------------------")
    dataset = limit_dataset(dataset, max_counts, use_amazon)
    print("\nTotal count: ", len(dataset))
    df = pd.DataFrame.from_dict(dataset, orient='index')
    unique_counts = df['true_sentiment'].value_counts()
    print("\nModified distribution of values: \n", unique_counts)
    return dataset

In [7]:
# Process the Amazon dataset
dict_amazon_for_bert = process_dataset(dict_amazon_for_bert, 0.45, use_amazon=True)

Distribution of values: 
 5    80283
4    19377
1    11211
3    10317
2     6889
Name: true_sentiment, dtype: int64

Total count:  57635.0

Modified distribution of values: 
 [36127.0, 8720.0, 5045.0, 4643.0, 3100.0]

-------------------------------------------

Total count:  57635

Modified distribution of values: 
 5    36127
4     8720
3     5045
2     4643
1     3100
Name: true_sentiment, dtype: int64


In [8]:
# Process the Twitter dataset
dict_twitter_for_bert = process_dataset(dict_twitter_for_bert, 0.75, use_amazon=False)

Distribution of values: 
 0    37977
1    37335
Name: true_sentiment, dtype: int64

Total count:  56484.0

Modified distribution of values: 
 [28483.0, 28001.0]

-------------------------------------------

Total count:  56484

Modified distribution of values: 
 1    28483
0    28001
Name: true_sentiment, dtype: int64


## BERT implementation for deciding on slang replacement

As already introduced in notebook 2a, in order to decide whether some word should be replaced with its translation or not its almost impossible to write a hard coded script that would not introduce errors in the text.
Thus, we decided to use BERT to decide whether to replace the slang word or not.
It should help use in cases where context matters so if we have a sentence like "it is the best day of my life". We should not replace the word "it" with "information technology" in the above stated sentences because it would not make sense context wise and grammar wise.

In [9]:
class SlangBert():
    """
    Model using pretrained Bert for picking the best candidate for slang word replacement.
    Model returns the word for the text with gap based on the highest probability.
    """
    def __init__(self, bertmodel):
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        print("device: ", self.device)
        self.bertmodel = bertmodel
        # Initial tokenizer to tokenize the question later
        self.tokenizer = BertTokenizer.from_pretrained(self.bertmodel)
        self.model = BertForMaskedLM.from_pretrained(self.bertmodel).to(self.device)
         # We used pretrained BertForMaskedLM to fill in the blank, do not fine tuning so we set model to eval
        self.model.eval()

    def get_score(self,question_tensors, segment_tensors, masked_index, candidate):
        # Tokenize the answer candidate
        # if isinstance(candidate, float): print("CANDIDATE: ", candidate)
        candidate_tokens = self.tokenizer.tokenize(str(candidate))
        # After tokenizing, we convert token to ids, (word to numerical)
        candidate_ids = self.tokenizer.convert_tokens_to_ids(candidate_tokens)
        predictions = self.model(question_tensors, segment_tensors)
        predictions_candidates = predictions[0,masked_index, candidate_ids].mean()
        return predictions_candidates.item()

    def predict(self,row):
        if row['text_to_check'].count('___') > 1:
            raise ValueError("More than 1 ___ in sentence_to_check")
        # Tokenizing questions, convert '___' to '_' so that we can MASK it
        question_tokens = self.tokenizer.tokenize(row['text_to_check'].replace('___', '_'))
        if len(question_tokens) > 500:
            # max is 512
            # print("Token indices sequence length is longer than the specified maximum sequence length for this BERT model")
            return '', ''
        masked_index = question_tokens.index('_')
        # Assign [MASK] to blank that need to be completed
        question_tokens[masked_index] = '[MASK]'
        segment_ids = [0] * len(question_tokens)
        segment_tensors = torch.tensor([segment_ids]).to(self.device)
        question_ids = self.tokenizer.convert_tokens_to_ids(question_tokens)
        question_tensors = torch.tensor([question_ids]).to(self.device)
        candidates = row['candidates']
        # Return probabilities of answer choice in [...]
        predict_tensor = torch.tensor([self.get_score(question_tensors, segment_tensors,
                                                masked_index, candidate) for candidate in candidates])
        # Softmax the predict probability to return the index for maximum values
        predict_idx = torch.argmax(predict_tensor).item()
        return candidates[predict_idx], predict_tensor

## BERT processing of candidates

The function **decide_on_translation** replaces slang/chat words with their corresponding translations only if the SlangModel coded above predicts to replace it. Otherwise, the slang/chat word will not be replaced.

In Notebook 3 we predict the sentiment based on the final output of this notebook (replaced slang/chat words with its translation based on BERT's prediction) of a Amazon Product Review (Score: 1-5) or a tweet (Binary Target: 0,1) as well as with the inital data set, without any slang/chat word preprocessing.

In [10]:
def decide_on_translation(dict_text_for_bert, use_progressive_updated_text, model):
    df_text_after_bert = pd.DataFrame(columns=['id','true_sentiment', 'text', 'candidates',
                                               'updated_candidates', 'processed_text',
                                               'is_same_as_original', 'chosen_translation'])
    dict_text_after_bert = {}

    with tqdm(total=len(dict_text_for_bert)) as pbar:
        for index, row in dict_text_for_bert.items():
            text = row['text']
            index_shift = 0
            list_of_candidates_for_text = row['candidates'] # list of dicts
            updated_list_of_candidates_for_text = []
            updated_text_to_check = ''
            skip_this_row = False
            # print("------------------------------------------------------------------\n", "FULL text: ", text)
            # process the candidates in order that was sorted by start_index
            for candidate in list_of_candidates_for_text:
                # print("\tcandidate: ", candidate)
                if updated_text_to_check != '' and use_progressive_updated_text == True:
                    # temp = updated_text_to_check
                    updated_text_to_check = updated_text_to_check[:candidate['start_index']-index_shift] + '___' + updated_text_to_check[candidate['end_index']-index_shift:]
                    # if candidate['text_to_check'] != updated_text_to_check:
                    #     print('\index_shift: ' + str(index_shift))
                    #     print('\tupdated_text_to_check: ' + temp)
                    #     print("\tCandidate OLD text_to_check: ", candidate['text_to_check'])
                    #     print("\tCandidate NEW text_to_check: ", updated_text_to_check)
                    candidate['text_to_check'] = updated_text_to_check

                my_dict = {'slang': candidate['slang_word'],
                            'candidates': candidate['list_of_translations'],
                            'text_to_check': candidate['text_to_check']}
                predicted_word, predict_tensor = model.predict(my_dict)
                if predicted_word == '' and predict_tensor == '':
                    skip_this_row = True
                    break

                # Extract the highest and second-highest values and indices
                highest_values, _ = torch.topk(predict_tensor, k=2)
                difference = highest_values[0].item() - highest_values[1].item()

                # # print the predictions
                # for word, tensor_value in zip(my_dict['candidates'], predict_tensor.tolist()):
                #     print("\t-", "'"+word+"'", tensor_value)
                # print("\tPREDICTION: ", "'"+predicted_word+"'")
                # print("\t------------------------------------------------------------------\n")

                # update text and index_shift since start_index and end_index is always wrt to original text
                text = text[:candidate['start_index']-index_shift] + predicted_word + text[candidate['end_index']-index_shift:]
                # as long as slang_word == predicted_word, the index_shift will be the same
                index_shift = index_shift + len(candidate['slang_word']) - len(predicted_word)
                updated_text_to_check = text

                # update candidate
                if predicted_word == candidate['slang_word']:
                    processed_candidate =  candidate.copy()
                    processed_candidate['chosen_translation'] = candidate['slang_word']
                    processed_candidate['was_replaced'] = False
                    processed_candidate['final_text'] = my_dict['text_to_check'].replace('___', candidate['slang_word'])
                    processed_candidate['diff'] = difference
                else:
                    processed_candidate = candidate.copy()
                    processed_candidate['chosen_translation'] = predicted_word
                    processed_candidate['was_replaced'] = True
                    processed_candidate['final_text'] = my_dict['text_to_check'].replace('___', predicted_word)
                    processed_candidate['diff'] = difference
                updated_list_of_candidates_for_text.append(processed_candidate)


            if skip_this_row: continue
            # append a row to df
            replacement_tuples = [(candidate['slang_word'], candidate['chosen_translation'], candidate['diff']) for candidate in updated_list_of_candidates_for_text if candidate['was_replaced']]
            df_text_after_bert.loc[len(df_text_after_bert)] = {
                                                                'id': index,
                                                                'true_sentiment': row['true_sentiment'],
                                                                'text': row['text'],
                                                                'candidates': row['candidates'],
                                                                'updated_candidates': updated_list_of_candidates_for_text,
                                                                'processed_text': text,
                                                                'is_same_as_original': text == row['text'],
                                                                'chosen_translation': replacement_tuples
                                                                }
            dict_text_after_bert[index] = {
                'true_sentiment': row['true_sentiment'],
                'text': row['text'],
                'candidates': row['candidates'],
                'updated_candidates': updated_list_of_candidates_for_text,
                'processed_text': text,
                'is_same_as_original': text == row['text'],
                'chosen_translation': replacement_tuples

            }
            pbar.update(1)
    return df_text_after_bert, dict_text_after_bert

In [11]:
model = SlangBert('bert-large-uncased')

device:  cuda


100%|██████████| 231508/231508 [00:00<00:00, 422212.35B/s]
100%|██████████| 1248501532/1248501532 [01:11<00:00, 17433440.50B/s]


In [None]:
# # execution for amazon
# df_amazon_after_bert, dict_amazon_after_bert = decide_on_translation(dict_amazon_for_bert, use_progressive_updated_text = use_bert_progressively, model = model)

# # save df and dict
# df_amazon_after_bert.to_csv(path_data + 'BERT_data/'+ get_file_name('amazon') + '.csv', index=False)
# with open(path_data + 'BERT_data/'+ get_file_name('amazon') + '.pkl', 'wb') as handle:
#     pickle.dump(dict_amazon_after_bert, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# execution for twitter
df_twitter_after_bert, dict_twitter_after_bert = decide_on_translation(dict_twitter_for_bert, use_progressive_updated_text = use_bert_progressively, model = model)

# save df and dict
df_twitter_after_bert.to_csv(path_data + 'BERT_data/'+ get_file_name('twitter') + '.csv', index=False)
with open(path_data + 'BERT_data/'+ get_file_name('twitter') + '.pkl', 'wb') as handle:
    pickle.dump(dict_twitter_after_bert, handle, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 56484/56484 [1:22:19<00:00, 11.44it/s]
