# **2a_Find BERT candidates Twitter & Amazon data set**
This notebook finds the slang word candidates in each product review/tweet. The candidates are later passed to BERT (see notebook 2b), which decides whether each slang word should be replaced.

## Libraries

In [None]:
import pandas as pd
import re
from tqdm import tqdm
import pickle

## Parameters for the notebook

In [None]:
# Show full cell contents when a DataFrame is displayed (no column-width truncation).
pd.options.display.max_colwidth = None

# Execution environment: False -> Google Colab, True -> local machine.
is_local = False
# is_local = True

# Slang dictionary to use: True -> small list, False -> big list.
use_small_slang = True
# use_small_slang = False

## Reading in cleaned data

In [None]:
# Resolve the data directory for the selected environment.
if is_local:
    path_data = "D:/Google Drive/NLP_PROJECT/data/"
else:
    # On Colab the data is read from Google Drive, which is mounted first.
    from google.colab import drive
    drive.mount('/content/drive/')
    path_data = "/content/drive/My Drive/NLP_PROJECT/data/"

# slang dictionaries (both sizes are loaded; one is selected later)
df_slang_small = pd.read_csv(path_data + "Slang/m_slang_small_cleaned.csv")
df_slang_big = pd.read_csv(path_data + "Slang/m_slang_big_cleaned.csv")

# review / tweet datasets
df_amazon = pd.read_csv(path_data + "Amazon/amazon_cleaned.csv")
df_twitter = pd.read_csv(path_data + "Twitter/twitter_cleaned.csv")

# report the shape of every loaded frame
for label, frame in (
    ("df_slang_small shape:", df_slang_small),
    ("df_slang_big shape:", df_slang_big),
    ("df_amazon shape:", df_amazon),
    ("df_twitter shape:", df_twitter),
):
    print(label, frame.shape)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
df_slang_small shape: (159, 2)
df_slang_big shape: (2317, 2)
df_amazon shape: (393568, 3)
df_twitter shape: (1581466, 3)


## Finding the slang word candidates among the words in tweets/product reviews

The purpose of the function **find_slang_for_bert** is to identify the occurrences of slang words in the input text, provide the translations for these slang words, and enrich the candidates with further information that is used for processing in notebook 2b. Notebook 2b may replace a slang word with one of its translations, depending on which is more probable in context: the slang/chat word itself or a translation. **To make this easier to follow, here is an example of what happens in Notebook 2b.** Consider the following text and slang/chat word-translation pair:

- **"it has been a great day"**
- {it: "information technology"}

We cannot say we want to replace slang/chat words every time, as it heavily depends on the context. For most cases, we assume that "it" must remain "it" as a pronoun. However, "it" refers to "information technology" when the context of the text is within the domain of information technology. Let's now see how that goes in Jupyter Notebook 2b.


In [None]:
# Select the slang dictionary configured by `use_small_slang`.
df_slang = df_slang_small if use_small_slang else df_slang_big

# precompiled regex patterns: one whole-word pattern per distinct slang entry.
# dict.fromkeys() removes duplicates like set() did, but keeps the row order of
# the slang table. Iterating a set of strings yields an order that changes
# between interpreter runs (string hash randomisation), which made the order of
# candidates sharing the same start_index non-reproducible.
# NOTE(review): \b only anchors next to a word character, so slang entries that
# begin or end with punctuation may never match - confirm against the slang list.
pattern_dict = {
    slang_word: re.compile(r'\b' + re.escape(str(slang_word)) + r'\b')
    for slang_word in dict.fromkeys(df_slang['slang'].tolist())
}

# columns for dataframe for the results
list_of_columns = ['id', 'true_sentiment', 'text', 'candidates']

In [None]:
def find_slang_for_bert(text, df_slang, pattern_dict):
    """Find every slang-word occurrence in `text` and describe it as a candidate.

    Parameters
    ----------
    text : str
        The review/tweet to scan. Any non-string value (e.g. None or NaN)
        yields an empty result instead of raising.
    df_slang : pandas.DataFrame
        Slang table with a 'slang' and a 'translation' column; a slang word may
        appear on several rows, one per translation.
    pattern_dict : dict
        Maps each slang word to the regex pattern used to locate it in `text`.

    Returns
    -------
    list of dict
        One dict per occurrence, ordered by `start_index`, with the keys
        slang_word, start_index, end_index, list_of_translations (the slang
        word itself followed by its translations), text_to_check (`text` with
        the occurrence replaced by '___'), and the placeholders
        chosen_translation, was_replaced, final_text and diff.
    """
    # Nothing can be searched in a missing / non-string text.
    if not isinstance(text, str):
        return []

    list_of_candidates_for_text = []
    # look at each slang word and its possible translations
    for slang_word, pattern in pattern_dict.items():
        # find all occurrences of slang_word in text
        match_indices = [(m.start(0), m.end(0)) for m in re.finditer(pattern, text)]
        if not match_indices:
            continue
        # the slang word itself stays first so "keep as is" is always an option
        translations = df_slang.loc[df_slang['slang'] == slang_word, 'translation'].to_list()
        list_of_translations = [slang_word] + translations
        for matched_start_index, matched_end_index in match_indices:
            list_of_candidates_for_text.append({
                "slang_word": slang_word,
                "start_index": matched_start_index,
                "end_index": matched_end_index,
                # each candidate gets its own copy: sharing one list object
                # would let a later in-place edit of one candidate leak into
                # every other occurrence of the same slang word
                "list_of_translations": list(list_of_translations),
                "text_to_check": text[:matched_start_index] + '___' + text[matched_end_index:],
                "chosen_translation": None,
                "was_replaced": False,
                "final_text": None,
                "diff": None,
            })
    # sort the list of candidates by start_index (stable: ties keep pattern_dict order)
    list_of_candidates_for_text.sort(key=lambda candidate: candidate["start_index"])
    return list_of_candidates_for_text

In [None]:
# NOTE: this whole cell is commented out, so no Amazon output is produced by
# this notebook as it stands. Uncomment every line below to run it.
# NOTE(review): `text` is passed to find_slang_for_bert without str(...) here,
# unlike the Twitter cell - confirm the 'Text' column holds only strings.
# # execution for amazon
# df_amazon_for_bert = pd.DataFrame(columns=list_of_columns)
# dict_amazon_for_bert = {}
# i = 0
# with tqdm(total=len(df_amazon)) as pbar:
#     for text, sentiment in zip(df_amazon['Text'], df_amazon['Sentiment']):
#         candidates = find_slang_for_bert(text, df_slang, pattern_dict)
#         df_amazon_for_bert.loc[len(df_amazon_for_bert)] = {
#             'id': i,
#             'true_sentiment': sentiment,
#             'text': text,
#             'candidates': candidates
#         }
#         dict_amazon_for_bert[i] = {
#             'true_sentiment': sentiment,
#             'text': text,
#             'candidates': candidates
#         }
#         pbar.update(1)
#         i += 1

# # save to csv and dict
# if use_small_slang:
#     df_amazon_for_bert.to_csv(path_data + "BERT_data/amazon_for_bert_small.csv", index=False)
#     with open(path_data + "BERT_data/amazon_for_bert_small.pkl", 'wb') as f:
#         pickle.dump(dict_amazon_for_bert, f)
# else:
#     df_amazon_for_bert.to_csv(path_data + "BERT_data/amazon_for_bert_big.csv", index=False)
#     with open(path_data + "BERT_data/amazon_for_bert_big.pkl", 'wb') as f:
#         pickle.dump(dict_amazon_for_bert, f)

## Limit the number of rows for the data set

Due to the huge size of this dataset, we reduce the number of rows while aiming to preserve its underlying structure and the distribution of sentiments (binary target values for the Twitter dataset, scores for the Amazon dataset).

In [None]:
# Keep only the first and the last 200,000 rows and renumber them from 0.
twitter_first_rows = df_twitter.head(200000)
twitter_last_rows = df_twitter.tail(200000)
df_twitter = pd.concat([twitter_first_rows, twitter_last_rows], ignore_index=True)

In [None]:
# execution for twitter
# Records are collected in a plain list and turned into a DataFrame once at the
# end: enlarging a DataFrame row by row with `.loc[len(df)] = ...` gets slower
# as the frame grows and dominated the runtime over 400k tweets.
records_twitter_for_bert = []
dict_twitter_for_bert = {}
tweets_with_sentiment = zip(df_twitter['Text'], df_twitter['Sentiment'])
for i, (text, sentiment) in enumerate(tqdm(tweets_with_sentiment, total=len(df_twitter))):
    # str() because a cell may not be a string (presumably NaN for empty tweets);
    # the stored 'text' below is still the original, unconverted value
    candidates = find_slang_for_bert(str(text), df_slang, pattern_dict)
    records_twitter_for_bert.append({
        'id': i,
        'true_sentiment': sentiment,
        'text': text,
        'candidates': candidates
    })
    dict_twitter_for_bert[i] = {
        'true_sentiment': sentiment,
        'text': text,
        'candidates': candidates
    }
df_twitter_for_bert = pd.DataFrame(records_twitter_for_bert, columns=list_of_columns)

# save to csv and dict; the file name encodes which slang list was used
slang_size = "small" if use_small_slang else "big"
output_stem = path_data + "BERT_data/twitter_for_bert_" + slang_size + "_400k"
df_twitter_for_bert.to_csv(output_stem + ".csv", index=False)
with open(output_stem + ".pkl", 'wb') as f:
    pickle.dump(dict_twitter_for_bert, f)

100%|██████████| 400000/400000 [1:45:32<00:00, 63.17it/s]
