This notebook replaces the slang words/chatwords detected in the texts with their translations.

# **2_Preprocessing Slang Candidates**

## Libraries

In [None]:
import pandas as pd
from tqdm import tqdm
import pickle

## Parameters for the notebook

In [None]:
# Execution environment switch: True -> run locally, False -> run on Google Colab
is_local = True
# Do not truncate long cell contents when a DataFrame is displayed
pd.set_option("display.max_colwidth", None)

## Reading in cleaned data sets (Twitter & Amazon product review)

In [None]:
# Resolve the project data directory for the selected environment.
if not is_local:
    # Colab: the project data lives on Google Drive, which has to be mounted first.
    from google.colab import drive
    drive.mount('/content/drive/')
    path_data = "/content/drive/My Drive/NLP_PROJECT/data/"
else:
    # NOTE(review): machine-specific absolute path; adjust it when running on another machine.
    path_data = "D:/Google Drive/NLP_PROJECT/data/"

# SECURITY: pickle.load can execute arbitrary code while unpickling.
# Only load pickle files that were produced by this project's own notebooks.
with open(path_data + 'BERT_data/amazon_for_bert_small.pkl', 'rb') as handle:
    dict_amazon_for_bert = pickle.load(handle)
with open(path_data + 'BERT_data/twitter_for_bert_small_400k.pkl', 'rb') as handle:
    dict_twitter_for_bert = pickle.load(handle)

## Preprocessing

We only want to keep tweets/product reviews that actually contain at least one slang word/chatword. Consequently, texts without any slang words/chatwords are dropped.

As the output of the following code cell shows, we only took a portion of the initial data sets due to their extreme size. Even so, the Amazon dataset contains nearly 130k texts that include at least one word that might need to be replaced by its translation. For the Twitter dataset, more than 70k texts contain at least one replacement candidate.

In [None]:
# Discard every entry whose 'candidates' list is empty; report the sizes before and after.
print("Length of amazon before filtering: ", len(dict_amazon_for_bert))
dict_amazon_for_bert = {
    text_id: entry
    for text_id, entry in dict_amazon_for_bert.items()
    if len(entry['candidates']) > 0
}
print("Length of amazon after filtering: ", len(dict_amazon_for_bert))

print("Length of twitter before filtering: ", len(dict_twitter_for_bert))
dict_twitter_for_bert = {
    text_id: entry
    for text_id, entry in dict_twitter_for_bert.items()
    if len(entry['candidates']) > 0
}
print("Length of twitter after filtering: ", len(dict_twitter_for_bert))

Length of amazon before filtering:  393568
Length of amazon after filtering:  128077
Length of twitter before filtering:  400000
Length of twitter after filtering:  75312


In [None]:
# def limit_dict_size(dictionary, limit):
#     return {k: dictionary[k] for k in list(dictionary.keys())[:limit]}

def check_dataset_distribution(dataset, proportion):
    """Print the sentiment distribution of ``dataset`` and return the scaled class counts.

    Args:
        dataset: Mapping of id -> record; every record needs a 'true_sentiment' entry.
        proportion: Factor by which each class count is multiplied.

    Returns:
        list: The rounded, scaled counts in the order produced by ``value_counts()``
        (descending frequency by default). The class labels are NOT part of the
        returned list, only the numbers.
    """
    sentiment_frame = pd.DataFrame.from_dict(dataset, orient='index')
    class_counts = sentiment_frame['true_sentiment'].value_counts()
    print("Distribution of values: \n", class_counts)

    # Scale once and reuse for both the printed total and the per-class limits.
    scaled_counts = class_counts * proportion
    print("\nTotal count: ", round(sum(scaled_counts), 0))

    max_counts = list(round(scaled_counts, 0))
    print("\nModified distribution of values: \n", max_counts)
    return max_counts

def limit_dataset(dictionary, m_counts, use_amazon):
    """Return the first entries of ``dictionary`` up to a per-sentiment quota.

    ``m_counts`` is a plain list of quotas without labels. It is expected to be
    ordered by descending class frequency of ``dictionary`` (the order in which
    ``value_counts()`` reports the classes). The quotas are therefore paired with
    the sentiment labels ranked by their frequency in ``dictionary``. Pairing them
    with a fixed label order (5, 4, 3, 2, 1 or 1, 0) hands the quotas to the wrong
    classes whenever the frequency ranking differs from that fixed order.

    Args:
        dictionary: Mapping of id -> record; every record needs a 'true_sentiment' entry.
        m_counts: Quotas per class, ordered by descending class frequency.
        use_amazon: If truthy the labels 5..1 are used, otherwise the labels 1 and 0.

    Returns:
        dict: The selected entries, in the iteration order of ``dictionary``.
    """
    labels = [5, 4, 3, 2, 1] if use_amazon else [1, 0]

    # Count how often each sentiment occurs so the labels can be ranked like value_counts().
    frequency = {}
    for record in dictionary.values():
        sentiment = record['true_sentiment']
        frequency[sentiment] = frequency.get(sentiment, 0) + 1

    # Stable sort: labels with equal frequency keep their relative order. Such ties
    # are harmless when the quotas were derived from these frequencies (equal
    # frequency -> equal quota).
    ranked_labels = sorted(labels, key=lambda label: frequency.get(label, 0), reverse=True)
    remaining = dict(zip(ranked_labels, m_counts))

    limited = {}
    for key, record in dictionary.items():
        sentiment = record['true_sentiment']
        if remaining.get(sentiment, 0) > 0:
            limited[key] = record
            remaining[sentiment] -= 1
        # Stop scanning as soon as every quota is used up.
        if all(count <= 0 for count in remaining.values()):
            break
    return limited

def process_dataset(dataset, max_distribution, use_amazon=False):
    """Shrink ``dataset`` and print its sentiment distribution before and after.

    Args:
        dataset: Mapping of id -> record; every record needs a 'true_sentiment' entry.
        max_distribution: Proportion handed to ``check_dataset_distribution``.
        use_amazon: Forwarded to ``limit_dataset`` to select the label set.

    Returns:
        dict: The reduced dataset produced by ``limit_dataset``.
    """
    per_class_limits = check_dataset_distribution(dataset, max_distribution)
    print("\n-------------------------------------------")

    reduced = limit_dataset(dataset, per_class_limits, use_amazon)
    print("\nTotal count: ", len(reduced))

    # Recompute the distribution on the reduced data to show the effect of the cut.
    reduced_counts = pd.DataFrame.from_dict(reduced, orient='index')['true_sentiment'].value_counts()
    print("\nModified distribution of values: \n", reduced_counts)
    return reduced

**Here we shrink the dataset even further, as we can only handle around 20k rows within a reasonable time frame**

In [None]:
# Shrink the Amazon dataset using a proportion of 0.15
dict_amazon_for_bert = process_dataset(
    dataset=dict_amazon_for_bert,
    max_distribution=0.15,
    use_amazon=True,
)

Distribution of values: 
 5    80283
4    19377
1    11211
3    10317
2     6889
Name: true_sentiment, dtype: int64

Total count:  19212.0

Modified distribution of values: 
 [12042.0, 2907.0, 1682.0, 1548.0, 1033.0]

-------------------------------------------

Total count:  19212

Modified distribution of values: 
 5    12042
4     2907
3     1682
2     1548
1     1033
Name: true_sentiment, dtype: int64


In [None]:
# Shrink the Twitter dataset using a proportion of 0.25
dict_twitter_for_bert = process_dataset(
    dataset=dict_twitter_for_bert,
    max_distribution=0.25,
    use_amazon=False,
)

Distribution of values: 
 0    37977
1    37335
Name: true_sentiment, dtype: int64

Total count:  18828.0

Modified distribution of values: 
 [9494.0, 9334.0]

-------------------------------------------

Total count:  18828

Modified distribution of values: 
 1    9494
0    9334
Name: true_sentiment, dtype: int64


## Processing of candidates with a script
The following code replaces all the slang/chatword candidates with `___` while keeping track of their indices, so that they can then be replaced with the corresponding translation.


In [None]:
def replace_slang(dict_text_for_bert, use_progressive_updated_text):
    """Replace every slang/chatword candidate of each text with a translation.

    The candidates of a text are processed in their stored order; the index
    arithmetic assumes they are sorted by ``start_index`` and do not overlap
    (TODO confirm against the notebook that produces the candidates). Each
    candidate's span is replaced by the entry at index 1 of its
    ``list_of_translations``.

    Args:
        dict_text_for_bert: Mapping of id -> record with the keys 'text',
            'true_sentiment' and 'candidates'. Every candidate is a dict with
            'slang_word', 'list_of_translations', 'text_to_check',
            'start_index' and 'end_index' (indices relative to the original text).
        use_progressive_updated_text: If truthy, from the second candidate of a
            text onwards the 'text_to_check' is rebuilt from the text with all
            earlier replacements applied and the current candidate masked by '___'.
            Otherwise the stored 'text_to_check' is used unchanged.

    Returns:
        tuple: ``(df_text_after, dict_text_after)``. The DataFrame has one row per
        text; the dict is keyed by id and holds the same fields except 'id'.

    Raises:
        IndexError: If a candidate's 'list_of_translations' has fewer than two entries.
    """
    columns = ['id', 'true_sentiment', 'text', 'candidates',
               'updated_candidates', 'processed_text',
               'is_same_as_original', 'chosen_translation']
    # Rows are collected in a list and turned into a DataFrame once at the end;
    # appending with df.loc[len(df)] copies the frame on every row (quadratic).
    records = []
    dict_text_after = {}

    for index, row in tqdm(dict_text_for_bert.items(), total=len(dict_text_for_bert)):
        text = row['text']
        # Accumulated length difference between the original and the current text;
        # start_index/end_index always refer to the original text.
        index_shift = 0
        updated_list_of_candidates_for_text = []

        for position, candidate in enumerate(row['candidates']):
            slang_word = candidate['slang_word']
            start = candidate['start_index'] - index_shift
            end = candidate['end_index'] - index_shift

            # Kept in a local so the caller's candidate dict is never modified.
            text_to_check = candidate['text_to_check']
            if use_progressive_updated_text and position > 0:
                text_to_check = text[:start] + '___' + text[end:]

            # NOTE(review): index 1 is taken as given; what index 0 holds is not
            # visible in this notebook.
            predicted_word = candidate['list_of_translations'][1]

            text = text[:start] + predicted_word + text[end:]
            index_shift += len(slang_word) - len(predicted_word)

            processed_candidate = candidate.copy()
            processed_candidate['text_to_check'] = text_to_check
            processed_candidate['chosen_translation'] = predicted_word
            processed_candidate['was_replaced'] = predicted_word != slang_word
            processed_candidate['final_text'] = text_to_check.replace('___', predicted_word)
            processed_candidate['diff'] = ''
            updated_list_of_candidates_for_text.append(processed_candidate)

        # (slang, translation, diff) for every candidate that actually changed the text
        replacement_tuples = [
            (processed['slang_word'], processed['chosen_translation'], processed['diff'])
            for processed in updated_list_of_candidates_for_text
            if processed['was_replaced']
        ]
        result = {
            'true_sentiment': row['true_sentiment'],
            'text': row['text'],
            'candidates': row['candidates'],
            'updated_candidates': updated_list_of_candidates_for_text,
            'processed_text': text,
            'is_same_as_original': text == row['text'],
            'chosen_translation': replacement_tuples,
        }
        records.append({'id': index, **result})
        dict_text_after[index] = result

    df_text_after = pd.DataFrame(records, columns=columns)
    return df_text_after, dict_text_after

In [None]:
# Run the replacement script on the Amazon reviews
df_amazon_after, dict_amazon_after = replace_slang(
    dict_amazon_for_bert,
    use_progressive_updated_text=False,
)

# Persist the result both as CSV (DataFrame) and as pickle (dict)
file_name = 'amazon_after_script_small'
output_stem = path_data + 'BERT_data/' + file_name
df_amazon_after.to_csv(output_stem + '.csv', index=False)
with open(output_stem + '.pkl', 'wb') as handle:
    pickle.dump(dict_amazon_after, handle, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 19212/19212 [01:33<00:00, 204.92it/s]


In [None]:
# Run the replacement script on the tweets
df_twitter_after, dict_twitter_after = replace_slang(
    dict_twitter_for_bert,
    use_progressive_updated_text=False,
)

# Persist the result both as CSV (DataFrame) and as pickle (dict)
file_name = 'twitter_after_script_small'
output_stem = path_data + 'BERT_data/' + file_name
df_twitter_after.to_csv(output_stem + '.csv', index=False)
with open(output_stem + '.pkl', 'wb') as handle:
    pickle.dump(dict_twitter_after, handle, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 18828/18828 [01:40<00:00, 187.77it/s]
