In [None]:
# Mount Google Drive inside the Colab runtime; the dataset and output paths
# used later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from collections import Counter
from tqdm import tqdm
from spacy.lang.en import English
from multiprocessing import Pool, cpu_count
import gensim.downloader as api
import numpy as np
import random
import string
from scipy.special import softmax

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Pretrained "glove-wiki-gigaword-300" vectors fetched through gensim's
# downloader API. get_randomly_synonym_word() reads this module-level object
# for vector lookups and nearest-neighbour queries.
glove_model = api.load("glove-wiki-gigaword-300")

In [None]:
# Punctuation characters that must never be replaced by a synonym.
# The 32 keys are the ASCII punctuation characters; the values are
# human-readable names. The code visible in this notebook only tests key
# membership (see santext), so the names are informational.
symbols_dict = {
    '!': 'Exclamation Mark',
    '"': 'Double Quotation Mark',
    '#': 'Hash/Pound Sign',
    '$': 'Dollar Sign',
    '%': 'Percent Sign',
    '&': 'Ampersand',
    "'": 'Single Quotation Mark',
    '(': 'Left Parenthesis',
    ')': 'Right Parenthesis',
    '*': 'Asterisk',
    '+': 'Plus Sign',
    ',': 'Comma',
    '-': 'Hyphen',
    '.': 'Period',
    '/': 'Forward Slash',
    ':': 'Colon',
    ';': 'Semicolon',
    '<': 'Less Than Sign',
    '=': 'Equal Sign',
    '>': 'Greater Than Sign',
    '?': 'Question Mark',
    '@': 'At Sign',
    '[': 'Left Square Bracket',
    '\\': 'Backslash',
    ']': 'Right Square Bracket',
    '^': 'Caret',
    '_': 'Underscore',
    '`': 'Backtick',
    '{': 'Left Curly Brace',
    '|': 'Vertical Bar',
    '}': 'Right Curly Brace',
    '~': 'Tilde'
}

# Hard-coded, lower-case stop-word list; tokens found here are never replaced
# by a synonym. The list contains no third-person singular pronouns
# ('he', 'she', ...) and ends with the informal token 'lol'.
# NOTE(review): presumably derived from the NLTK English stop-word list with
# manual edits - confirm before regenerating it.
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours',
              'yourself', 'yourselves', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
              'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have',
              'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
              'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
              'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
              'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
              'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just',
              'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
              "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
              'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
              "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'lol']

# Stop word -> position in `stop_words`; built so membership checks are
# constant-time dictionary lookups instead of list scans.
stop_words_dict = dict(zip(stop_words, range(len(stop_words))))

# Sensitive tokens that are always rewritten to a fixed neutral replacement.
# santext() looks tokens up here by their lower-cased form and emits the
# mapped value instead of sampling a synonym.
sens_word_dict = {
    'he': 'person',
    'she': 'person',

}

In [None]:
def get_vocab_and_doc(data_file_path, num_chunks=15, chunk_index=0):
    """Tokenize one slice of the tweet dataset and count its tokens.

    The CSV is read with pandas and must contain a ``Tweet`` and a ``Party``
    column. The rows are split into consecutive chunks of
    ``len(rows) // num_chunks`` rows each and only the chunk selected by
    ``chunk_index`` is processed. With the defaults this is the first
    fifteenth of the file.

    Parameters
    ----------
    data_file_path : str or path-like
        Location of the CSV file.
    num_chunks : int, default 15
        Number of equally sized chunks the dataset is divided into. Rows left
        over after the division are not part of any selectable chunk.
    chunk_index : int, default 0
        Zero-based index of the chunk to process.

    Returns
    -------
    vocab : collections.Counter
        Token -> number of occurrences inside the selected chunk.
    docs : list[list[str]]
        One token list per tweet of the selected chunk.
    labels : list
        The ``Party`` value of each tweet, aligned with ``docs``.

    Raises
    ------
    ValueError
        If ``num_chunks`` is smaller than 1 or ``chunk_index`` is outside
        ``[0, num_chunks)``.

    Notes
    -----
    A file with fewer rows than ``num_chunks`` has a chunk size of zero, so
    the result is an empty vocabulary and empty lists rather than an error.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be at least 1")
    if not 0 <= chunk_index < num_chunks:
        raise ValueError("chunk_index must satisfy 0 <= chunk_index < num_chunks")

    df = pd.read_csv(data_file_path)
    all_tweets = df['Tweet'].tolist()
    all_labels = df['Party'].tolist()

    # Slice tweets and labels with the same bounds so they stay aligned.
    chunk_size = len(all_tweets) // num_chunks
    start = chunk_index * chunk_size
    stop = start + chunk_size
    chunk_tweets = all_tweets[start:stop]
    chunk_labels = all_labels[start:stop]

    print("Building Vocabulary...")

    tokenizer = English()
    vocab = Counter()
    docs = []
    labels = []

    # total= is passed explicitly because zip() has no length for the bar.
    for tweet, label in tqdm(zip(chunk_tweets, chunk_labels), total=len(chunk_tweets)):
        tokenized_text = [token.text for token in tokenizer(tweet)]
        docs.append(tokenized_text)
        labels.append(label)
        vocab.update(tokenized_text)

    return vocab, docs, labels

In [None]:
# vocab, docs, labels = get_vocab_and_doc('/content/drive/MyDrive/Datasets/dataset.csv')
# print(docs[3])

In [None]:
def get_strong_words(data_file_path):
    """Load the strong-word list and number its entries starting at 1.

    Reads the CSV at ``data_file_path`` with pandas and takes its ``word``
    column. Each entry is mapped to its 1-based position in that column; when
    an entry occurs more than once the position of its last occurrence is
    kept.

    Returns
    -------
    dict
        ``{word: position}``; callers in this notebook use it for membership
        tests.
    """
    column_values = pd.read_csv(data_file_path)['word'].to_list()
    return {term: position for position, term in enumerate(column_values, start=1)}

In [None]:
# def get_randomly_synonym_word(word):

#     top_similar_words = 10

#     try:
#         word_vector = glove_model[word]
#         similar_words = glove_model.most_similar(word_vector, topn=top_similar_words)

#         words, score = zip(*similar_words)
#         words = np.array(words)
#         score = np.array(score)
#         score = -score

#         probabilities = softmax(1 * score / 2)

#         # Randomly select an index based on the specified probabilities
#         selected_index = np.random.choice(top_similar_words, p=probabilities)

#         # return selected word
#         return words[selected_index]

#     except KeyError:
#         return word

def get_randomly_synonym_word(word):
    """Sample a replacement for ``word`` from its nearest GloVe neighbours.

    The vector of ``word`` is looked up in the module-level ``glove_model``
    and its 10 most similar entries are requested. One of them is drawn with
    ``np.random.choice`` using rank-based weights that fall linearly:
    roughly 1.2, 1.0, 0.8, 0.6, 0.4, 0.2 for ranks 0-5 and 0 from rank 6 on
    (negative values are clipped to zero), normalised to sum to 1.

    Returns the sampled neighbour, or ``word`` unchanged when the lookup
    raises ``KeyError``.

    NOTE(review): the query is made with the word's vector rather than its
    key, so the word itself is presumably among the returned neighbours -
    confirm against the gensim behaviour.
    """
    candidate_count = 10
    decay_per_rank = -0.2   # weight lost for every step down the ranking
    weight_at_top = 1.2     # weight of the closest neighbour (rank 0)

    try:
        neighbours = glove_model.most_similar(glove_model[word], topn=candidate_count)
    except KeyError:
        # No vector is available for this token: leave it as it is.
        return word

    # Linear decay over the ranks, floored at zero.
    rank_weights = []
    for rank in range(candidate_count):
        rank_weights.append(max(0, decay_per_rank * rank + weight_at_top))
    rank_weights = np.array(rank_weights)

    # Turn the weights into a probability distribution.
    rank_weights /= sum(rank_weights)

    chosen_rank = np.random.choice(candidate_count, p=rank_weights)
    chosen_word, _similarity = neighbours[chosen_rank]
    return chosen_word

In [None]:
def _pick_non_strong_synonym(word, strong_words, max_attempts):
    """Draw synonyms for a strong word until one is acceptable or attempts run out."""
    lowered = word.lower()
    for _ in range(max_attempts):
        candidate = get_randomly_synonym_word(lowered)
        # Accept the draw when it is the token itself or is not a strong word.
        if candidate == word or candidate not in strong_words:
            return candidate
    # Every draw was another strong word: keep the original token.
    return word


def _sanitize_doc(doc, strong_words, p, max_attempts):
    """Return the token list of one tweet with sensitive/strong words replaced."""
    sanitized = []
    for word in doc:
        lowered = word.lower()
        replaceable = lowered not in symbols_dict and lowered not in stop_words_dict

        if lowered in sens_word_dict:
            sanitized.append(sens_word_dict[lowered])
        elif word in strong_words and replaceable:
            sanitized.append(_pick_non_strong_synonym(word, strong_words, max_attempts))
        else:
            # The coin is flipped for every remaining token, replaceable or
            # not, so the random stream does not depend on the token type.
            flip_p = random.random()
            if flip_p <= p and replaceable:
                sanitized.append(get_randomly_synonym_word(lowered))
            else:
                sanitized.append(word)
    return sanitized


def santext(
    data_path='/content/drive/MyDrive/Datasets/Tweets/ExtractedTweets.csv',
    strong_words_path='/content/drive/MyDrive/Datasets/Tweets/BERT/combined_extracted_strong_words_without_duplicates.csv',
    output_path='/content/drive/MyDrive/Datasets/Tweets/BERT/1_tweet_replaced_dataset_top10.csv',
    p=0.5,
    max_attempts=10,
    seed=None,
):
    """Sanitize the tweets by swapping words for GloVe neighbours and save them.

    Tweets and labels come from ``get_vocab_and_doc(data_path)`` and the
    strong-word list from ``get_strong_words(strong_words_path)``. Every
    token is then handled by the first rule that matches:

    1. Its lower-cased form is a key of ``sens_word_dict``: the mapped value
       is emitted.
    2. It is a strong word and neither punctuation nor a stop word: synonyms
       are drawn with ``get_randomly_synonym_word`` until one equals the
       token or is not a strong word. After ``max_attempts`` failed draws
       the original token is kept.
    3. Otherwise: with probability ``p`` a token that is neither punctuation
       nor a stop word is replaced by one drawn synonym; everything else is
       kept unchanged.

    The tokens of each tweet are joined with single spaces and written,
    together with the labels, to ``output_path`` as a CSV with the columns
    ``party`` and ``tweet`` (the DataFrame index is written as well).

    Parameters
    ----------
    data_path : str
        CSV with the tweets; defaults to the path previously hard-coded.
    strong_words_path : str
        CSV with the strong-word list; defaults to the path previously
        hard-coded.
    output_path : str
        Destination CSV; defaults to the path previously hard-coded.
    p : float, default 0.5
        Replacement probability for ordinary (non-strong) words.
    max_attempts : int, default 10
        Maximum number of synonym draws for one strong word; expected to be
        at least 1.
    seed : int or None, default None
        When given, seeds both ``random`` and ``np.random`` so a run can be
        reproduced. ``None`` leaves the generators untouched.

    Returns
    -------
    None

    NOTE(review): strong-word membership is tested on the token's original
    case, whereas synonyms are looked up with the lower-cased token - confirm
    the casing of the strong-word list.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    _vocab, docs, labels = get_vocab_and_doc(data_path)
    strong_words = get_strong_words(strong_words_path)

    new_docs = []
    for doc in tqdm(docs):
        new_docs.append(" ".join(_sanitize_doc(doc, strong_words, p, max_attempts)))

    df = pd.DataFrame(list(zip(labels, new_docs)), columns=['party', 'tweet'])
    df.to_csv(output_path)

In [None]:
# Run the sanitisation pipeline; it writes the replaced-tweet CSV to Drive.
santext()