In [None]:
import gensim.downloader as api
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import numpy as np
from tqdm import tqdm

In [None]:
# Fetch the NLTK resources this notebook relies on:
#   - 'punkt_tab': presumably the tokenizer data required by word_tokenize — TODO confirm
#   - 'stopwords': the corpus read later through stopwords.words('english')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Root folder of the project data. Kept in one place so the notebook can be
# pointed at another machine/location by editing a single line.
PROJECT_DIR = "C:/Users/user/Downloads/Project/Tweet"

# Strong-word table: the code below reads its 'Word', 'Label' and 'limescore' columns.
strong_word_file = pd.read_csv(f"{PROJECT_DIR}/RNN/extracted_strong_word.csv")
# Tweets to perturb: the code below reads its 'Tweet' and 'Party' columns.
original_data = pd.read_csv(f"{PROJECT_DIR}/Tweets Dataset.csv")

In [None]:
# Preview the first rows of the strong-word table as a quick sanity check.
strong_word_file.head()

In [None]:
# Punctuation tokens mapped to a human-readable name.
# The generator only tests membership (`word in symbols_dict`) so that these
# tokens are copied through unchanged; the names themselves are descriptive.
symbols_dict = {
    "!": "Exclamation Mark",
    '"': "Double Quotation Mark",
    "#": "Hash/Pound Sign",
    "$": "Dollar Sign",
    "%": "Percent Sign",
    "&": "Ampersand",
    "'": "Single Quotation Mark",
    "(": "Left Parenthesis",
    ")": "Right Parenthesis",
    "*": "Asterisk",
    "+": "Plus Sign",
    ",": "Comma",
    "-": "Hyphen",
    ".": "Period",
    "/": "Forward Slash",
    ":": "Colon",
    ";": "Semicolon",
    "<": "Less Than Sign",
    "=": "Equal Sign",
    ">": "Greater Than Sign",
    "?": "Question Mark",
    "@": "At Sign",
    "[": "Left Square Bracket",
    "]": "Right Square Bracket",
    "^": "Caret",
    "_": "Underscore",
    "`": "Backtick",
    "{": "Left Curly Brace",
    "|": "Vertical Bar",
    "}": "Right Curly Brace",
    "~": "Tilde",
    "...": "Three Dots",
}

In [None]:
# English stop words held in a set for fast membership tests; the generator
# copies any token found here into the output unchanged.
stop_words = set(stopwords.words('english'))

In [None]:
# Load the "glove-wiki-gigaword-300" embeddings through gensim's downloader API.
# Used as a module-level global by compute_probability and passed explicitly to generator.
# NOTE(review): presumably downloads the model on first use and reuses a local cache afterwards — confirm.
glove_model = api.load("glove-wiki-gigaword-300")

In [None]:
def prepare_Vs(comment, label, strong_word_file, n):
    """Split a comment's tokens into strong words (Vs) and remaining words (Vn).

    A token counts as "strong" when ``strong_word_file`` has a row whose
    'Word' equals the token and whose 'Label' equals ``label``. Strong tokens
    are ranked by their 'limescore' (highest first); the best ``n`` form Vs,
    all other tokens (including lower-ranked strong ones) go to Vn.

    Args:
        comment: Text to analyse; converted with ``str()`` and lower-cased
            before tokenising.
        label: Label value compared against the 'Label' column.
        strong_word_file: DataFrame with 'Word', 'Label' and 'limescore' columns.
        n: Maximum number of strong tokens to return in Vs.

    Returns:
        Tuple ``(Vs, Vn)`` of token lists. A token that occurs several times
        in the comment appears several times in the result.
    """
    tokens = word_tokenize(str(comment).lower())

    # Look only at rows for the requested label, so a word that is listed
    # under several labels is still recognised. Previously only the first row
    # for a word was inspected, whatever its label.
    label_rows = strong_word_file[strong_word_file['Label'] == label]
    # If a word is listed more than once for this label, keep the first row.
    label_rows = label_rows.drop_duplicates(subset='Word', keep='first')
    # One dict lookup per token instead of scanning the whole frame each time.
    score_by_word = dict(zip(label_rows['Word'], label_rows['limescore']))

    scored, Vn = [], []
    for token in tokens:
        if token in score_by_word:
            scored.append((token, score_by_word[token]))
        else:
            Vn.append(token)

    # Stable sort: equally scored tokens keep their order of appearance.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    Vn.extend(word for word, _ in scored[n:])
    Vs = [word for word, _ in scored[:n]]

    return Vs, Vn

In [None]:
def d_angular(x, y):
    """Return the angle, in radians, between vectors ``x`` and ``y``.

    The cosine similarity is clipped to [-1, 1] before ``arccos`` so that
    floating-point rounding cannot push it outside the valid domain.
    There is no guard for zero-norm inputs: the division then does not
    produce a valid angle.
    """
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    cosine = np.dot(x, y) / denominator
    return np.arccos(np.clip(cosine, -1.0, 1.0))

In [None]:
def compute_probability(x, y, Vp, epsilon):
    """Return the normalised weight of candidate ``y`` for source word ``x``.

    Every candidate ``v`` gets the weight ``exp(-0.5 * epsilon * d_angular(x, v))``
    computed on the vectors looked up in the global ``glove_model``. The
    result is the weight of ``y`` divided by the sum of the weights of all
    words in ``Vp`` that exist in ``glove_model``; if that sum is 0 the
    weight of ``y`` is returned unscaled.

    Raises:
        KeyError: if ``x`` or ``y`` has no vector in ``glove_model``.
    """
    source_vec = glove_model[x]

    def weight(candidate):
        # Un-normalised weight of one candidate relative to the source word.
        return np.exp(-0.5 * epsilon * d_angular(source_vec, glove_model[candidate]))

    # Candidates without a vector are skipped in the normaliser.
    normaliser = sum(weight(v) for v in Vp if v in glove_model)
    Cx = 1 if normaliser == 0 else 1 / normaliser
    return Cx * weight(y)


In [None]:
# Function to find top N semantically similar words
def get_top_similar_words(word, model, top_n, threshold=0.6):
    """Return up to ``top_n`` neighbours of ``word`` with similarity >= ``threshold``.

    Args:
        word: Query word.
        model: Object exposing ``most_similar(word, topn=...)`` that returns
            ``(neighbour, similarity)`` pairs.
        top_n: Maximum number of neighbours to return.
        threshold: Minimum similarity a neighbour needs to be kept
            (default 0.6, the value that used to be hard-coded).

    Returns:
        List of neighbour words in the order delivered by the model; an empty
        list when the model raises ``KeyError`` for ``word``.
    """
    try:
        # Ask for twice as many neighbours so that some survive the threshold filter.
        candidates = model.most_similar(word, topn=top_n * 2)
    except KeyError:
        return []

    return [neighbour for neighbour, similarity in candidates if similarity >= threshold][:top_n]

In [None]:
def substitute_word(word, Vp, epsilon):
    """Pick a replacement for ``word`` from the candidate list ``Vp``.

    Returns a ``(replacement, probability)`` pair. With no candidates the
    word itself is returned together with probability 0.0.

    NOTE(review): ``np.random.choice`` is called without weights, so the
    candidate is drawn uniformly; ``epsilon`` only influences the probability
    value reported by ``compute_probability``, not which word is chosen.
    Confirm this is intended.
    """
    if Vp:
        replacement = np.random.choice(Vp)
        return replacement, compute_probability(word, replacement, Vp, epsilon)

    return word, 0.0

In [None]:
import string

def generator(glove_model, top_strong_word, top_similar_word, epsilon, output_file,
              keep_prob=0.5, seed=None):
    """Create a perturbed version of every tweet in ``original_data`` and save it as CSV.

    Each tweet is tokenised and every token is handled as follows:

    * stop words, tokens containing non-ASCII characters and tokens listed in
      ``symbols_dict`` are copied unchanged;
    * tokens that ``prepare_Vs`` returns as strong words are always handed to
      ``substitute_word`` (which returns the token itself when it has no
      candidates);
    * any other token is kept with probability ``keep_prob`` and handed to
      ``substitute_word`` otherwise.

    Relies on the module-level ``original_data``, ``strong_word_file``,
    ``stop_words`` and ``symbols_dict``.

    Args:
        glove_model: Embedding model passed to ``get_top_similar_words``.
        top_strong_word: Number of strong words per tweet (``n`` of ``prepare_Vs``).
        top_similar_word: Maximum number of neighbours considered per token.
        epsilon: Forwarded to ``substitute_word``.
        output_file: Path of the CSV file to write.
        keep_prob: Probability of keeping an ordinary token unchanged
            (default 0.5, the value that used to be hard-coded).
        seed: Optional seed; when given, both ``random`` and ``np.random`` are
            seeded so a run can be repeated.

    Returns:
        None. Writes the columns 'Original Tweet', 'Generated Tweet' and
        'Label' to ``output_file`` and prints a confirmation message.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    generated_data = []

    # Iterate over the two columns directly instead of building a row Series
    # twice per tweet with .iloc.
    rows = zip(original_data['Tweet'], original_data['Party'])
    for comment, label in tqdm(rows, total=len(original_data),
                               desc="Generating Tweet", unit="Tweet"):
        # Numeric label compared against the 'Label' column of the strong-word table.
        party_label = 0 if label == "Democrat" else 1

        Vs, _ = prepare_Vs(comment, party_label, strong_word_file, top_strong_word)
        strong_words = set(Vs)

        # str() keeps this consistent with prepare_Vs and avoids an
        # AttributeError on non-string tweets (e.g. NaN for a missing value).
        tokens = word_tokenize(str(comment).lower())
        new_comment = []

        for word in tokens:
            # Tokens that must never be altered.
            if word in stop_words or any(ord(c) > 127 for c in word) or word in symbols_dict:
                new_comment.append(word)
                continue

            # Ordinary tokens survive a coin flip; strong words skip the flip
            # (no random number is drawn for them) and are always substituted.
            if word not in strong_words and random.random() <= keep_prob:
                new_comment.append(word)
                continue

            Vg = get_top_similar_words(word, glove_model, top_similar_word)
            # Drop strong words from the candidates while keeping the model's
            # ordering, so the candidate list does not depend on set/hash order.
            Vp = [candidate for candidate in Vg if candidate not in strong_words]

            new_word, _ = substitute_word(word, Vp, epsilon)
            new_comment.append(new_word)

        generated_comment = " ".join(new_comment)
        generated_data.append([comment, generated_comment, label])

    # Save to CSV
    generated_df = pd.DataFrame(generated_data, columns=['Original Tweet', 'Generated Tweet', 'Label'])
    generated_df.to_csv(output_file, index=False)
    print(f"Generated comments saved to {output_file}")

In [None]:
# Run: top-2 strong words, up to 10 neighbours per token, epsilon = 1.
# NOTE(review): the file name ends in `_10` while epsilon is 1 — confirm the naming scheme.
generator(
    glove_model,
    top_strong_word=2,
    top_similar_word=10,
    epsilon=1,
    output_file="C:/Users/user/Downloads/Project/Tweet/RNN/Using_angular/comment_2_10_10.csv",
)

In [None]:
# Run: top-2 strong words, up to 15 neighbours per token, epsilon = 1.
# NOTE(review): the file name ends in `_10` while epsilon is 1 — confirm the naming scheme.
generator(
    glove_model,
    top_strong_word=2,
    top_similar_word=15,
    epsilon=1,
    output_file="C:/Users/user/Downloads/Project/Tweet/RNN/Using_angular/comment_2_15_10.csv",
)

In [None]:
# Run: top-3 strong words, up to 10 neighbours per token, epsilon = 1.
# NOTE(review): the file name ends in `_10` while epsilon is 1 — confirm the naming scheme.
generator(
    glove_model,
    top_strong_word=3,
    top_similar_word=10,
    epsilon=1,
    output_file="C:/Users/user/Downloads/Project/Tweet/RNN/Using_angular/comment_3_10_10.csv",
)

In [None]:
# Run: top-3 strong words, up to 15 neighbours per token, epsilon = 1.
# NOTE(review): the file name ends in `_10` while epsilon is 1 — confirm the naming scheme.
generator(
    glove_model,
    top_strong_word=3,
    top_similar_word=15,
    epsilon=1,
    output_file="C:/Users/user/Downloads/Project/Tweet/RNN/Using_angular/comment_3_15_10.csv",
)