In [None]:
!python -m spacy download en_core_web_md
!pip install openpyxl
!pip install transformers
!pip install gensim==3.8.3
!pip install numpy==1.22.0
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
import pandas as pd
from preprocessor import Preprocessing


# Row counts used for sampling. Every request is capped at the number of rows
# actually available, because DataFrame.sample raises ValueError when asked for
# more rows than exist (replace=False). With PREPROCESS_SAMPLE_SIZE = 10 the
# CSV files written below hold at most 10 rows each, so the base samples read
# back from them are limited to that many rows as well; raise
# PREPROCESS_SAMPLE_SIZE to get base samples of up to BASE_SAMPLE_SIZE rows.
PREPROCESS_SAMPLE_SIZE = 10
BASE_SAMPLE_SIZE = 194944

preprocessor = Preprocessing()


def _sample_at_most(frame, n):
    """Return ``n`` randomly chosen rows of ``frame``, or all rows (shuffled) if it has fewer."""
    return frame.sample(n=min(n, len(frame)))


def _add_text_features(posts):
    """Return a copy of ``posts`` with three columns derived from ``body``.

    Adds ``preprocess_body``, ``num_sentences`` and ``numWords``, each computed
    by applying the corresponding method of the module-level ``preprocessor``
    to every value of the ``body`` column.
    """
    return posts.assign(
        preprocess_body=posts['body'].apply(preprocessor.pre_process_text),
        num_sentences=posts['body'].apply(preprocessor.get_num_sentences),
        numWords=posts['body'].apply(preprocessor.get_num_words),
    )


# Conservative group: two CSV exports stacked, sampled, enriched and saved.
conservative_raw_df = pd.concat([pd.read_csv('cons_comments.csv', header=0), pd.read_csv('rep_comments.csv', header=0)])
conservative_df = _add_text_features(_sample_at_most(conservative_raw_df, PREPROCESS_SAMPLE_SIZE))
conservative_df.to_csv('conservative_metadata_and_preprocessed_with_stop_words.csv', index = False)

# Liberal group: same pipeline, but the inputs are Excel workbooks.
liberal_raw_df = pd.concat([pd.read_excel('lib_comments.xlsx', header=0), pd.read_excel('dem_comments.xlsx', header=0)])
liberal_df = _add_text_features(_sample_at_most(liberal_raw_df, PREPROCESS_SAMPLE_SIZE))
liberal_df.to_csv('liberal_metadata_and_preprocessed_with_stop_words.csv', index = False)

# Reload the saved files and draw the base samples used by the later cells.
conservative_base_sample_df = _sample_at_most(
    pd.read_csv('conservative_metadata_and_preprocessed_with_stop_words.csv', header=0), BASE_SAMPLE_SIZE
)
liberal_base_sample_df = _sample_at_most(
    pd.read_csv('liberal_metadata_and_preprocessed_with_stop_words.csv', header=0), BASE_SAMPLE_SIZE
)

In [None]:
# Build one vocabulary per group from the preprocessed post bodies
# (conservative first, then liberal).
conservative_vocab, liberal_vocab = (
    preprocessor.create_vocab(sample_df['preprocess_body'])
    for sample_df in (conservative_base_sample_df, liberal_base_sample_df)
)

# Express each group's vocabulary size as a percentage of the size of
# whatever find_common_vocab returns for the two vocabularies.
common_vocab = preprocessor.find_common_vocab(conservative_vocab, liberal_vocab)
common_vocab_size = len(common_vocab)
conservative_vocab_percent = len(conservative_vocab) / common_vocab_size * 100
liberal_vocab_percent = len(liberal_vocab) / common_vocab_size * 100

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Summary charts comparing the conservative and liberal base samples.
# Each chart is drawn on its own explicitly created figure/axes pair.
x = ['conservative', 'liberal']
colors = ['red', 'blue']
size = [len(conservative_base_sample_df['body']), len(liberal_base_sample_df['body'])]
vocab_percent = [conservative_vocab_percent , liberal_vocab_percent]
vocab_size = [len(conservative_vocab), len(liberal_vocab)]
# Mean of the `num_sentences` column, i.e. average number of sentences per post.
average_sent_length = [np.mean(conservative_base_sample_df['num_sentences']), np.mean(liberal_base_sample_df['num_sentences'])]
average_word_count = [np.mean(conservative_base_sample_df['numWords']), np.mean(liberal_base_sample_df['numWords'])]

# post size
fig, ax = plt.subplots(figsize=(6, 4))  # 6 inches wide by 4 inches high
ax.bar(x, size, color=colors)
ax.set_xlabel('Category')
ax.set_ylabel('Number of Posts')
ax.set_title('Reddit Posts size comparison')
for i, v in enumerate(size):
    # Print the count just above the top of each bar.
    ax.text(i, v + 1, str(v), color='black', fontweight='bold', ha='center')
fig.subplots_adjust(left=0.15)
plt.show()

# vocabulary distribution
fig, ax = plt.subplots()
chart = ax.pie(vocab_percent, labels=x, autopct='%1.1f%%', colors=colors)
ax.set_title('Vocabulary Distribution')
legend_labels = [f"{label}: {count}" for label, count in zip(x, vocab_size)]
# chart[0] holds the wedge patches, which serve as the legend handles.
ax.legend(chart[0], legend_labels, title='Category', loc='center left')
plt.show()

# Sentences per post
fig, ax = plt.subplots()
ax.barh(x, average_sent_length, color=colors)
ax.set_xlabel('Number of sentences per post')
ax.set_ylabel('Category')
ax.set_title('Average Sentences per Post')
for i, v in enumerate(average_sent_length):
    # Print the rounded mean slightly to the right of the end of each bar.
    ax.text(v + 0.3, i, str(round(v, 2)), color='black', fontweight='bold', ha='center')
plt.show()

# Number of words
fig, ax = plt.subplots()
# Marker colours are given explicitly, so no `cmap` is passed: matplotlib
# ignores a colormap (and warns) when `c` holds colour names instead of numbers.
ax.scatter(x, average_word_count, s=[a * 25 for a in average_word_count], alpha=0.5, c=colors)
ax.set_xlabel('Category')
ax.set_ylabel('Number of words per post')
ax.set_title('Average word count')
for i, v in enumerate(average_word_count):
    ax.text(i, v, str(int(v)), color='black', fontweight='bold', ha='center')
plt.show()

In [None]:
# Rank each group's metadata rows by `score` (highest first), keep the `word`
# values of the first 100 rows, and print the words present in both lists.
conservative_ranked = preprocessor.get_meta_data(conservative_base_sample_df).sort_values('score', ascending=False)
conservative_words = conservative_ranked["word"].head(100)

liberal_ranked = preprocessor.get_meta_data(liberal_base_sample_df).sort_values('score', ascending=False)
liberal_words = liberal_ranked["word"].head(100)

intersection = set(conservative_words) & set(liberal_words)
print(intersection)

In [None]:
from collections import defaultdict

# Keywords whose containing posts are split into sentences below.
words_of_interest = [
    "conservative", "trump", "democrat", "election", "party", "job", "news",
    "vote", "president", "american", "right", "pay", "run", "government",
    "country", "democrats", "bernie", "biden", "state", "life", "support",
]


def _collect_sentences(group, label, metadata, posts_df, word):
    """Append to ``group[label]`` the sentences of every post linked to ``word``.

    ``metadata`` rows whose ``word`` column equals ``word`` each contribute an
    iterable of ids through their ``doc_ids`` column. The ids are
    de-duplicated, stripped of surrounding single quotes and matched against
    the ``id`` column of ``posts_df``; each matching post's ``body`` is parsed
    with ``preprocessor.get_processed_document`` and its ``sents`` are appended.
    ``group[label]`` is only touched when at least one post matches.
    """
    unique_ids = {
        post_id
        for id_collection in metadata[metadata["word"] == word]["doc_ids"]
        for post_id in id_collection
    }
    cleaned_ids = [post_id.strip("'") for post_id in unique_ids]
    matching_posts = posts_df[posts_df["id"].isin(cleaned_ids)]
    for _, post in matching_posts.iterrows():
        parsed = preprocessor.get_processed_document(post["body"])
        group[label].extend(list(parsed.sents))


# Maps keyword -> {"conservative": [sentences], "liberal": [sentences]}.
similar_sentences_group = {}

conservative_metadata = preprocessor.get_meta_data(conservative_base_sample_df)
liberal_metadata = preprocessor.get_meta_data(liberal_base_sample_df)

for word in words_of_interest:
    similar_sentences_group[word] = defaultdict(list)
    _collect_sentences(
        similar_sentences_group[word], "conservative",
        conservative_metadata, conservative_base_sample_df, word,
    )
    _collect_sentences(
        similar_sentences_group[word], "liberal",
        liberal_metadata, liberal_base_sample_df, word,
    )

In [None]:
import warnings
warnings.filterwarnings("ignore")
from bert_sentence_embeddings import BertSentenceEmbeddings

bert_sent_embds = BertSentenceEmbeddings()

# Column layout of the two CSV files written at the end of this cell.
RESULT_COLUMNS = ["sentence", "similar_sentence", "cosine_sim_score"]


def _most_similar_pairs(sentences):
    """Pair every sentence with its most similar other sentence from the same bucket.

    Each sentence is preprocessed with ``bert_sent_embds.preprocess_cosine_similarity``
    and then compared, via ``bert_sent_embds.find_most_similar_document_cosine``,
    against a frame holding all *other* sentences of the bucket.

    Returns a list of ``(sentence, similar_sentence, cosine_sim_score)`` tuples,
    one per input sentence (empty when ``sentences`` is empty).
    """
    bucket_df = pd.DataFrame({"sentence": sentences})
    bucket_df["sentence_processed"] = bucket_df["sentence"].apply(bert_sent_embds.preprocess_cosine_similarity)
    pairs = []
    for index, row in bucket_df.iterrows():
        # Leave the sentence itself out so it cannot be returned as its own best match.
        other_sentences = bucket_df[bucket_df.index != index].reset_index()
        similar_sentence, cosine_similarity_score = bert_sent_embds.find_most_similar_document_cosine(
            other_sentences, row["sentence_processed"]
        )
        pairs.append((row["sentence"], similar_sentence, cosine_similarity_score))
    return pairs


# Accumulate rows across ALL keywords. The result frames used to be re-created
# inside the loop, so the files written below contained only the rows of the
# last keyword. Rows are gathered in plain lists and turned into a DataFrame
# once, rather than concatenating a one-row frame per sentence.
conservative_rows = []
liberal_rows = []
for word in similar_sentences_group:
    conservative_rows.extend(_most_similar_pairs(similar_sentences_group[word]["conservative"]))
    liberal_rows.extend(_most_similar_pairs(similar_sentences_group[word]["liberal"]))

conservative_final_df = pd.DataFrame(conservative_rows, columns=RESULT_COLUMNS)
liberal_final_df = pd.DataFrame(liberal_rows, columns=RESULT_COLUMNS)

conservative_final_df.to_csv("similar_sents_conservative.csv",index = False)
liberal_final_df.to_csv("similar_sents_liberal.csv", index = False)
    

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from bert_word_embeddings import BertWordEmbeddings


# Draw 100,000 random rows from the conservative base sample file and keep
# only the `body` values that are plain strings; other entries are skipped.
conservative_base_df = pd.read_csv('conservative_base_sample.csv', header=0).sample(n=100_000)
conservative_reddit_posts = list(
    filter(lambda body: type(body) is str, conservative_base_df["body"])
)

# Encode the posts with the conservative fine-tuned BERT checkpoint, run the
# encodings through `runFFN`, and pickle the first of its two results.
conservative_model_name = "ChathuriJ/bert-base-uncased-finetuned-reddit_conservative"
bertWordEmbeddings = BertWordEmbeddings(conservative_model_name)
conservative_encodings = bertWordEmbeddings.generate_encodings(conservative_reddit_posts)
conservative_ffn_output = bertWordEmbeddings.runFFN(conservative_encodings)
conservative_word_vector, conservative_word_freq = conservative_ffn_output
bertWordEmbeddings.save_pkl(conservative_word_vector, 'conservative_vectors.pkl')

In [None]:
import gensim
# Wrap the conservative BERT word vectors in a gensim Word2Vec model and save it.
gensim_model = gensim.models.Word2Vec(
    size=768, # size of BERT embeddings
    min_count=1,
    window=5,
    sg=1
)
gensim_model.build_vocab_from_freq(conservative_word_freq)
# build_vocab_from_freq orders the vocabulary by descending frequency (the
# default sorted_vocab=1), which is generally not the iteration order of
# conservative_word_vector. Row i of wv.vectors has to be the vector of
# wv.index2word[i], so the vectors are looked up per word in the model's own
# order instead of being copied in dict order.
# Assumes every word of conservative_word_freq also has an entry in
# conservative_word_vector; a KeyError here points at a mismatch between the two.
gensim_model.wv.vectors = np.array(
    [conservative_word_vector[token] for token in gensim_model.wv.index2word]
)
model_path = "conservative.model"
gensim_model.save(model_path)

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from bert_word_embeddings import BertWordEmbeddings

# Draw 100,000 random rows from the liberal base sample file and keep only
# the `body` values that are plain strings; other entries are skipped.
liberal_base_df = pd.read_csv('liberal_base_sample.csv', header=0).sample(100_000)
liberal_reddit_posts = list(
    filter(lambda body: type(body) is str, liberal_base_df["body"])
)

# Encode the posts with the liberal fine-tuned BERT checkpoint, run the
# encodings through `runFFN`, and pickle the first of its two results.
liberal_model_name = "ChathuriJ/bert-base-uncased-finetuned-reddit_liberal"
bertWordEmbeddings = BertWordEmbeddings(liberal_model_name)
liberal_encodings = bertWordEmbeddings.generate_encodings(liberal_reddit_posts)
liberal_ffn_output = bertWordEmbeddings.runFFN(liberal_encodings)
liberal_word_vector, liberal_word_freq = liberal_ffn_output
bertWordEmbeddings.save_pkl(liberal_word_vector, 'liberal_vectors.pkl')

In [None]:
import gensim
# Wrap the liberal BERT word vectors in a gensim Word2Vec model and save it.
gensim_model = gensim.models.Word2Vec(
    size=768, # size of BERT embeddings
    min_count=1,
    window=5,
    sg=1
)
gensim_model.build_vocab_from_freq(liberal_word_freq)
# build_vocab_from_freq orders the vocabulary by descending frequency (the
# default sorted_vocab=1), which is generally not the iteration order of
# liberal_word_vector. Row i of wv.vectors has to be the vector of
# wv.index2word[i], so the vectors are looked up per word in the model's own
# order instead of being copied in dict order.
# Assumes every word of liberal_word_freq also has an entry in
# liberal_word_vector; a KeyError here points at a mismatch between the two.
gensim_model.wv.vectors = np.array(
    [liberal_word_vector[token] for token in gensim_model.wv.index2word]
)
model_path = "liberal.model"
gensim_model.save(model_path)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# For every keyword of interest, list the 50 words whose vectors have the
# highest cosine similarity to the keyword's vector.
#
# `word_vectors` was previously never assigned (its only assignment was
# commented out), so this cell raised NameError on a fresh kernel. It is bound
# to the liberal vectors computed above, the same object that was pickled to
# 'liberal_vectors.pkl'. Bind `conservative_word_vector` instead to analyse
# the conservative vectors.
word_vectors = liberal_word_vector

most_similar_word_group = {}

# Loop-invariant inputs, materialised once instead of once per keyword.
vocabulary = list(word_vectors.keys())
all_vectors = list(word_vectors.values())

for key_word in words_of_interest:
    if key_word not in word_vectors:
        continue
    # Calculate the cosine similarity between the key word's vector and all vectors
    similarity_scores = cosine_similarity(word_vectors[key_word].reshape(1, -1), all_vectors)[0]
    # Sort by similarity in descending order (ties fall back to the word itself)
    ranked_words = sorted(zip(similarity_scores, vocabulary), reverse=True)
    # Drop the keyword explicitly rather than assuming it is ranked first,
    # then keep the top 50 remaining words.
    most_similar_words = [word for _, word in ranked_words if word != key_word][:50]
    most_similar_word_group[key_word] = most_similar_words

# The groups can be persisted with, e.g.:
# bertWordEmbeddings.save_pkl(most_similar_word_group, 'liberal_top50_similar_word_groups.pkl')