In [None]:
# One-time environment setup: download the spaCy model that is loaded later with
# spacy.load("en_core_web_md") and install openpyxl (presumably for the Excel
# reads that are currently commented out in the loading cell).
# NOTE(review): `!python` / `!pip` run whichever executables come first on the
# shell PATH, which is not guaranteed to be the kernel's environment; IPython's
# `%pip` magic targets the running kernel. Consider switching and pinning versions.
!python -m spacy download en_core_web_md
!pip install openpyxl
from IPython.core.display import HTML
# Rendering this snippet asks the notebook front end to restart the kernel so the
# freshly installed packages become importable.
# NOTE(review): relies on the front end exposing a `Jupyter` JavaScript object;
# confirm it works in the front end you use, otherwise restart manually.
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
import pandas as pd
import spacy
from spacy.tokens.token import Token
import string

from typing import List, Set
# Shared spaCy pipeline; every helper in this notebook that parses text calls
# this module-level `nlp` object, so the model is loaded only once.
nlp = spacy.load("en_core_web_md")

In [None]:
def convert_to_string(data) -> str:
    """Return ``data`` untouched when it is already a ``str``, else ``str(data)``."""
    if isinstance(data, str):
        return data
    return str(data)

def tokenize(text: str) -> List[Token]:
    """Run ``text`` through the shared ``nlp`` pipeline and return its tokens.

    Tokens are gathered sentence by sentence, in document order.
    """
    words = []
    for sentence in nlp(text).sents:
        # Extending with a sentence span appends each of its tokens.
        words.extend(sentence)
    return words

def remove_punctuation(tokens: List[Token]) -> List[Token]:
    """Drop tokens whose text consists solely of ASCII punctuation characters.

    The check is done character by character. A plain ``text in
    string.punctuation`` test is a *substring* test against one long string,
    so multi-character punctuation such as ``...``, ``!!`` or ``?!`` would
    slip through because those sequences do not occur in it.

    Args:
        tokens: objects exposing a ``text`` attribute.

    Returns:
        The tokens that contain at least one non-punctuation character, in
        their original order. Tokens with empty text are dropped.
    """
    punctuation_chars = set(string.punctuation)
    return [
        t for t in tokens
        if not all(ch in punctuation_chars for ch in t.text)
    ]

def remove_stop_words(tokens: List[Token]) -> List[Token]:
    """Return the tokens whose ``is_stop`` flag is falsy, keeping their order."""
    return list(filter(lambda token: not token.is_stop, tokens))

def lemmatize(tokens: List[Token]) -> List[str]:
    """Map every token to its ``lemma_`` string, preserving order."""
    lemmas = []
    for token in tokens:
        lemmas.append(token.lemma_)
    return lemmas

def case_fold(tokens: List[str]) -> List[str]:
    """Return a new list holding the lower-cased form of every input string."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

def pre_process_text(text: str) -> List[str]:
    """Turn raw text into a list of lower-cased lemmas.

    Steps, in order: coerce to ``str``, tokenize, drop punctuation tokens,
    lemmatize, lower-case. Stop words are not removed by this pipeline.
    """
    raw_text = convert_to_string(text)
    tokens = tokenize(raw_text)
    tokens = remove_punctuation(tokens)
    lemmas = lemmatize(tokens)
    return case_fold(lemmas)

def get_num_sentences(data) -> int:
    """Count the sentences the shared ``nlp`` pipeline finds in ``data``.

    Non-string input is converted with ``str()`` before parsing.
    """
    text = data if isinstance(data, str) else str(data)
    return sum(1 for _ in nlp(text).sents)

def get_num_words(data) -> int:
    """Return the summed length of every sentence span found in ``data``.

    Non-string input is converted with ``str()`` before parsing. Nothing is
    filtered out here, so every token of every sentence is counted.
    """
    text = data if isinstance(data, str) else str(data)
    return sum(len(sentence) for sentence in nlp(text).sents)


In [None]:
# Earlier loading code, kept for reference: it read the raw comment exports.
# conservative_df = pd.read_csv('cons_comments.csv', header=0)
# liberal_df = pd.concat([pd.read_excel('lib_comments.xlsx', header=0), pd.read_excel('dem_comments.xlsx', header=0)])
# Current inputs: the CSVs that the preprocessing cell of this notebook writes.
# NOTE(review): because that cell overwrites these same two paths, list-valued
# columns such as 'preprocess_body' presumably arrive here as their string
# representation rather than as Python lists — confirm before relying on either form.
conservative_df = pd.read_csv('conservative_metadata_and_preprocessed_with_stop_words.csv', header=0)
liberal_df = pd.read_csv('liberal_metadata_preprocessed_with_stop_words.csv', header=0)

In [None]:
def _text_features(body):
    """Parse one comment body once and derive all three per-post features.

    Returns a tuple ``(preprocess_body, num_sentences, numWords)`` that matches
    what ``pre_process_text``, ``get_num_sentences`` and ``get_num_words``
    produce for the same input, without running the spaCy pipeline three times.
    """
    doc = nlp(convert_to_string(body))
    tokens = [token for sent in doc.sents for token in sent]
    lemmas = case_fold(lemmatize(remove_punctuation(tokens)))
    return lemmas, len(list(doc.sents)), len(tokens)


# Same treatment for both corpora; each frame is written back to the CSV it was
# loaded from.
# NOTE(review): in memory 'preprocess_body' now holds Python lists, but once
# written to CSV and read back it is their string representation; make sure
# downstream consumers handle the form they actually receive.
for frame, output_path in (
    (conservative_df, 'conservative_metadata_and_preprocessed_with_stop_words.csv'),
    (liberal_df, 'liberal_metadata_preprocessed_with_stop_words.csv'),
):
    features = [_text_features(body) for body in frame['body']]
    frame['preprocess_body'] = [feature[0] for feature in features]
    frame['num_sentences'] = [feature[1] for feature in features]
    frame['numWords'] = [feature[2] for feature in features]
    frame.to_csv(output_path, index = False)

# Disabled sampling experiment, kept for reference.
# random_sample_size = 194944
# conservative_base_sample_df = conservative_df.sample(n=194944)
# conservative_base_sample_df.to_csv('conservative_base_sample.csv', index = False)
# liberal_base_sample_df = liberal_df
# conservative_2k_sample = conservative_base_sample_df.sample(n=2000)
# liberal_2k_sample = liberal_base_sample_df.sample(n=2000)
# conservative_2k_sample.to_csv('conservative_2k_sample.csv', index = False)
# liberal_2k_sample.to_csv('liberal_2k_sample.csv', index = False)

In [None]:
def _parse_token_list(cell) -> List[str]:
    """Return the tokens stored in one ``preprocess_body`` cell.

    Accepts a real sequence of tokens or the string form such a list takes
    after a CSV round trip (e.g. ``"['a', 'b']"``). Anything else, such as a
    NaN float, yields an empty list.
    """
    import ast

    if isinstance(cell, (list, tuple, set)):
        return [str(token) for token in cell]
    if not isinstance(cell, str):
        return []
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple, set)):
        return [str(token) for token in parsed]
    # Fallback for strings that are not a valid Python literal: split on commas
    # and peel off surrounding blanks and quote characters.
    pieces = (piece.strip().strip("'\"") for piece in cell.strip('[]').split(","))
    return [piece for piece in pieces if piece]


def create_vocab(data: List[List[str]]) -> Set[str]:
    """Build the set of distinct lower-cased tokens found in ``data``.

    Args:
        data: iterable whose items are token lists, or the string
            representation of such lists as read back from CSV. Items of any
            other type contribute nothing.

    Returns:
        Lower-cased tokens without the quote characters, leading blanks or
        empty entries that a naive ``split(",")`` on the list string leaves
        behind.
    """
    vocab = set()
    for cell in data:
        vocab.update(token.lower() for token in _parse_token_list(cell))
    return vocab

def find_common_vocab(vocab1, vocab2):
    """Return, as a list, the entries of ``vocab1`` that also occur in ``vocab2``.

    ``vocab1`` must be a set; the order of the result is unspecified.
    """
    return [word for word in vocab1.intersection(vocab2)]

In [None]:
# Vocabulary of each corpus, built from its 'preprocess_body' column.
conservative_vocab, liberal_vocab = (
    create_vocab(frame['preprocess_body']) for frame in (conservative_df, liberal_df)
)

common_vocab = find_common_vocab(conservative_vocab, liberal_vocab)

# Size of each vocabulary relative to the shared vocabulary, times 100.
# NOTE(review): the shared vocabulary is a subset of both, so these values are
# always >= 100, and this raises ZeroDivisionError when nothing is shared —
# confirm this ratio is the intended "percent".
conservative_vocab_percent, liberal_vocab_percent = (
    len(vocab) / len(common_vocab) * 100 for vocab in (conservative_vocab, liberal_vocab)
)

In [None]:
import collections

def get_meta_data(dataframe):
    """Aggregate per-word statistics over the ``preprocess_body`` column.

    Each ``preprocess_body`` value is expected to be the string form of a
    token list (e.g. ``"['a', 'b']"``). It is split on commas, and every piece
    is stripped of surrounding whitespace so that the same token is not
    counted under two keys (``"'b'"`` and ``" 'b'"``). The quote characters
    are kept on purpose: other cells in this notebook look words up in their
    quoted form (see ``padWord``). Empty pieces, e.g. from ``"[]"``, are ignored.

    Args:
        dataframe: frame with ``id``, ``score`` and ``preprocess_body`` columns.

    Returns:
        DataFrame with one row per word and the columns ``word``, ``freq``
        (number of occurrences), ``score`` (sum of the scores of the rows the
        word occurred in, once per occurrence) and ``doc_ids`` (set of row ids).

    Rows whose ``preprocess_body`` is not a string (for instance NaN) are
    reported on stdout and skipped.
    """
    word_to_doc_map = collections.defaultdict(set)
    word_to_score_map = collections.defaultdict(int)
    word_to_freq_map = collections.defaultdict(int)

    rows = zip(dataframe["id"], dataframe["score"], dataframe["preprocess_body"])
    for doc_id, score, body in rows:
        try:
            pieces = body.strip('[]').split(',')
        except AttributeError as e:
            print("Row: ", body)
            print("The row might be a non string value: ", e)
            continue
        for piece in pieces:
            word = piece.strip()
            if not word:
                continue
            word_to_doc_map[word].add(doc_id)
            word_to_score_map[word] += score
            word_to_freq_map[word] += 1

    # All three maps share the same keys; iterate one of them to keep the
    # columns aligned.
    words = list(word_to_freq_map)
    return pd.DataFrame({
        "word": words,
        "freq": [word_to_freq_map[word] for word in words],
        "score": [word_to_score_map[word] for word in words],
        "doc_ids": [word_to_doc_map[word] for word in words],
    })

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Per-corpus summary numbers; every list is ordered [conservative, liberal].
x = ['conservative', 'liberal']
colors = ['red', 'blue']
size = [len(conservative_df['body']), len(liberal_df['body'])]
vocab_percent = [conservative_vocab_percent , liberal_vocab_percent]
vocab_size = [len(conservative_vocab), len(liberal_vocab)]
average_sent_length = [np.mean(conservative_df['num_sentences']),np.mean(liberal_df['num_sentences'])]
average_word_count = [np.mean(conservative_df['numWords']), np.mean(liberal_df['numWords'])]

# post size
plt.bar(x, size, color=colors)
plt.xlabel('Category')
plt.ylabel('Number of Posts')
plt.title('Reddit Posts size comparison')
for i, v in enumerate(size):
    # Write the count just above each bar.
    plt.text(i, v + 1, str(v), color='black', fontweight='bold', ha='center')
plt.show()

# vocabulary distribution
# The pie only uses the values relative to each other; the legend shows the
# absolute vocabulary sizes.
chart = plt.pie(vocab_percent, labels=x, autopct='%1.1f%%', colors=colors)
plt.title('Vocabulary Distribution')
legend_labels = [f"{label}: {count}" for label, count in zip(x, vocab_size)]
plt.legend(chart[0], legend_labels, title='Category', loc='center left')
plt.show()

# Sentence length
plt.barh(x, average_sent_length, color=colors)
plt.xlabel('Number of sentences per post')
plt.ylabel('Category')
plt.title('Average Sentences per Post')
for i, v in enumerate(average_sent_length):
    # Write the rounded average just past the end of each bar.
    plt.text(v + 0.3, i, str(round(v, 2)), color='black', fontweight='bold', ha='center')
plt.show()

# Number of words
# Marker area scales with the average word count. No `cmap` is passed: the
# markers get explicit colour names through `c`, so a colormap would be
# ignored and only trigger a warning.
plt.scatter(x, average_word_count, s=[a * 25 for a in average_word_count], alpha=0.5, c=colors)
plt.xlabel('Category')
plt.ylabel('Number of words per post')
plt.title('Average word count')
for i, v in enumerate(average_word_count):
    plt.text(i, v, str(int(v)), color='black', fontweight='bold', ha='center')
plt.show()


In [None]:
# The 100 words with the highest accumulated score in each corpus, and the
# words that appear in both top lists.
conservative_words, liberal_words = (
    get_meta_data(frame).sort_values('score', ascending = False)["word"].head(100)
    for frame in (conservative_df, liberal_df)
)
intersection = set(conservative_words) & set(liberal_words)
print(intersection)

In [None]:
# Hand-picked words whose surrounding sentences are compared across the two corpora.
words_of_interest = [
    'conservative',
    'trump',
    'democrat',
    'election',
    'party',
    'job',
    'news',
    'vote',
    'president',
    'american',
    'right',
    'pay',
    'run',
    'government',
    'country',
    'democrats',
    'bernie',
    'biden',
    'state',
    'life',
    'support',
]

In [None]:
from collections import defaultdict

# Maps each quoted word of interest to {"conservative": [...], "liberal": [...]},
# where the lists hold the spaCy sentence spans of every post containing the word.
similar_sentences_group = {}

conservative_metadata = get_meta_data(conservative_df)
liberal_metadata = get_meta_data(liberal_df)


def padWord(word):
    """Wrap ``word`` in single quotes, the form in which words are looked up
    in the metadata ``word`` column."""
    return "'" + str(word) + "'"


def _sentences_for_word(word, metadata, posts_df):
    """Return all sentence spans of the posts in ``posts_df`` that contain ``word``.

    ``metadata`` is a frame produced by ``get_meta_data``; ``word`` is the
    quoted lookup key.
    """
    # Stored tokens may carry surrounding whitespace depending on how the list
    # string was split; stripping makes the lookup independent of that.
    stored_words = metadata["word"].astype(str).str.strip()
    doc_ids = set()
    for post_ids in metadata[stored_words == word]["doc_ids"]:
        for post_id in post_ids:
            # String ids are stripped of single quotes (presumably stray quotes
            # from serialisation — TODO confirm); other id types pass through.
            doc_ids.add(post_id.strip("'") if isinstance(post_id, str) else post_id)

    sentences = []
    for body in posts_df.loc[posts_df["id"].isin(list(doc_ids)), "body"]:
        # Coerce like the other helpers do, so a non-string body (e.g. NaN)
        # does not make the pipeline raise.
        sentences.extend(nlp(convert_to_string(body)).sents)
    return sentences


for word in map(padWord, words_of_interest):
    groups = defaultdict(list)
    groups["conservative"].extend(
        _sentences_for_word(word, conservative_metadata, conservative_df)
    )
    groups["liberal"].extend(
        _sentences_for_word(word, liberal_metadata, liberal_df)
    )
    similar_sentences_group[word] = groups

In [None]:
import numpy as np
def preprocess_cosine_similarity(text) -> np.array:
    """Average the word vectors of the content tokens in ``text``.

    Args:
        text: an iterable of tokens exposing ``text``, ``is_stop`` and
            ``vector`` (the notebook passes spaCy sentence spans), not a
            plain string.

    Returns:
        The element-wise mean of the vectors of the tokens left after removing
        punctuation and stop words. If that filter removes every token (for
        example a sentence made only of stop words), the mean over all tokens
        is returned instead, so the result is still a vector rather than a
        scalar NaN that would break the later cosine computation. For input
        without any token the result is NaN, as before.
    """
    all_tokens = list(text)
    content_tokens = remove_stop_words(remove_punctuation(all_tokens))
    tokens = content_tokens or all_tokens
    return np.mean([word.vector for word in tokens], axis=0)

from scipy.spatial.distance import cosine

def cosine_similarity(vector_1: np.array, vector_2: np.array) -> float:
    """Return the cosine similarity, i.e. one minus SciPy's cosine distance."""
    return 1 - cosine(vector_1, vector_2)

def find_most_similar_document_cosine(sentence_df, data_point: np.array):
    """Return the sentence in ``sentence_df`` whose vector is closest to ``data_point``.

    Args:
        sentence_df: frame with a ``sentence`` column and a
            ``sentence_processed`` column holding one vector per row. Its
            index may have gaps (callers pass a frame with one row filtered
            out). The frame is not modified.
        data_point: vector to compare every row against.

    Returns:
        The ``sentence`` value of the row with the highest cosine similarity
        (first one on ties), or ``None`` when the frame is empty or no
        similarity could be computed (all NaN).
    """
    scores = sentence_df["sentence_processed"].apply(
        lambda candidate: cosine_similarity(candidate, data_point)
    )
    # idxmax() returns an index *label*. Renumbering the scores 0..n-1 makes
    # that label a row *position*, which is what .iloc expects; using the
    # original labels with .iloc picks the wrong row (or overflows) as soon
    # as the index has gaps.
    positional_scores = scores.reset_index(drop=True)
    if positional_scores.empty or not positional_scores.notna().any():
        return None
    best_position = positional_scores.idxmax()
    return sentence_df["sentence"].iloc[best_position]

In [None]:
def _pair_with_most_similar(sentences):
    """Pair every sentence with its most similar *other* sentence in the bucket.

    Returns a frame with the columns ``sentence`` and ``similar_sentence``,
    one row per input sentence. The rows are collected in a list and the frame
    is built once, instead of growing it with ``pd.concat`` inside the loop.
    """
    bucket_df = pd.DataFrame({"sentence": sentences})
    bucket_df["sentence_processed"] = bucket_df["sentence"].apply(preprocess_cosine_similarity)
    records = [
        (
            row["sentence"],
            # Compare against every row except the sentence itself.
            find_most_similar_document_cosine(
                bucket_df[bucket_df.index != index], row["sentence_processed"]
            ),
        )
        for index, row in bucket_df.iterrows()
    ]
    return pd.DataFrame(records, columns=["sentence", "similar_sentence"])


# Results for every word are kept here; `conservative_final_df` and
# `liberal_final_df` end up holding the results of the last word only.
most_similar_by_word = {}
for word in similar_sentences_group:
    conservative_final_df = _pair_with_most_similar(similar_sentences_group[word]["conservative"])
    liberal_final_df = _pair_with_most_similar(similar_sentences_group[word]["liberal"])
    most_similar_by_word[word] = {
        "conservative": conservative_final_df,
        "liberal": liberal_final_df,
    }

# Preview for the last word processed; rows are numbered 0..n-1.
print(conservative_final_df.tail(5))
print(liberal_final_df.tail(5))
    