In [None]:
import nltk
import re
import json
import pandas as pd
from nltk.util import ngrams
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

In [None]:
def load_json(filename):
    """Read a JSON list of combined strings and split each into two parts.

    Every entry is expected to look like ``"<submission> [======>] <comments>"``.
    Only the first separator is significant; anything after it (including
    further separators) belongs to the comments part.

    Args:
        filename: Path of a UTF-8 encoded JSON file containing a list of strings.

    Returns:
        A list of ``[submission, comments]`` pairs with surrounding whitespace
        stripped. Entries without the separator yield ``[entry, ""]``.
    """
    separator = " [======>] "

    def split_entry(entry):
        # Malformed entry: treat the whole string as the submission.
        if separator not in entry:
            return [entry.strip(), ""]
        submission, comments = entry.split(separator, 1)
        return [submission.strip(), comments.strip()]

    with open(filename, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    return [split_entry(entry) for entry in entries]

In [None]:
def preprocess(text):
    """Normalise a raw text string for counting and tokenisation.

    Steps, in order: lower-case, drop the HTML entities ``&gt;`` / ``&amp;``,
    drop anything in square brackets, drop URLs, drop every character that is
    not a lower-case letter, digit or whitespace, and finally collapse runs of
    whitespace into single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, with single spaces between words and no leading
        or trailing whitespace.
    """
    text = text.lower()  # convert text to lower-case
    text = re.sub('&gt;', '', text)  # escaped ">" left over in the data
    text = re.sub('&amp;', '', text)  # escaped "&" left over in the data
    text = re.sub(r'\[[^]]*\]', '', text)  # remove text in square brackets
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'[^a-z0-9\s]', '', text)  # remove punctuation (keep letters, digits, whitespace)
    # Whitespace is normalised last: every removal above can leave a gap behind,
    # so collapsing earlier would let doubled and trailing spaces through.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
def count_nta_yta(strings):
    """Count whole-word ``nta`` and ``yta`` occurrences in each string.

    Matching is case-sensitive and uses word boundaries, so ``"ntah"`` is not
    counted.

    Args:
        strings: Iterable of text strings.

    Returns:
        A list with one ``(nta_count, yta_count)`` tuple per input string, in
        input order.
    """
    nta_pattern = re.compile(r'\bnta\b')
    yta_pattern = re.compile(r'\byta\b')
    return [
        (len(nta_pattern.findall(text)), len(yta_pattern.findall(text)))
        for text in strings
    ]

In [None]:
# The stop-word list is read as soon as this cell runs and WordNet is needed the
# first time lemmatize_tokens is called, so fetch both resources here; otherwise
# a fresh environment fails with LookupError on the stopwords line below.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def lemmatize_tokens(tokens):
    """Lower-case, drop English stop words and lemmatise a list of tokens.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list with the lemma of every token whose lower-cased form is not in
        ``stop_words``, in the original order.
    """
    lowered = (word.lower() for word in tokens)
    return [lemmatizer.lemmatize(word) for word in lowered if word not in stop_words]

In [None]:
# Load the dataset as a list of [submission, comments] pairs (see load_json).
corpus = load_json("./data/output.json")

In [None]:
# Separate the pairs into two parallel lists: submission texts and comment texts.
corpus_submissions = [pair[0] for pair in corpus]
corpus_comment = [pair[1] for pair in corpus]

In [None]:
# Work on the first 15 entries only; both lists are cut identically so they
# stay aligned. Re-running this cell leaves the lists unchanged.
corpus_submissions = corpus_submissions[0:15]
corpus_comment = corpus_comment[0:15]

In [None]:
# Clean every comment string with preprocess(); order matches corpus_comment.
preprocessed_comments = list(map(preprocess, corpus_comment))

In [None]:
# One (nta_count, yta_count) tuple per preprocessed comment string.
nta_yta = count_nta_yta(preprocessed_comments)

In [None]:
# NTA share per submission: nta / (nta + yta). Submissions in which neither
# verdict was found get 0 to avoid dividing by zero.
proportions_ratios = [p[0] / sum(p) if sum(p) != 0 else 0 for p in nta_yta]
plt.hist(proportions_ratios)
# Report the real sample size instead of a hard-coded number, so the title
# stays correct when the corpus is truncated or extended.
plt.title(f"NTA proportion ({len(proportions_ratios)} submissions)")
plt.xlabel("NTA share of NTA + YTA mentions")
plt.ylabel("Number of submissions")
plt.grid()

In [None]:
# Fetch the NLTK resources used in this notebook: the stop-word corpus, the
# punkt tokenizer data (word_tokenize) and the WordNet data (WordNetLemmatizer).
# NOTE(review): stopwords.words('english') is already called in an earlier cell,
# so on a machine without the corpus that cell needs the data before this one runs.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# Tokenize the comments and remove the stopwords that have yta proportion < 0.5
all_words_yta = [] # list of lists holding our dataset (each list corresponds to a comment and it includes the tokenized words)
for comment in range(len(preprocessed_comments)):
    if proportions_ratios[comment] >= 0.5:
        # tokenize the comments and remove stopwords
        all_words_yta.append([ w for w in word_tokenize(preprocessed_comments[comment]) if w not in stop_words])

        # all_words_yta.append([ w for w in word_tokenize(preprocessed_comments[comment]) if w not in stop_words])

In [None]:
# Tokenize the comments and remove the stopwords that have nta proportion > 0.5
all_words_nta = []
for comment in range(len(preprocessed_comments)):
    if proportions_ratios[comment] < 0.5:
        all_words_nta.append([ w for w in word_tokenize(preprocessed_comments[comment])])

In [None]:
# Lemmatise every token list; lemmatize_tokens also drops stop words.
lemmatized_yta = list(map(lemmatize_tokens, all_words_yta))
lemmatized_nta = list(map(lemmatize_tokens, all_words_nta))

In [None]:
# Flatten the per-document n-grams (n = 2, 3, 4) into one list per group.
# N-grams are built inside each document, so none spans two documents.
all_bigrams_yta = [gram for tokens in lemmatized_yta for gram in ngrams(tokens, 2)]
all_trigrams_yta = [gram for tokens in lemmatized_yta for gram in ngrams(tokens, 3)]
all_fourgrams_yta = [gram for tokens in lemmatized_yta for gram in ngrams(tokens, 4)]

all_bigrams_nta = [gram for tokens in lemmatized_nta for gram in ngrams(tokens, 2)]
all_trigrams_nta = [gram for tokens in lemmatized_nta for gram in ngrams(tokens, 3)]
all_fourgrams_nta = [gram for tokens in lemmatized_nta for gram in ngrams(tokens, 4)]

In [None]:
# Distinct YTA n-grams, kept as real sets: they are only used for `in` checks,
# and a set answers those in constant time on average whereas a list is scanned
# element by element.
all_bigrams_yta_set = set(all_bigrams_yta)
all_trigrams_yta_set = set(all_trigrams_yta)
all_fourgrams_yta_set = set(all_fourgrams_yta)

In [None]:
# Keep only the NTA bi-, tri- and fourgrams that never occur in the YTA group.
# The lookups are wrapped in set() so membership tests stay fast whatever
# container the *_yta_set names hold.
yta_bigram_lookup = set(all_bigrams_yta_set)
yta_trigram_lookup = set(all_trigrams_yta_set)
yta_fourgram_lookup = set(all_fourgrams_yta_set)

filtered_bigrams_nta = [bg for bg in all_bigrams_nta if bg not in yta_bigram_lookup]
print("bigrams done")
filtered_trigrams_nta = [tg for tg in all_trigrams_nta if tg not in yta_trigram_lookup]
print("trigrams done")
filtered_fourgrams_nta = [fg for fg in all_fourgrams_nta if fg not in yta_fourgram_lookup]
print("fourgrams done")

# Legacy name kept for any code that still reads it; it now holds the bigrams.
filtered_bigrams = filtered_bigrams_nta

In [None]:
# Rank the 20 most frequent bi-, tri- and fourgrams of the NTA group with
# collections.Counter.
# NOTE(review): the counts are taken from the unfiltered all_*_nta lists; the
# filtered lists built in the preceding cell are not used here - confirm.
top_bigrams, top_trigrams, top_fourgrams = (
    Counter(grams).most_common(20)
    for grams in (all_bigrams_nta, all_trigrams_nta, all_fourgrams_nta)
)

print(top_bigrams, top_trigrams, top_fourgrams, sep="\n")

In [None]:
# Keep only the YTA bi-, tri- and fourgrams that never occur in the NTA group.
# The distinct NTA n-grams are stored as real sets so each `in` check is a
# constant-time lookup on average instead of a scan through a list.
all_bigrams_nta_set = set(all_bigrams_nta)
all_trigrams_nta_set = set(all_trigrams_nta)
all_fourgrams_nta_set = set(all_fourgrams_nta)

filtered_bigrams_yta = [bg for bg in all_bigrams_yta if bg not in all_bigrams_nta_set]
print("bigrams done")
filtered_trigrams_yta = [tg for tg in all_trigrams_yta if tg not in all_trigrams_nta_set]
print("trigrams done")
filtered_fourgrams_yta = [fg for fg in all_fourgrams_yta if fg not in all_fourgrams_nta_set]
print("fourgrams done")

# Legacy name kept for any code that still reads it; it now holds the bigrams.
filtered_bigrams = filtered_bigrams_yta

In [None]:
# Rank the 20 most frequent bi-, tri- and fourgrams of the YTA group with
# collections.Counter.
# NOTE(review): the counts are taken from the unfiltered all_*_yta lists; the
# filtered lists built in the preceding cell are not used here - confirm.
top_bigrams, top_trigrams, top_fourgrams = (
    Counter(grams).most_common(20)
    for grams in (all_bigrams_yta, all_trigrams_yta, all_fourgrams_yta)
)

print(top_bigrams, top_trigrams, top_fourgrams, sep="\n")