### 1. Import Dependencies

In [1]:
import os
import numpy as np
import math
import spacy
from collections import defaultdict, Counter
import nltk
from nltk import FreqDist, word_tokenize, bigrams, ngrams
from nltk.corpus import stopwords
import nltk
import requests
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from tqdm import tqdm
# import praw
# from bs4 import BeautifulSoup

In [2]:
# from IPython.display import display, HTML
# display(HTML('<style>div.output_scroll { height: 44em; }</style>'))

In [3]:
# Keep NLTK resources in a project-local folder (<cwd>/venv/nltk_data) and add
# that folder to NLTK's search path so the downloads below can be found later.
nltk_data_dir = os.path.join(os.getcwd(), 'venv', 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)
# 'punkt' / 'punkt_tab': tokenizer data; 'stopwords': the corpus read via
# stopwords.words('english') in the next cell.
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('punkt_tab', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)

[nltk_data] Downloading package punkt to D:\CS Dev\CS
[nltk_data]     Stuff\Projects\Scripts\RedditRec\venv\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to D:\CS Dev\CS
[nltk_data]     Stuff\Projects\Scripts\RedditRec\venv\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to D:\CS Dev\CS
[nltk_data]     Stuff\Projects\Scripts\RedditRec\venv\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# NLTK's English stop words. NOTE: `stop_words` is rebound to `custom_stop_words`
# in the phrase-extraction cell further down, so this value only applies until then.
stop_words = set(stopwords.words('english'))

In [5]:
# spaCy English pipeline; used by spacy_tokenizer and as the source of spaCy's stop-word list.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Default stop-word set shipped with the loaded spaCy language.
spacy_stop_words = nlp.Defaults.stop_words

In [7]:
# Union of spaCy's and scikit-learn's English stop-word lists.
custom_stop_words = spacy_stop_words.union(ENGLISH_STOP_WORDS)

### 2. Fetch Reddit JSON

In [8]:
# Thread whose comments are analysed below; swap in the commented URL to try another thread.
url = 'https://www.reddit.com/r/kdramarecommends/comments/17ehrue/best_kdramas/'
# url = 'https://www.reddit.com/r/televisionsuggestions/comments/1cfrjwa/must_watch_tv_shows/'

In [9]:
# Appending '.json' to the thread URL requests the thread as JSON instead of
# HTML, so no scraping is needed.
headers = {'User-Agent': 'Mozilla/5.0'}
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url + '.json', headers=headers, timeout=30)

if response.status_code == 200:
    data = response.json()
    print('Successfully fetched JSON data!')
else:
    # Stop here: `data` would be undefined, and every later cell would fail
    # with a NameError that hides the real cause.
    raise RuntimeError(f'Failed to fetch data (HTTP {response.status_code})')

Successfully fetched JSON data!


In [10]:
# data

### 3. Extract Comments

In [11]:
def extract_all_comments(json_data, comments_list):
    """Collect every comment found anywhere inside a decoded JSON structure.

    Dicts and lists are walked depth-first. A dict that has both a 'body' and
    a 'score' key contributes ``{'text': body, 'score': score}`` to
    ``comments_list`` before its own values are searched. Anything that is
    neither a dict nor a list is ignored.

    Args:
        json_data: Decoded JSON (dict, list, or scalar).
        comments_list: List that found comments are appended to, in place.

    Returns:
        None.
    """
    if isinstance(json_data, dict):
        if 'body' in json_data and 'score' in json_data:
            comments_list.append({'text': json_data['body'], 'score': json_data['score']})
        children = json_data.values()
    elif isinstance(json_data, list):
        children = json_data
    else:
        # Scalars (str, int, None, ...) hold no nested comments.
        return

    for child in children:
        extract_all_comments(child, comments_list)

In [12]:
# Flatten the fetched thread JSON into a list of {'text', 'score'} dicts.
comments = []
extract_all_comments(data, comments)

In [13]:
def clean_text(text):
    """Normalise a comment body for tokenisation and phrase matching.

    Removes URLs, then every character that is neither a word character nor
    whitespace, and finally collapses whitespace runs to a single space.

    Whitespace is collapsed last on purpose: stripping punctuation that stood
    between two spaces (e.g. the dash in "a - b") would otherwise leave a
    double space, which breaks exact-substring phrase matching later on.

    Args:
        text: Raw comment text.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace (incl. gaps left by the removals above)
    return text.strip()

In [14]:
# Clean every comment's text in place (the raw 'text' value is overwritten).
for comment in comments:
    comment['text'] = clean_text(comment['text'])

In [15]:
# Sanity check: how many comments were collected from the thread.
print(f'Total comments extracted: {len(comments)}\n')

# for comment in comments:
#     print(f'{comment}')

Total comments extracted: 88



### 4. Calculate TF-IDF Scores for Upvote Weighting

In [16]:
# Parallel lists built from `comments`: cleaned text and score, in the same order.
comment_t = [comment['text'] for comment in comments]
comment_s = [comment['score'] for comment in comments]

In [17]:
def spacy_tokenizer(text):
    """Run `text` through the module-level spaCy pipeline and return lemmas.

    A token's lemma is kept only when the token is not a stop word and is
    purely alphabetic.

    Args:
        text: Text to tokenise.

    Returns:
        List of lemma strings, in document order.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return lemmas

In [18]:
# TF-IDF over the cleaned comments, with spacy_tokenizer supplying the terms.
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer)
tfidf_mx = vectorizer.fit_transform(comment_t)
# Weight each comment's row by that comment's score; the scores are reshaped
# into a column so there is one weight per row.
w_tfidf_mx = tfidf_mx.multiply(np.array(comment_s).reshape(-1, 1))

# print(w_tfidf_mx)



In [19]:
# Sum the score-weighted matrix over all comments to get one total per term.
term_scores = w_tfidf_mx.sum(axis=0)
terms = vectorizer.get_feature_names_out()
# term_scores is indexed as [0, i]: a single row holding one total per term.
term_scores_dict = {terms[i]: term_scores[0, i] for i in range(len(terms))}
# Highest weighted total first.
sorted_terms = sorted(term_scores_dict.items(), key=lambda x: x[1], reverse=True)

print('\nTop 10 Terms Weighted by Upvotes and TF-IDF:')
for term, score in sorted_terms[:10]:
    print(f'{term}: {score}')


Top 10 Terms Weighted by Upvotes and TF-IDF:
crash: 18.948682870281164
landing: 18.591948522936583
love: 18.543791923322075
adore: 17.285165707210997
blindly: 17.285165707210997
forever: 17.285165707210997
beginner: 16.781174297886682
mr: 14.92813530519487
mister: 13.271694012395832
goblin: 10.53226674261094


### 5. Co-occurrence Detection for Phrase Extraction

In [20]:
# def extract_named_entities(comments):
#     entity_counts = defaultdict(Counter)
#     all_entities = []
    
#     for comment in tqdm(comments, desc='Extracting named entities'):
#         doc = nlp(comment['text'])
        
#         for ent in doc.ents:
#             entity_text = ent.text.lower()
#             entity_type = ent.label_
            
#             entity_counts[entity_type][entity_text] += comment['score']
#             all_entities.append({
#                 'text': entity_text,
#                 'type': entity_type,
#                 'score': comment['score']
#             })
    
#     return entity_counts, all_entities

In [21]:
# entity_counts, all_entities = extract_named_entities(comments)

# print('\nEntity type distribution:')
# for entity_type, counts in entity_counts.items():
#     print(f'\n{entity_type}:')
#     sorted_entities = sorted(counts.items(), key=lambda x: x[1], reverse=True)[:5]
#     for entity, score in sorted_entities:
#         print(f'  - {entity}: {score}')

In [22]:
# def filter_relevant_entities(entity_counts, relevant_types=None):
#     if relevant_types is None:
#         relevant_types = {'WORK_OF_ART', 'PERSON', 'PRODUCT', 'EVENT'}
    
#     filtered_entities = {}
#     for etype, counts in entity_counts.items():
#         if etype in relevant_types:
#             filtered_entities[etype] = counts
    
#     return filtered_entities

In [23]:
# filtered_entities = filter_relevant_entities(entity_counts)

# print('\nFiltered relevant entities:')
# for entity_type, counts in filtered_entities.items():
#     print(f'\n{entity_type}:')
#     sorted_entities = sorted(counts.items(), key=lambda x: x[1], reverse=True)[:5]
#     for entity, score in sorted_entities:
#         print(f'  - {entity}: {score}')

In [24]:
# def verify_phrases_with_ner(phrases, entities, threshold=0.3):
#     verified_phrases = []
#     entity_texts = [e['text'] for e in entities]
    
#     for phrase in phrases:
#         phrase_doc = nlp(phrase.lower())
        
#         if phrase.lower() in entity_texts:
#             verified_phrases.append(phrase)
#             continue
            
#         for entity in entities:
#             entity_doc = nlp(entity['text'])
#             if phrase_doc.similarity(entity_doc) > threshold:
#                 verified_phrases.append(phrase)
#                 break
    
#     return verified_phrases

In [25]:
def extract_common_phrases(comments, ngram_limit=10, min_occurrences=3):
    """Return the word n-grams that recur across the comments.

    Each comment's 'text' is tokenised with NLTK and every n-gram of 2 up to
    ``ngram_limit`` tokens is counted (case-sensitively), except n-grams whose
    words are all in the module-level ``stop_words`` set.

    Args:
        comments: Iterable of dicts with a 'text' key.
        ngram_limit: Largest n-gram size to consider.
        min_occurrences: Minimum count for an n-gram to be returned.

    Returns:
        List of n-gram tuples, in order of first appearance.
    """
    counts = Counter()

    for comment in comments:
        tokens = nltk.word_tokenize(comment['text'])

        for size in range(2, ngram_limit + 1):
            # Keep an n-gram as soon as one of its words is not a stop word.
            counts.update(
                gram
                for gram in ngrams(tokens, size)
                if any(word.lower() not in stop_words for word in gram)
            )

    return [gram for gram, seen in counts.items() if seen >= min_occurrences]

In [26]:
def tuple_to_string(phrase_tuple):
    """Join the words of an n-gram tuple into one space-separated string."""
    separator = ' '
    return separator.join(phrase_tuple)

In [27]:
def is_meaningful_phrase(phrase):
    """Tell whether a phrase has at least two words starting with an uppercase letter.

    Args:
        phrase: Iterable of word strings (e.g. the result of ``str.split()``).

    Returns:
        True if two or more words begin with an uppercase character.
    """
    # word[:1] rather than word[0]: an empty string counts as not capitalised
    # instead of raising IndexError.
    capitalized_count = sum(1 for word in phrase if word[:1].isupper())
    return capitalized_count >= 2

In [28]:
def remove_substrings(phrases):
    """Drop phrases that are contained, ignoring case, in a longer kept phrase.

    Phrases are visited from longest to shortest (by character count); a
    phrase is kept only if its lowercase form is not a substring of any phrase
    kept so far.

    Args:
        phrases: Iterable of phrase strings.

    Returns:
        Set of the surviving phrases.
    """
    kept = set()

    for candidate in sorted(phrases, key=len, reverse=True):
        needle = candidate.lower()
        for existing in kept:
            if needle in existing.lower():
                break
        else:
            # No kept phrase contains this one.
            kept.add(candidate)

    return kept

In [29]:
def remove_grammar(phrases):
    """Trim filler words from the edges of each phrase.

    For every phrase, in this order: one leading all-digit token is dropped,
    then one leading grammar word (a / an / the / i, any case), then one
    trailing grammar word.

    Phrases that end up empty are discarded, and a trimmed phrase that has
    already been produced is not emitted again. Duplicates matter downstream:
    a list vocabulary with a repeated term makes TfidfVectorizer raise.

    Args:
        phrases: Iterable of phrase strings.

    Returns:
        List of trimmed, non-empty, unique phrases in first-seen order.
    """
    grammar = {'a', 'an', 'the', 'i'}
    processed_phrases = []
    seen = set()

    for phrase in phrases:
        words = phrase.split()

        # Leading number such as a list index ("1 The Red Sleeve").
        if words and re.match(r'^\d+$', words[0]):
            words = words[1:]

        # Remove leading grammar words
        if words and words[0].lower() in grammar:
            words = words[1:]

        # Remove trailing grammar words
        if words and words[-1].lower() in grammar:
            words = words[:-1]

        cleaned = ' '.join(words)
        if cleaned and cleaned not in seen:
            seen.add(cleaned)
            processed_phrases.append(cleaned)

    return processed_phrases

In [30]:
def remove_custom_words(phrase, custom_words):
    """Return `phrase` without the words whose lowercase form is in `custom_words`.

    Args:
        phrase: Phrase string; split on whitespace.
        custom_words: Container of lowercase words to drop.

    Returns:
        The remaining words joined by single spaces.
    """
    kept = filter(lambda word: word.lower() not in custom_words, phrase.split())
    return ' '.join(kept)

In [31]:
def remove_common_only_phrases(phrases, stop_words):
    """Keep phrases of more than two words that are not made up solely of stop words.

    Args:
        phrases: Iterable of phrase strings.
        stop_words: Container of lowercase stop words.

    Returns:
        List of the phrases that pass both checks, in input order.
    """
    def is_informative(phrase):
        words = phrase.split()
        if len(words) <= 2:
            return False
        return any(word.lower() not in stop_words for word in words)

    return [phrase for phrase in phrases if is_informative(phrase)]

In [32]:
def remove_all_lowercase_phrases(phrases):
    """Keep only phrases with at least one informative capitalised word.

    A word counts when its first character is uppercase and it is not one of
    the common sentence-starting words listed below (articles, conjunctions,
    prepositions, pronouns, question words).

    Args:
        phrases: Iterable of phrase strings.

    Returns:
        List of the phrases that contain such a word, in input order.
    """
    articles_and_common_words = {
        'A', 'An', 'The', 'And', 'But', 'Or', 'So', 'Because', 'However', 'If', 'In', 'On', 'At', 
        'For', 'By', 'To', 'From', 'With', 'About', 'Over', 'Under', 'Before', 'After', 
        'I', 'Ive', 'He', 'Hes', 'She', 'Shes', 'It', 'Its', 'They', 'Theyve', 'We', 'Weve', 'This', 'That', 'These', 'Those', 'Then', 
        'Now', 'Here', 'There', 'What', 'When', 'Where', 'Why', 'How', 'Who', 'Which'
    }

    def has_informative_capital(phrase):
        return any(
            word[0].isupper() and word not in articles_and_common_words
            for word in phrase.split()
        )

    return [phrase for phrase in phrases if has_informative_capital(phrase)]

In [33]:
def post_process_ngrams(phrases):
    """Turn n-gram tuples into a de-duplicated set of candidate phrase strings.

    Each tuple is joined into a string. Phrases that differ only by case are
    collapsed to the first spelling encountered. A phrase is then kept when it
    has at least two capitalised words or more than two words.

    Args:
        phrases: Iterable of n-gram tuples.

    Returns:
        Set of phrase strings.
    """
    seen_lower = set()
    unique = []

    for text in map(tuple_to_string, phrases):
        key = ' '.join(word.lower() for word in text.split())
        if key in seen_lower:
            continue
        seen_lower.add(key)
        unique.append(text)

    return {
        text
        for text in unique
        if is_meaningful_phrase(text.split()) or len(text.split()) > 2
    }

In [34]:
def combine_similar_phrases(phrases, similarity_threshold=3):
    """Merge phrases that share their first or last ``similarity_threshold`` words.

    Two phrases are merged when their leading word slices are equal (the
    second phrase's remaining words are appended to the first), or otherwise
    when their trailing slices are equal (the first phrase's words before the
    shared ending are placed in front of the second phrase).

    Each phrase takes part in at most one merge per pass, and passes repeat
    until nothing merges, so chains of related phrases collapse step by step.
    A merge is never discarded just because a later, unrelated phrase fails to
    match; the absorbed phrase always contributes to the result.

    Args:
        phrases: Iterable of phrase strings.
        similarity_threshold: Number of leading/trailing words that must match.

    Returns:
        Set of phrases after merging.
    """
    combined_phrases = list(phrases)
    merged = True

    while merged:
        merged = False
        new_phrases = []
        skip = set()  # indices already absorbed into an earlier phrase this pass

        for i, phrase in enumerate(combined_phrases):
            if i in skip:
                continue

            phrase_words = phrase.split()
            combined_phrase = phrase

            # Only look forward: matching is symmetric, so any pair involving an
            # earlier phrase was already considered when that phrase was visited.
            for j in range(i + 1, len(combined_phrases)):
                if j in skip:
                    continue

                other_phrase_words = combined_phrases[j].split()

                if phrase_words[:similarity_threshold] == other_phrase_words[:similarity_threshold]:
                    combined_phrase = ' '.join(phrase_words + other_phrase_words[similarity_threshold:])
                elif phrase_words[-similarity_threshold:] == other_phrase_words[-similarity_threshold:]:
                    combined_phrase = ' '.join(phrase_words[:-similarity_threshold] + other_phrase_words)
                else:
                    continue

                skip.add(j)
                merged = True
                # Stop after one merge; the next pass compares the merged phrase.
                break

            new_phrases.append(combined_phrase)

        combined_phrases = new_phrases

    return set(combined_phrases)

In [35]:
def final_post_process(phrases):
    """Run the full clean-up pipeline over candidate phrases.

    Steps, in order: drop phrases contained in longer ones, strip the
    module-level ``custom_words``, merge phrases with a shared start or end,
    drop short or stop-word-only phrases (using the module-level
    ``stop_words``), drop phrases without an informative capitalised word, and
    trim grammar words from the edges.

    Args:
        phrases: Iterable of phrase strings.

    Returns:
        The list produced by ``remove_grammar``.
    """
    without_substrings = remove_substrings(phrases)
    without_custom = [remove_custom_words(phrase, custom_words) for phrase in without_substrings]
    merged = combine_similar_phrases(without_custom)
    informative = remove_common_only_phrases(merged, stop_words)
    capitalised = remove_all_lowercase_phrases(informative)
    return remove_grammar(capitalised)

In [36]:
# Words stripped from candidate phrases by final_post_process (read as a global there).
custom_words = {'netflix', 'viki', 'mdl', 'rating', 'ml', 'fl'}
# Rebinds the module-level `stop_words` (NLTK's list until now) to the combined
# spaCy + scikit-learn set; extract_common_phrases and final_post_process read this global.
stop_words = custom_stop_words

# Candidate n-grams of 2 to 5 tokens that occur at least twice.
common_phrases = extract_common_phrases(comments, ngram_limit=5, min_occurrences=2)
common_phrases = post_process_ngrams(common_phrases)
common_phrases = final_post_process(common_phrases)

common_phrases

['Crowned Clown 84',
 'King the Land',
 'King of the Land',
 'Flower of Evil',
 'Our Beloved Summer',
 'Strong Woman Do Bong Soon',
 'Love To Hate You',
 'Extraordinary Attorney Woo',
 'Destined with you',
 'Weightlifting Fairy Kim Bok',
 'Kim Bok Joo',
 'Move to Heaven',
 'Crowned Clown',
 'Red Sleeve',
 'Welcome to Waikiki',
 'Hometown Cha Cha Cha',
 'Moon Lovers Scarlet Heart',
 'Crash Landing on You',
 'My Love from',
 'Business Proposal',
 'Good Bad Mother']

### 6. Compute Top 3 Recommendations using TF-IDF + Upvotes

In [37]:
def phrase_tfidf(phrases, comments, ngram_limit=5):
    """Sum each phrase's TF-IDF weight over all comments.

    The phrases are used as a fixed, case-sensitive vocabulary, so only exact
    word sequences in the comment texts are scored.

    Args:
        phrases: Iterable of phrase strings (list or set).
        comments: Iterable of dicts with a 'text' key.
        ngram_limit: N-gram size used when the phrases were extracted; the
            vectorizer looks at n-grams of up to ``ngram_limit + 2`` words, or
            up to the longest phrase if that is longer.

    Returns:
        Dict mapping phrase -> summed TF-IDF score. Empty when there are no
        usable phrases or no comments.
    """
    comment_texts = [comment['text'] for comment in comments]

    # A list vocabulary with repeated terms makes TfidfVectorizer raise, and an
    # empty vocabulary is rejected too, so de-duplicate and drop empty phrases.
    vocabulary = [phrase for phrase in dict.fromkeys(phrases) if phrase]
    if not vocabulary or not comment_texts:
        return {}

    # Merged phrases can be longer than ngram_limit + 2 words; widen the range
    # so they can still be matched.
    longest_phrase = max(len(phrase.split()) for phrase in vocabulary)

    vectorizer = TfidfVectorizer(
        vocabulary=vocabulary,
        ngram_range=(1, max(ngram_limit + 2, longest_phrase)),
        lowercase=False,
        # The default pattern drops one-character tokens, so a phrase containing
        # e.g. "I" or "a" could never match; keep every word token instead.
        token_pattern=r'(?u)\b\w+\b',
    )
    tfidf_matrix = vectorizer.fit_transform(comment_texts)

    tfidf_scores = np.sum(tfidf_matrix.toarray(), axis=0)
    phrase_tfidf_map = dict(zip(vectorizer.get_feature_names_out(), tfidf_scores))

    return phrase_tfidf_map

In [38]:
def compute_phrase_upvotes(phrases, comments):
    """Total the scores of the comments that mention each phrase.

    A comment mentions a phrase when the lowercase phrase is a substring of
    the comment's lowercase text.

    Args:
        phrases: Iterable of phrase strings.
        comments: Iterable of dicts with 'text' and 'score' keys.

    Returns:
        defaultdict(int) mapping phrase -> summed score; phrases that are never
        mentioned have no entry.
    """
    totals = defaultdict(int)

    for entry in comments:
        haystack = entry['text'].lower()
        mentioned = (phrase for phrase in phrases if phrase.lower() in haystack)
        for phrase in mentioned:
            totals[phrase] += entry['score']

    return totals

In [39]:
def top_phrases_combined(phrases, comments, top_n=10, ngram_limit=5):
    """Rank phrases by summed TF-IDF score plus summed upvotes.

    Args:
        phrases: Iterable of phrase strings.
        comments: Iterable of dicts with 'text' and 'score' keys.
        top_n: Maximum number of phrases to return.
        ngram_limit: Passed through to ``phrase_tfidf``.

    Returns:
        List of ``(phrase, combined_score)`` tuples, highest score first,
        with at most ``top_n`` entries.
    """
    tfidf_by_phrase = phrase_tfidf(phrases, comments, ngram_limit)
    upvotes_by_phrase = compute_phrase_upvotes(phrases, comments)

    # A phrase missing from either map contributes 0 for that component.
    combined_scores = {
        phrase: tfidf_by_phrase.get(phrase, 0) + upvotes_by_phrase.get(phrase, 0)
        for phrase in phrases
    }

    ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

In [40]:
# Three highest-scoring phrases (TF-IDF sum + upvote sum) from the extracted candidates.
top_phrases = top_phrases_combined(common_phrases, comments, top_n=3)
top_phrases

[('Crash Landing on You', np.float64(92.18636027299029)),
 ('Our Beloved Summer', np.float64(48.42416876922289)),
 ('Business Proposal', np.float64(31.2676855558996))]

### 7. Final Function

In [45]:
def get_top_reddit_phrases():
    """Interactively extract and print the top phrases of a Reddit thread.

    Prompts for the thread URL, the number of phrases, extra words to exclude,
    whether to drop all-lowercase phrases, the maximum n-gram length and
    whether to print scores. Then fetches the thread JSON, cleans the
    comments, extracts recurring phrases (lowering the occurrence threshold
    down to 2 until enough phrases are found) and prints them ranked by
    TF-IDF + upvotes.

    Relies on the helper functions defined earlier in the notebook and, via
    ``final_post_process``, on the module-level ``custom_words`` and
    ``stop_words``.

    Returns:
        None. Results are printed; on a failed fetch or when no phrase is
        found a message is printed and the function returns early.
    """
    url = input('Please enter the Reddit URL: ').strip()
    url = url if url else 'https://www.reddit.com/r/kdramarecommends/comments/17ehrue/best_kdramas/'

    # isdecimal() (not isdigit()) so that only strings int() can parse are accepted;
    # zero is rejected because asking for no phrases is meaningless.
    top_n = input('How many top phrases would you like to return? (default is 3): ').strip()
    top_n = int(top_n) if top_n.isdecimal() and int(top_n) > 0 else 3

    custom_words_input = input('Enter custom words to exclude (comma separated, leave empty for none): ')
    # Strip each entry so "netflix, viki" yields 'viki' rather than ' viki'.
    user_custom_words = {
        word.strip() for word in custom_words_input.lower().split(',') if word.strip()
    }

    apply_remove_lowercase = input('Do you want to remove phrases that are all lowercase? (y/n, default is y): ').strip().lower() != 'n'

    # Phrases are n-grams of at least 2 words, so smaller limits fall back to the default.
    ngram_limit = input('What is the maximum n-gram length? (default is 5): ').strip()
    ngram_limit = int(ngram_limit) if ngram_limit.isdecimal() and int(ngram_limit) >= 2 else 5

    print_scores = input('Would you like to print scores? (y/n, default is n): ').strip().lower() == 'y'

    # Step 1: Fetch Reddit JSON
    print('\nStep (1/4): Fetching Reddit JSON data...')
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Drop any query string (e.g. share-link tracking parameters) so that
    # '.json' is appended to the path rather than to a parameter value.
    json_url = url.split('?', 1)[0] + '.json'

    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(json_url, headers=headers, timeout=30)
    except requests.RequestException:
        print('Failed to fetch data from Reddit')
        return None

    if response.status_code != 200:
        print('Failed to fetch data from Reddit')
        return None

    data = response.json()
    print('Done.\n')

    # Step 2: Extract and Clean Comments
    print('Step (2/4): Extracting and cleaning comments...')
    comments = []
    extract_all_comments(data, comments)

    for comment in tqdm(comments, desc='Cleaning comments'):
        comment['text'] = clean_text(comment['text'])
    print('Done.\n')

    # Step 3: Extract Common Phrases
    print('Step (3/4): Extracting common phrases...')
    # Start with a threshold proportional to the thread size and relax it
    # (never below 2) until at least top_n phrases have been collected.
    min_occurrences = max(math.ceil(len(comments) / 40), 2)
    all_common_phrases = set()
    all_common_phrases_lower = set()

    while min_occurrences >= 2:
        common_phrases = extract_common_phrases(comments, ngram_limit=ngram_limit, min_occurrences=min_occurrences)
        common_phrases = post_process_ngrams(common_phrases)
        common_phrases = final_post_process(common_phrases)

        # Apply the words entered at the prompt, on top of the notebook-level
        # `custom_words` that final_post_process already removed.
        if user_custom_words:
            common_phrases = [remove_custom_words(phrase, user_custom_words) for phrase in common_phrases]

        # NOTE(review): final_post_process already applies
        # remove_all_lowercase_phrases unconditionally, so answering 'n' cannot
        # currently bring all-lowercase phrases back.
        if apply_remove_lowercase:
            common_phrases = remove_all_lowercase_phrases(common_phrases)

        # Skip empty phrases (an empty string is a substring of every comment and
        # would collect every upvote) and keep one spelling per
        # case-insensitive phrase, including within this round.
        new_phrases = set()
        for phrase in common_phrases:
            phrase_lower = phrase.lower()
            if phrase and phrase_lower not in all_common_phrases_lower:
                all_common_phrases_lower.add(phrase_lower)
                new_phrases.add(phrase)
        all_common_phrases.update(new_phrases)

        print(f'Found {len(new_phrases)} unique phrases with min_occurrences={min_occurrences}. Total: {len(all_common_phrases)} phrases.')

        if len(all_common_phrases) >= top_n:
            break

        min_occurrences -= 1

    if not all_common_phrases:
        print('Comments are too varied or scattered. Unable to extract meaningful phrases.')
        return None

    print('Done.\n')

    # Step 4: Compute Top Phrases Using TF-IDF + Upvotes
    print('Step (4/4): Calculating top phrases...')
    top_phrases = top_phrases_combined(all_common_phrases, comments, top_n=top_n, ngram_limit=ngram_limit)
    print('Done.\n')

    if len(top_phrases) < top_n:
        print(f'\nOnly {len(top_phrases)} common phrases found. Displaying top {len(top_phrases)} phrases.')

    print(f'\nTop {len(top_phrases)} Phrases:')
    for idx, (phrase, score) in enumerate(top_phrases, 1):
        if print_scores:
            print(f'{idx}. {phrase}: {score}')
        else:
            print(f'{idx}. {phrase}')

In [46]:
# Run the interactive pipeline; all options are collected through input() prompts.
get_top_reddit_phrases()

Please enter the Reddit URL:  
How many top phrases would you like to return? (default is 3):  10
Enter custom words to exclude (comma separated, leave empty for none):  
Do you want to remove phrases that are all lowercase? (y/n, default is y):  
What is the maximum n-gram length? (default is 5):  
Would you like to print scores? (y/n, default is n):  y



Step (1/4): Fetching Reddit JSON data...
Done.

Step (2/4): Extracting and cleaning comments...


Cleaning comments: 100%|████████████████████████████████████████████████████████| 88/88 [00:00<?, ?it/s]

Done.

Step (3/4): Extracting common phrases...
Found 5 unique phrases with min_occurrences=3. Total: 5 phrases.
Found 16 unique phrases with min_occurrences=2. Total: 21 phrases.
Done.

Step (4/4): Calculating top phrases...
Done.


Top 10 Phrases:
1. Crash Landing On You: 89.92065576911217
2. Our Beloved Summer: 47.77858906503419
3. Business Proposal: 32.30881968270583
4. Strong Woman Do Bong Soon: 20.82337592780974
5. Love To Hate You: 20.192954255432088
6. Extraordinary Attorney Woo: 19.374665238438944
7. Weightlifting Fairy Kim Bok: 17.98521795313214
8. Kim Bok Joo: 17.297055972213187
9. Flower of Evil: 14.102375543292046
10. Destined with you: 12.414213562373096



