# Representative Words Extractor

## Setup and Installation

In [1]:
import os
import ssl

import nltk
import pandas as pd

In [2]:
# Resolve the per-user nltk data directory from the current home folder
# instead of hard-coding one machine's absolute path, so the notebook runs
# unchanged for any user.
NLTK_DATA_PATH = os.path.join(os.path.expanduser('~'), 'nltk_data')
# Only register the directory once, so re-running this cell does not keep
# stacking duplicate entries onto nltk's search path.
if NLTK_DATA_PATH not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_PATH)

In [3]:
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer
from collections import defaultdict

In [4]:
# Bypass SSL certificate verification for nltk downloads.
# NOTE: this swaps the ssl module's default HTTPS context factory, so the
# change is module-wide and not limited to nltk.
_ctx = getattr(ssl, '_create_unverified_context', None)
if _ctx is not None:
    # Interpreters without the factory are left untouched.
    ssl._create_default_https_context = _ctx

In [5]:
# Fetch every nltk resource this notebook relies on. Besides the stopword
# list and WordNet, word_tokenize needs a Punkt tokenizer model and pos_tag
# needs the averaged-perceptron tagger; without them a fresh environment
# fails with LookupError in the scoring cell. Both the older and the newer
# resource ids are requested because the id nltk looks up depends on the
# installed nltk version.
for _resource in (
    'stopwords',
    'wordnet',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chloeyamtai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chloeyamtai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading the Cleaned Comments Dataset

In [6]:
# Load the preprocessed comments. The scoring cell reads the 'comment' column
# and, when present, 'num_of_likes' and 'reply_count' from this frame.
df = pd.read_csv(filepath_or_buffer='../../data_preprocessed/comments_cleaned.csv')

## Scoring System

In [7]:
# Shared text-normalization helpers used by extract_valid_words.
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Kept as a set so the per-token stopword check is a constant-time lookup.
stop_words = {*stopwords.words('english')}

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Only adjective tags (starting with 'J') and noun tags (starting with
    'N') are mapped. Every other tag, including verbs and adverbs, yields
    None so that callers can drop those tokens.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    # Verb ('V') and adverb ('R') tags are deliberately left unmapped.
    return None

def extract_valid_words(text):
    """Return the normalized word stems extracted from *text*.

    The text is tokenized and POS-tagged. A token is kept only if
    get_wordnet_pos maps its tag to a WordNet POS, it is purely alphabetic,
    and its lowercase form is not a stopword. Each kept token is lemmatized
    and then stemmed. Stems are returned in order of appearance, with
    duplicates preserved.
    """
    stems = []
    for word, treebank_tag in pos_tag(word_tokenize(text)):
        pos = get_wordnet_pos(treebank_tag)
        if not pos or not word.isalpha():
            continue
        lowered = word.lower()
        if lowered in stop_words:
            continue
        # Lemmatize first, then stem so inflected variants merge
        root = porter.stem(lemmatizer.lemmatize(lowered, pos=pos))
        # Manually merge "-ian" adjectives into their root
        if len(root) > 4 and root.endswith('ian'):
            root = root[:-3]
        stems.append(root)
    return stems

# Accumulate an engagement-weighted score for every word across all comments.
global_scores = defaultdict(int)
for _, row in df.iterrows():
    comment = row['comment']
    # pandas reads an empty comment as NaN, and str(NaN) is "nan", which
    # would otherwise be tokenized like ordinary text. Skip such rows.
    if pd.isna(comment):
        continue
    text = str(comment)

    # A missing count is treated as 0 rather than letting int(NaN) raise.
    likes = row.get('num_of_likes', 0)
    likes = 0 if pd.isna(likes) else int(likes)
    replies = row.get('reply_count', 0)
    replies = 0 if pd.isna(replies) else int(replies)

    # Base score
    score = 1
    # Likes-based scoring
    if likes > 1000:
        score += 4
    elif likes > 100:
        score += 3
    else:
        score += 2
    # Replies-based scoring
    if replies > 0:
        score += 2

    # Add score to each valid word (once per occurrence in the comment)
    for word in extract_valid_words(text):
        global_scores[word] += score

# Rank every scored word from highest to lowest total score. Words with equal
# scores keep the order in which they were first seen.
complete_ranking = sorted(
    global_scores.items(),
    key=lambda item: -item[1],
)
complete_df = pd.DataFrame(complete_ranking, columns=['word', 'score'])

# Demo: preview the ten highest-scoring words
complete_df.head(10)

Unnamed: 0,word,score
0,egypt,11870
1,video,9555
2,pyramid,5494
3,best,3772
4,bro,3272
5,great,3202
6,thank,3193
7,hour,3146
8,water,2066
9,proud,2063


## Export to CSV File

In [8]:
output_path_for_complete_ranking = 'RW_res/complete_words_ranking.csv'
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(output_path_for_complete_ranking), exist_ok=True)
# index=False keeps the positional row index out of the exported file.
complete_df.to_csv(output_path_for_complete_ranking, index=False)