# Representative Words Extractor

## Setup and Installation

In [1]:
import pandas as pd
import nltk
import ssl
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chase\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chase\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chase\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\chase\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
NLTK_DATA_PATH='/Users/chase/AppData/Roaming/nltk_data'
nltk.data.path.append(NLTK_DATA_PATH)

In [4]:
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer
from collections import defaultdict

In [5]:
# Bypass SSL certificate verification for nltk downloads
try:
    _ctx = ssl._create_unverified_context
except AttributeError:
    pass  # Older Python that doesn’t support it
else:
    ssl._create_default_https_context = _ctx

In [6]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chase\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chase\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading the Cleaned Comments Dataset

In [7]:
df = pd.read_csv('../../data_preprocessed/comments_cleaned.csv')

## Scoring System

In [8]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Map Treebank POS tags to WordNet POS tags.
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if POS unknown

# Tokenize, POS-tag, filter, and lemmatize a string of text
def extract_valid_words(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    valid = []
    for token, tag in tagged:
        tok_lower = token.lower()
        if tok_lower.isalpha() and tok_lower not in stop_words:
            wn_pos = get_wordnet_pos(tag)
            lemma = lemmatizer.lemmatize(tok_lower, pos=wn_pos)
            valid.append(lemma)
    return valid

global_scores = defaultdict(int)
for _, row in df.iterrows():
    text = str(row['comment'])
    likes = int(row.get('num_of_likes', 0))
    replies = int(row.get('reply_count', 0))

    # Base score
    score = 1
    # Likes-based scoring
    if likes > 1000:
        score += 4
    elif likes > 100:
        score += 3
    else:
        score += 2
    # Replies-based scoring
    if replies > 0:
        score += 2

    # Add score to each valid word
    for word in extract_valid_words(text):
        global_scores[word] += score

# Select top 50 representative words
complete_ranking = sorted(global_scores.items(), key=lambda kv: kv[1], reverse=True)
complete_df = pd.DataFrame(complete_ranking, columns=['word','score'])

# Demo
complete_df[:10]

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\chase/nltk_data'
    - 'c:\\Users\\chase\\AppData\\Local\\Programs\\Python\\Python312\\nltk_data'
    - 'c:\\Users\\chase\\AppData\\Local\\Programs\\Python\\Python312\\share\\nltk_data'
    - 'c:\\Users\\chase\\AppData\\Local\\Programs\\Python\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\chase\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - '/Users/chase/AppData/Roaming/nltk_data'
    - '/Users/chase/AppData/Roaming/nltk_data'
**********************************************************************


## Export to csv File

In [8]:
output_path_for_complete_ranking = 'RW_res/complete_words_ranking.csv'
complete_df.to_csv(output_path_for_complete_ranking, index=False)