In [1]:
import os
import json
import pandas as pd
import io
import datetime
import logging
import zstandard as zstd
from gensim.models import KeyedVectors
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
# Configure the root logger once for the whole notebook: emit INFO and above,
# with every record prefixed by its timestamp and severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


In [5]:
# Location of the pretrained GoogleNews word2vec vectors.
# Set the WORD2VEC_MODEL_PATH environment variable to point at a copy stored
# elsewhere; when it is unset, the original machine-specific path is used, so
# existing runs behave exactly as before.
word2vec_model_path = os.environ.get(
    "WORD2VEC_MODEL_PATH",
    r"C:\Users\ntu-s\OneDrive - Nanyang Technological University\sherry\__pycache__\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin",
)

# binary=True because the file is read in the binary word2vec format.
# The whole embedding matrix is loaded into memory here and shared, as a
# module-level global, by get_related_words_word2vec().
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)

2024-06-05 15:58:46,726 - INFO - loading projection weights from C:\Users\ntu-s\OneDrive - Nanyang Technological University\sherry\__pycache__\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin
2024-06-05 15:59:03,042 - INFO - KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from C:\\Users\\ntu-s\\OneDrive - Nanyang Technological University\\sherry\\__pycache__\\GoogleNews-vectors-negative300.bin\\GoogleNews-vectors-negative300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-06-05T15:59:03.042790', 'gensim': '4.3.0', 'python': '3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'load_word2vec_format'}


In [60]:
def get_related_words_word2vec(keyword, top_k=10, similarity_threshold=0.6):
    """Return lower-cased nearest neighbours of ``keyword`` from the Word2Vec model.

    Looks up ``keyword`` in the module-level ``word2vec_model`` and keeps the
    neighbours whose similarity score is at least ``similarity_threshold``.

    Args:
        keyword: Word to look up in the model vocabulary.
        top_k: Maximum number of neighbours requested from the model
            (passed to ``most_similar`` as ``topn``).
        similarity_threshold: Minimum similarity score a neighbour must have
            to be kept.

    Returns:
        A list of unique lower-cased words, in the order the model returned
        them. The list is empty when no neighbour passes the threshold or when
        ``keyword`` is not in the vocabulary (a warning is logged in that case).
    """
    # Only the lookup itself can signal an out-of-vocabulary keyword, so keep
    # the try block limited to it.
    try:
        neighbours = word2vec_model.most_similar(keyword, topn=top_k)
    except KeyError:
        logging.warning(f"Keyword '{keyword}' not in Word2Vec vocabulary.")
        return []

    lowered = (
        word.lower()
        for word, similarity in neighbours
        if similarity >= similarity_threshold
    )
    # Lower-casing can map distinct vocabulary entries (e.g. "Stress" and
    # "stress") onto the same string; dict.fromkeys drops those duplicates
    # while preserving first-seen order.
    return list(dict.fromkeys(lowered))

In [61]:
def generate_keywords_list(keywords, top_k=20, similarity_threshold=0.6, output_path=None):
    """Expand seed keywords with related words and optionally save the result.

    Each seed keyword is passed to ``get_related_words_word2vec``; the seeds
    and every related word returned are merged into one de-duplicated,
    alphabetically sorted list. The list is printed to stdout, one keyword per
    line under an "Extended Keywords List:" header.

    Args:
        keywords: Seed keywords. May be any iterable of strings (including a
            one-shot generator) or a single string, which is treated as one
            keyword rather than as a sequence of characters.
        top_k: Maximum number of related words requested per seed keyword.
        similarity_threshold: Minimum similarity a related word must have.
        output_path: Optional file path. When truthy, the keywords are written
            there as UTF-8 text, one per line; missing parent directories are
            created first.

    Returns:
        The sorted list of unique keywords (seeds plus related words).
    """
    if isinstance(keywords, str):
        # A bare string would otherwise be iterated character by character.
        seed_keywords = [keywords]
    else:
        # Materialise once: building the set directly from a generator would
        # exhaust it and leave nothing for the expansion loop below.
        seed_keywords = list(keywords)

    all_keywords = set(seed_keywords)
    for keyword in seed_keywords:
        all_keywords.update(
            get_related_words_word2vec(keyword, top_k, similarity_threshold)
        )
    extended_keywords_list = sorted(all_keywords)

    print("Extended Keywords List:")
    for keyword in extended_keywords_list:
        print(keyword)

    if output_path:
        # Create the destination folder up front so a missing directory does
        # not discard the result with a FileNotFoundError at the very end.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.writelines(keyword + '\n' for keyword in extended_keywords_list)

    return extended_keywords_list

In [62]:
def main(
    keywords=None,
    keywords_output_path="C:\\Users\\ntu-s\\OneDrive - Nanyang Technological University\\sherry\\reddit output\\extended_keywords.txt",
    top_k=10,
    similarity_threshold=0.6,
):
    """Build the extended keyword list, save it, and log the result.

    Called with no arguments this reproduces the original run: the built-in
    seed list is expanded with ``top_k=10`` and ``similarity_threshold=0.6``
    and written to the default output path.

    Args:
        keywords: Seed keywords to expand. ``None`` selects the built-in list
            of mental-health related seed terms below.
        keywords_output_path: File the extended list is written to. Pass
            ``None`` to skip writing a file.
        top_k: Maximum number of related words requested per seed keyword.
        similarity_threshold: Minimum similarity a related word must have.

    Returns:
        The extended keyword list produced by ``generate_keywords_list``.
    """
    if keywords is None:
        keywords = [
            'mentalhealth', 'suicide', 'depressed', 'depression', 'anxiety', 'stressed', 'stressful',
            'burnout', 'hopeless', 'hopelessness', 'meaningless', 'meaninglessness', 'sad',
            'failure', 'loser', 'toxic'
        ]

    extended_keywords = generate_keywords_list(
        keywords,
        top_k=top_k,
        similarity_threshold=similarity_threshold,
        output_path=keywords_output_path,
    )
    # Lazy %-style formatting yields the same message as the former f-string.
    logging.info("Extended Keywords: %s", extended_keywords)
    return extended_keywords


# Run the pipeline only when this code is executed directly (as a script, or as
# a notebook cell, where __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

2024-06-05 16:31:27,102 - INFO - Extended Keywords: ['alcoholism', 'angst', 'anticipatory_anxiety', 'anxieties', 'anxiety', 'anxiousness', 'asserted', 'belt_flak_vest', 'bi_polar_disorder', 'bipolar_depression', 'bipolar_disorder', 'burnout', 'commit_suicide', 'depressed', 'depressed_maruca_kovac', 'depression', 'depression_anxiety', 'depressive', 'depressive_illness', 'depressive_illnesses', 'desolation', 'despair', 'despondency', 'disheartening', 'distressing', 'emphasized', 'fail', 'failed', 'failing', 'failure', 'failures', 'heartbreaking', 'hectic', 'helplessness', 'hopeless', 'hopelessness', 'inability', 'inconsequential', 'insisted', 'irrelevant', 'loser', 'losers', 'meaningless', 'meaninglessness', 'mental_illness', 'mentalhealth', 'meny_friedman', 'nerve_racking', 'nerve_wracking', 'nervousness', 'noted', 'parishioner_pat_patello', 'pointless', 'poisonous', 'powerlessness', 'psychosis', 'reiterated', 'reminders_bobbing', 'sad', 'saddened', 'saddening', 'saddens_me', 'said', 's

Extended Keywords List:
alcoholism
angst
anticipatory_anxiety
anxieties
anxiety
anxiousness
asserted
belt_flak_vest
bi_polar_disorder
bipolar_depression
bipolar_disorder
burnout
commit_suicide
depressed
depressed_maruca_kovac
depression
depression_anxiety
depressive
depressive_illness
depressive_illnesses
desolation
despair
despondency
disheartening
distressing
emphasized
fail
failed
failing
failure
failures
heartbreaking
hectic
helplessness
hopeless
hopelessness
inability
inconsequential
insisted
irrelevant
loser
losers
meaningless
meaninglessness
mental_illness
mentalhealth
meny_friedman
nerve_racking
nerve_wracking
nervousness
noted
parishioner_pat_patello
pointless
poisonous
powerlessness
psychosis
reiterated
reminders_bobbing
sad
saddened
saddening
saddens_me
said
severely_depressed
stress
stressed
stresses
stressful
stressing
suicidal
suicide
suicides
toxic
toxic_chemicals
toxic_dioxins
toxic_substances
toxic_waste
toxins
underlined
unease
uneasiness
unimportant
useless
utter_des