In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
df_suicide_detection = pd.read_csv('../../data/prepared/prepared_2.csv').drop('Unnamed: 0',axis=1)
df_suicide_detection['corpus'] = df_suicide_detection['corpus'].apply(lambda x: x[1:-1].replace("'", "").split(', '))

In [3]:
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(df_suicide_detection['corpus'].apply(lambda x: ' '.join(x)))
Y = df_suicide_detection['class'].values

In [4]:
X_suicide = X[Y == 1]
X_non_suicide = X[Y == 0]

In [5]:
lda_model = LatentDirichletAllocation(n_components=5, max_iter=5, learning_method='online', random_state=42)
lda_output = lda_model.fit_transform(X_suicide)

In [6]:
for idx, topic in enumerate(lda_model.components_):
    print(f"Suicide topic {idx+1}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])

Suicide topic 1:
['lifeit', 'whywhi', 'quickest', 'talki', 'nail', 'tomorrowi', 'tonightim', 'statist', 'meit', 'endi']
Suicide topic 2:
['suicid', 'time', 'think', 'ive', 'know', 'like', 'life', 'feel', 'want', 'dont']
Suicide topic 3:
['idc', 'plz', 'lovei', 'spit', 'copi', 'euthanasia', 'troll', 'ibuprofen', 'sincer', 'fuckin']
Suicide topic 4:
['booz', 'holi', 'happyi', 'photo', 'overthink', 'enoughi', 'canada', 'paini', 'alivei', 'chicken']
Suicide topic 5:
['cowardi', 'tommorow', 'homei', 'helppleas', 'sooth', 'ativan', 'rapist', 'everyonei', 'inch', 'alonei']


In [7]:
lda_model = LatentDirichletAllocation(n_components=5, max_iter=5, learning_method='online', random_state=42)
lda_output = lda_model.fit_transform(X_non_suicide)

In [8]:
for idx, topic in enumerate(lda_model.components_):
    print(f"Non suicide topic {idx+1}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])

Non suicide topic 1:
['remov', 'book', 'code', 'tran', 'fail', 'american', 'math', 'lone', 'join', 'song']
Non suicide topic 2:
['twitch', 'face_with_rolling_ey', 'weary_fac', 'stream', 'u200b', 'simp', 'loudly_crying_fac', 'ampxb', 'pensive_fac', 'karma']
Non suicide topic 3:
['gang', 'uwu', 'gold', 'thigh', 'homi', 'nut', 'smiling_face_with_sunglass', 'flushed_fac', 'minecraft', 'filler']
Non suicide topic 4:
['shitpost', 'smiling_face_with_heart', 'smell', 'pleading_fac', 'nnn', 'chees', 'star', 'luck', 'cat', 'horni']
Non suicide topic 5:
['day', 'peopl', 'girl', 'fuck', 'know', 'post', 'guy', 'want', 'dont', 'like']


1. Suicide topics: Keywords suggest various aspects of thoughts, emotions, and actions related to suicide, from wanting to talk about problems to desperation or final action.

2. Topics unrelated to suicide: Here, keywords refer to completely different areas, such as social media, video games or everyday conversations, unrelated to suicidal thoughts or emotions.