In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load the prepared dataset and discard the 'Unnamed: 0' column
# (presumably an index column persisted by an earlier to_csv — TODO confirm).
df_suicide_detection = (
    pd.read_csv('../../data/prepared/prepared.csv')
    .drop(columns='Unnamed: 0')
)

# Each 'corpus' cell is a string: drop its first and last character, remove
# every single quote, then split on ', ' to obtain a list of tokens.
# Presumably the cell is the str() of a Python list — verify against the
# preparation step that wrote the CSV.
df_suicide_detection['corpus'] = df_suicide_detection['corpus'].apply(
    lambda serialized: serialized[1:-1].replace("'", "").split(', ')
)

In [3]:
# TF-IDF weighting with English stop words removed; terms found in more than
# 95% of the documents or in fewer than 2 documents are left out.
# NOTE(review): the LDA cells in this notebook are fit on these TF-IDF
# weights; LDA is usually fit on raw term counts — confirm this is intended.
vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)

# Every row holds a token list; join it back into one space-separated string
# so the vectorizer receives plain text documents.
joined_docs = df_suicide_detection['corpus'].apply(lambda tokens: ' '.join(tokens))
X = vectorizer.fit_transform(joined_docs)

# Labels taken from the same frame, so they line up row-for-row with X.
Y = df_suicide_detection['class'].values

In [4]:
# Split the document-term matrix by label using boolean row masks.
# Assumes 'class' is coded as 1 (suicide) / 0 (non-suicide) — TODO confirm.
suicide_rows = (Y == 1)
non_suicide_rows = (Y == 0)
X_suicide, X_non_suicide = X[suicide_rows], X[non_suicide_rows]

In [5]:
# Five-topic LDA on the suicide-labelled documents only, trained with the
# online variational method for at most 5 passes; the seed is fixed so the
# topics are reproducible.
lda_model = LatentDirichletAllocation(
    n_components=5,
    max_iter=5,
    learning_method='online',
    random_state=42,
)

# fit_transform fits the model and returns the per-document topic mixture.
lda_output = lda_model.fit_transform(X_suicide)

In [6]:
# The vocabulary does not change between iterations, so fetch it once instead
# of calling get_feature_names_out() again for every printed term.
feature_names = vectorizer.get_feature_names_out()

for idx, topic in enumerate(lda_model.components_):
    print(f"Suicide topic {idx+1}:")
    # argsort() is ascending, so the last 10 indices are the 10
    # highest-weighted terms, printed from weakest to strongest.
    print([feature_names[i] for i in topic.argsort()[-10:]])

Suicide topic 1:
['suicideive', 'ambien', 'meand', 'yourselfi', 'balding', 'meit', 'cowardice', 'everyonei', 'emptyi', 'coronavirus']
Suicide topic 2:
['caresi', 'plz', 'cyanide', 'coffin', 'friendsi', 'unimportant', 'peoplei', 'iam', 'canti', 'whywhy']
Suicide topic 3:
['loudly_crying_face', 'ativan', 'toim', 'penis', 'ibuprofen', 'aswell', 'anymoremy', 'dayi', 'blah', 'mg']
Suicide topic 4:
['people', 'time', 'ive', 'know', 'like', 'life', 'feel', 'dont', 'want', 'im']
Suicide topic 5:
['charcoal', 'worki', 'insulin', 'morei', 'rationally', 'euthanasia', 'survivor', 'sea', 'betteri', 'pleasei']


In [7]:
# Same five-topic configuration, this time fitted on the non-suicide documents.
# NOTE(review): this rebinds lda_model / lda_output, so the model fitted on
# X_suicide is no longer reachable once this cell has run.
lda_model = LatentDirichletAllocation(
    n_components=5,
    max_iter=5,
    learning_method='online',
    random_state=42,
)

# fit_transform fits the model and returns the per-document topic mixture.
lda_output = lda_model.fit_transform(X_non_suicide)

In [8]:
# The vocabulary does not change between iterations, so fetch it once instead
# of calling get_feature_names_out() again for every printed term.
feature_names = vectorizer.get_feature_names_out()

for idx, topic in enumerate(lda_model.components_):
    print(f"Non suicide topic {idx+1}:")
    # argsort() is ascending, so the last 10 indices are the 10
    # highest-weighted terms, printed from weakest to strongest.
    print([feature_names[i] for i in topic.argsort()[-10:]])

Non suicide topic 1:
['smirking_face', 'band', 'username', 'pog', 'america', 'racist', 'white', 'porn', 'men', 'ya']
Non suicide topic 2:
['homies', 'follower', 'face_with_tears_of_joy', 'lady', 'nut', 'cum', 'star', 'pensive_face', 'horny', 'filler']
Non suicide topic 3:
['cock', 'pleading_face', 'draw', 'smh', 'weary_face', 'u200b', 'loudly_crying_face', 'bro', 'smiling_face_with_sunglasses', 'flushed_face']
Non suicide topic 4:
['pc', 'taste', 'tiktok', 'playlist', 'drink', 'trans', 'assignment', 'listening', 'music', 'song']
Non suicide topic 5:
['friend', 'want', 'people', 'day', 'girl', 'know', 'na', 'guy', 'im', 'like']


Texts related to suicide:
1. Emotional themes: These texts are dominated by expressions of desperation and hopelessness, requests for help, and mentions of death.
2. Specific vocabulary: They contain words related to suicide methods, medications, and the mental and emotional symptoms associated with suicidal thoughts.
3. Emptiness and desperation: They often mention emptiness, a lack of meaning in life, and the desire to end one's life.

Texts not related to suicide:
1. Cultural topics: They focus on internet culture, humor, interpersonal relationships, and popular trends.
2. No mentions of suicide: The top topic terms contain no words specific to suicidal thoughts or the emotional themes associated with them.
3. Everyday life and entertainment: They cover a variety of areas, from technology to everyday conversations and internet interests.