In [2]:
import ast

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [3]:
def _parse_token_list(raw):
    """Convert the stringified token list stored in the CSV back into a list.

    The ``corpus`` column is read from CSV as text that looks like
    ``"['want', 'end', 'pain']"`` (presumably written by pandas as
    ``str(list)`` -- confirm against the preparation notebook).

    Parameters
    ----------
    raw : str
        One cell of the ``corpus`` column.

    Returns
    -------
    list of str
        The tokens. ``"[]"`` yields an empty list.
    """
    try:
        tokens = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        tokens = None
    if isinstance(tokens, (list, tuple)):
        # A real literal parse keeps tokens that contain apostrophes or ", "
        # intact, which naive quote-stripping and splitting cannot do.
        return [str(token) for token in tokens]
    # Fallback for cells that are not a valid Python list literal:
    # keep the legacy best-effort parsing.
    return raw[1:-1].replace("'", "").split(', ')


# Load the prepared dataset; 'Unnamed: 0' is the index column saved by a previous to_csv.
df_suicide_detection = pd.read_csv('../../data/prepared/prepared.csv').drop('Unnamed: 0', axis=1)
df_suicide_detection['corpus'] = df_suicide_detection['corpus'].apply(_parse_token_list)

In [4]:
# VADER needs its lexicon resource to be available locally.
nltk.download('vader_lexicon')

# One analyzer instance, shared by every call below.
sid = SentimentIntensityAnalyzer()


def sentiment_analysis(text):
    """Score a tokenised document with VADER.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document; they are joined with single spaces
        before scoring.

    Returns
    -------
    dict
        Whatever ``sid.polarity_scores`` returns for the joined string
        (the ``neg`` / ``neu`` / ``pos`` / ``compound`` scores).
    """
    return sid.polarity_scores(' '.join(text))


df_suicide_detection['sentiment'] = df_suicide_detection['corpus'].map(sentiment_analysis)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [16]:
# Gather the per-document VADER score dicts into one list per class label.
grouped = (
    df_suicide_detection
    .groupby('class')['sentiment']
    .apply(list)
    .reset_index()
)

# Expand each class's list of dicts into a frame and average every score.
# groupby sorts its keys, so position 0 is the lower class label and
# position 1 the higher one.
per_class_means = [
    pd.DataFrame(grouped['sentiment'].iloc[position]).mean()
    for position in (0, 1)
]
df_sentiment = pd.concat(per_class_means, axis=1, keys=['non_suicide', 'suicide'])
df_sentiment

Unnamed: 0,non_suicide,suicide
neg,0.154675,0.265312
neu,0.629486,0.540684
pos,0.215837,0.194007
compound,0.124115,-0.353753


Suicide-related texts show a higher level of negative sentiment (26.53%) compared to non-suicide-related texts (15.47%). This suggests that texts related to suicide contain more negative content. Texts unrelated to suicide have a higher level of neutral sentiment (62.95%) compared to texts related to suicide (54.07%), i.e. texts related to suicide contain less neutral content. The level of positive sentiment is similar for both categories (21.58% vs. 19.40%).

Total Indicator (Compound): The compound score is a normalized value in the range [-1, 1], not a percentage. Its average is slightly positive for non-suicide-related texts (0.124), indicating a slightly positive trend. In contrast, for texts related to suicide the average is clearly negative (-0.354), suggesting a higher level of overall negative sentiment.

Sentiment analysis shows that texts related to suicide contain more negative content compared to unrelated texts, which show a slightly higher level of positive sentiment and a lower level of overall negative sentiment.

Verification of extreme cases

In [17]:
# Pull the compound score out of every VADER dict, then keep only the
# documents sitting at the negative extreme of the scale.
compound_scores = df_suicide_detection['sentiment'].map(lambda scores: scores['compound'])
extremely_negative = df_suicide_detection.loc[compound_scores < -0.999]

print("Extremely negative:")
print(extremely_negative[['corpus', 'class']])

Extremely negative:
                                                   corpus  class
349     [want, end, myselfsad, pain, sad, pain, sad, p...      1
1118    [tried, kill, nighti, dont, really, know, talk...      1
1235    [final, wordsi, finally, decided, commit, suic...      1
2653    [hate, feel, deserve, feel, way, dont, deserve...      1
...                                                   ...    ...
229285  [want, fuck, fuck, fuck, fuck, fuck, fuck, fuc...      0
229767  [advice, neededi, would, like, advice, situati...      1
230676  [updatetldr, bottom, post, survived, attempt, ...      1
230727  [recent, painful, graduation, subreddit, hey, ...      0
231636  [suicidal, begging, help, turned, away, retali...      1

[310 rows x 2 columns]


In [18]:
# Pull the compound score out of every VADER dict, then keep only the
# documents sitting at the positive extreme of the scale.
compound_scores = df_suicide_detection['sentiment'].map(lambda scores: scores['compound'])
extremely_positive = df_suicide_detection.loc[compound_scores > 0.999]

print("Extremely positive:")
print(extremely_positive[['corpus', 'class']])
# How many of those extremely positive documents carry class label 1.
print(int((extremely_positive['class'] == 1).sum()))

Extremely positive:
                                                   corpus  class
3755    [plz, message, meplz, plz, plz, plz, plz, plz,...      1
4182    [talk, mei, need, lp, need, help, need, help, ...      1
5227    [casual, reminder, le, hour, left, global, mas...      0
6044    [originally, posted, throw, away, might, well,...      1
12008   [need, help, sure, say, feel, like, need, get,...      0
...                                                   ...    ...
219944  [best, friend, crushed, passion, life, destroy...      1
222349  [please, kill, meplease, please, please, pleas...      1
229122  [dont, let, lust, take, love, oh, boy, even, s...      0
230263  [screwed, everything, lost, best, friendfirst,...      1
231129  [sacred, text, talk, girlshow, get, girlfriend...      0

[106 rows x 2 columns]
40


The 40 examples that were marked as extremely positive but belong to the suicide class (1 = suicide) are most likely cases where the sentiment was assigned incorrectly because of context or sentence structure. The sentiment analysis algorithm probably made an identification error by focusing on individual words rather than on the context of the entire sentence or text.

Extremely positive cases include those belonging to the "suicide" class containing cries for help.  
Sentiment correction for the word "help" and some words from the thematic analysis.

Adjusted

In [39]:
def adjust_sentiment(text, sentiment,
                     help_words=('help', 'pleas', 'plz', 'suicide', 'ibuprofen'),
                     penalty=0.3):
    """Lower a positive sentiment score when the text contains a trigger word.

    Parameters
    ----------
    text : str
        The document as a single string. Matching uses ``word in text``, so
        for a string it is a substring test (``'pleas'`` also matches
        ``'please'``; ``'help'`` also matches ``'helpful'``).
    sentiment : float
        The score to adjust (the caller passes the VADER compound score).
    help_words : iterable of str, optional
        Trigger words; the default is the list originally hard-coded here.
    penalty : float, optional
        Amount subtracted from a positive score when a trigger word is found.

    Returns
    -------
    float
        ``sentiment - penalty`` if ``sentiment`` is positive and any trigger
        word occurs in ``text``; otherwise ``sentiment`` unchanged.
    """
    # Check the sign first: non-positive scores are never adjusted, so the
    # keyword scan can be skipped for them.
    if sentiment > 0 and any(word in text for word in help_words):
        return sentiment - penalty
    return sentiment

def sentiment_analysis(text):
    """Score a tokenised document with VADER and damp positive "help" posts.

    NOTE(review): this redefines the ``sentiment_analysis`` declared in an
    earlier cell; after this cell runs, the earlier version is shadowed.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document; joined with single spaces before scoring.

    Returns
    -------
    dict
        The dict returned by ``sid.polarity_scores``. Its ``'compound'``
        entry is reduced by 0.3 when it is positive and the joined text
        contains ``'help'`` or ``'please'`` (substring match).
    """
    text = " ".join(text)
    sentiment = sid.polarity_scores(text)
    # Parentheses are required: without them `and` binds tighter than `or`,
    # so the positivity guard would be skipped whenever 'help' is present.
    # The guard must also read the compound score; the scores object itself
    # is a dict and cannot be compared with 0.
    if ('help' in text or 'please' in text) and sentiment['compound'] > 0:
        sentiment['compound'] -= 0.3
    return sentiment

# Re-score every document: join its tokens back into one string and pass
# that, together with the stored compound score, through adjust_sentiment.
df_suicide_detection['adjusted_sentiment'] = [
    adjust_sentiment(' '.join(tokens), scores['compound'])
    for tokens, scores in zip(
        df_suicide_detection['corpus'],
        df_suicide_detection['sentiment'],
    )
]

In [40]:
# Gather the adjusted compound scores into one list per class label.
grouped = (
    df_suicide_detection
    .groupby('class')['adjusted_sentiment']
    .apply(list)
    .reset_index()
)

# Average each class's scores. groupby sorts its keys, so position 0 is the
# lower class label and position 1 the higher one.
per_class_means = [
    pd.DataFrame(grouped['adjusted_sentiment'].iloc[position]).mean()
    for position in (0, 1)
]
df_sentiment = pd.concat(per_class_means, axis=1, keys=['non_suicide', 'suicide'])
df_sentiment

Unnamed: 0,non_suicide,suicide
0,0.098746,-0.394805


In [41]:
# Documents whose adjusted score is still at the positive extreme.
is_extremely_positive = df_suicide_detection['adjusted_sentiment'] > 0.999
extremely_positive = df_suicide_detection.loc[is_extremely_positive]

print("Extremely positive:")
print(extremely_positive[['corpus', 'class']])

Extremely positive:
                                                   corpus  class
15740   [boy, kissed, boy, kissed, also, pretty, long,...      0
15959   [read, love, love, love, love, love, love, lov...      0
17501   [calling, beautiful, understand, beautiful, pi...      0
17888   [slept, cousin´s, sister, yes, im, actual, tee...      0
47129   [ever, sad, remember, world, glam, punk, love,...      0
48636   [solid, man, penetrates, water, like, solid, f...      0
51227   [reminder, kid, eat, rich, eat, rich, eat, ric...      0
87115   [find, pizza, pizza, gt, smiling_face_with_ope...      0
100531  [know, yall, struggling, wow, wow, wow, wow, w...      0
114976  [speech, wrote, happy, join, today, go, histor...      0
143097  [calling, beautiful, till, understand, beautif...      0
144639  [ok, girl, met, back, talking, friend, however...      0
155417  [school, motto, kinda, cringe, care, share, da...      0
156445  [katy, perry, gush, hey, guy, gal, non, binary...      0
16357

Length correlation

In [49]:
# Number of tokens per document. `.map(len)` keeps the frame's own index, so
# the lengths stay paired with the right rows: Series.corr aligns its two
# operands by index label, not by position.
token_counts = df_suicide_detection['corpus'].map(len)
correlation = token_counts.corr(df_suicide_detection['adjusted_sentiment'])
print(f"Correlation between text length and sentiment: {correlation}")

Correlation between text length and sentiment: -0.1330076893723232


The correlation of -0.133 indicates a weak negative relationship between text length (number of tokens) and adjusted sentiment: longer texts tend to have slightly more negative sentiment scores.