In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/goemotions/README.md
/kaggle/input/goemotions/goemotions_model_card.pdf
/kaggle/input/goemotions/GoEmotionsFormat.PNG
/kaggle/input/goemotions/replace_emotions.py
/kaggle/input/goemotions/extract_words.py
/kaggle/input/goemotions/calculate_metrics.py
/kaggle/input/goemotions/analyze_data.py
/kaggle/input/goemotions/tables/emotion_words.csv
/kaggle/input/goemotions/data/test.tsv
/kaggle/input/goemotions/data/ekman_mapping.json
/kaggle/input/goemotions/data/dev.tsv
/kaggle/input/goemotions/data/emotions.txt
/kaggle/input/goemotions/data/train.tsv
/kaggle/input/goemotions/data/ekman_labels.csv
/kaggle/input/goemotions/data/sentiment_dict.json
/kaggle/input/goemotions/data/sentiment_mapping.json
/kaggle/input/goemotions/data/full_dataset/goemotions_2.csv
/kaggle/input/goemotions/data/full_dataset/goemotions_1.csv
/kaggle/input/goemotions/data/full_dataset/goemotions_3.csv
/kaggle/input/goemotions/plots/hierarchical_corr.pdf
/kaggle/input/goemotions/plots/number_of_labels.pdf


In [2]:
import spacy
from nltk.stem import PorterStemmer

nlp = spacy.load("en_core_web_sm")

stemmer = PorterStemmer()

# read dataframes
file1 = "/kaggle/input/goemotions/data/full_dataset/goemotions_1.csv"
file2 = "/kaggle/input/goemotions/data/full_dataset/goemotions_2.csv"
file3 = "/kaggle/input/goemotions/data/full_dataset/goemotions_3.csv"

df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)
df3 = pd.read_csv(file3)

# combine dataframes
combined_df = pd.concat([df1, df2, df3], ignore_index=True)
output_file = "combined_file.csv"
combined_df.to_csv(output_file, index=False)

combined_df.head()  

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
df = pd.read_csv('combined_file.csv')
df_suicide = pd.read_csv('/kaggle/input/suicide-watch/Suicide_Detection.csv')


df_mod = df.drop(['id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear'], axis = 1)

df_mod.tail(5)

Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
211220,Everyone likes [NAME].,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
211221,Well when you’ve imported about a gazillion of...,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211222,That looks amazing,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211223,The FDA has plenty to criticize. But like here...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211224,Desktop link: ^^/r/HelperBot_ ^^Downvote ^^to ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df_suicide_mod = df_suicide.drop(['Unnamed: 0'], axis = 1)
df_suicide_mod = df_suicide_mod.head(300)
df_suicide_mod.tail(3)

Unnamed: 0,text,class
297,"Hey guys, Hey guys, I'm sad, and I need to fin...",non-suicide
298,"I'm just so tiredCurrently in college, and doi...",suicide
299,You Wanna Get More Karma?? No you'll not get it.,non-suicide


In [5]:
def preprocess_tokens(text):
   
    doc = nlp(text)
    
    tokens = []
    for token in doc:
        if not token.is_punct:
            lemmatized_token = token.lemma_
            tokens.append(lemmatized_token) 
    
    return tokens

df_mod['Preprocessed_tokens'] = df_mod['text'].apply(preprocess_tokens)
df_suicide_mod['Preprocessed_tokens'] = df_suicide_mod['text'].apply(preprocess_tokens)

df_mod.tail(3)

Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,Preprocessed_tokens
211222,That looks amazing,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[that, look, amazing]"
211223,The FDA has plenty to criticize. But like here...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[the, FDA, have, plenty, to, criticize, but, l..."
211224,Desktop link: ^^/r/HelperBot_ ^^Downvote ^^to ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[desktop, link, ^^/r, HelperBot, ^^downvote, ^..."


In [6]:
cols = df_mod.columns.tolist()
last_col = cols[-1]
cols.remove(last_col)
cols.insert(1, last_col)
df_mod= df_mod[cols]

emotion_columns = df_mod.columns[-28:]
df_mod['emotion'] = df_mod[emotion_columns].idxmax(axis=1)
df_mod = df_mod.drop(columns=emotion_columns)

df_mod.tail(3)

Unnamed: 0,text,Preprocessed_tokens,emotion
211222,That looks amazing,"[that, look, amazing]",admiration
211223,The FDA has plenty to criticize. But like here...,"[the, FDA, have, plenty, to, criticize, but, l...",anger
211224,Desktop link: ^^/r/HelperBot_ ^^Downvote ^^to ...,"[desktop, link, ^^/r, HelperBot, ^^downvote, ^...",admiration


In [7]:
df_mod.to_csv('p_data.csv', index = False)
df_suicide_mod.to_csv('p_suicide_data.csv', index = False)

# Start Running program from here for all purposes.

In [8]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

df_mod = pd.read_csv("/kaggle/input/preprocessed-data/p_data_full_stop.csv")
df_suicide_mod = pd.read_csv("/kaggle/input/preprocessed-data/p_suicide_data_30k_stop.csv")

from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('/kaggle/input/mental-wellbeing-chatbot/scikitlearn/default/2/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True)

# def vectorize(tokens):
#     vectors = [model.wv[token] for token in tokens if token in model.wv]
#     if vectors:
#         return sum(vectors) / len(vectors)  
#     else:
#         return [0] * model.vector_size  

def vectorize(tokens, model):
    word_vectors = []
    for word in tokens:
        if word in model:
            word_vector = model[word]  
            word_vectors.append(word_vector)
    if word_vectors:
        sentence_vector = np.mean(word_vectors, axis=0)
        return sentence_vector
    else:
        return np.zeros(model.vector_size)

df_mod['vector'] = df_mod['Preprocessed_tokens'].apply(lambda x: vectorize(x, model))
df_suicide_mod['vector'] = df_suicide_mod['Preprocessed_tokens'].apply(lambda x: vectorize(x, model))

emotion_counts = df_mod['emotion'].value_counts()
print(emotion_counts)


emotion
neutral           55298
admiration        20542
approval          15530
annoyance         11929
disapproval        8917
amusement          8862
gratitude          8437
anger              7956
curiosity          7707
disappointment     6769
confusion          6600
love               5310
caring             5147
realization        5125
joy                5120
optimism           4994
excitement         4375
sadness            3863
surprise           3472
disgust            3420
desire             3002
fear               2514
embarrassment      1720
remorse            1648
nervousness         946
relief              814
pride               714
grief               494
Name: count, dtype: int64


In [9]:
df_mod.head()  

Unnamed: 0,text,Preprocessed_tokens,emotion,vector
0,That game hurt.,"['that', 'game', 'hurt']",sadness,"[-0.27539062, 0.17261353, -0.07705078, 0.10944..."
1,>sexuality shouldn’t be a grouping category I...,"[' ', '>', 'sexuality', 'should', 'not', 'be',...",admiration,"[-0.19681089, 0.111585006, 0.002059098, 0.1409..."
2,"You do right, if you don't care then fuck 'em!","['you', 'do', 'right', 'if', 'you', 'do', 'not...",neutral,"[-0.19564383, 0.111964636, 0.013619559, 0.1303..."
3,Man I love reddit.,"['man', 'I', 'love', 'reddit']",love,"[-0.09999906, 0.12735455, 0.012986403, 0.13516..."
4,"[NAME] was nowhere near them, he was by the Fa...","['name', 'be', 'nowhere', 'near', 'they', 'he'...",neutral,"[-0.14364716, 0.12471887, -0.01883212, 0.14475..."


Positive Sentiments:

- Admiration: "admiration", "approval", "pride"
Amusement: "amusement", "excitement", "joy"
Love: "love", "caring", "gratitude"
Optimism: "optimism", "relief"
Neutral Sentiments:

- Neutral: "neutral", "realization", "curiosity"
Negative Sentiments:

- Sadness: "sadness", "grief", "remorse", "disappointment"
Anger: "anger", "annoyance", "disapproval"
Fear: "fear", "nervousness", "confusion"
Disgust: "disgust", "embarrassment"
Surprise: "surprise", "desire"

In [10]:
# Define the main emotions to keep
main_emotions = ["admiration", "amusement", "excitement", "anger", "fear", "joy", "confusion", "realization", "curiosity"
                 , "love", "sadness", "grief", "remorse", "disappointment" ,"annoyance", "nervousness"]

print(f"Original dataset size: {df_mod.shape[0]}")

df_mod = df_mod[df_mod['emotion'].isin(main_emotions)]

print(f"Filtered dataset size: {df_mod.shape[0]}")

df_mod.head(5)

Original dataset size: 211225
Filtered dataset size: 99760


Unnamed: 0,text,Preprocessed_tokens,emotion,vector
0,That game hurt.,"['that', 'game', 'hurt']",sadness,"[-0.27539062, 0.17261353, -0.07705078, 0.10944..."
1,>sexuality shouldn’t be a grouping category I...,"[' ', '>', 'sexuality', 'should', 'not', 'be',...",admiration,"[-0.19681089, 0.111585006, 0.002059098, 0.1409..."
3,Man I love reddit.,"['man', 'I', 'love', 'reddit']",love,"[-0.09999906, 0.12735455, 0.012986403, 0.13516..."
7,That's crazy; I went to a super [RELIGION] hig...,"['that', 'be', 'crazy', 'I', 'go', 'to', 'a', ...",amusement,"[-0.17494032, 0.11104363, -0.0142862955, 0.144..."
8,that's adorable asf,"['that', 'be', 'adorable', 'asf']",amusement,"[-0.19988544, 0.16729267, -0.016310472, 0.1263..."


In [11]:

emotion_mapping = {
    "admiration": "happiness",  
    "amusement": "happiness", 
    "excitement": "happiness",
    "joy": "happiness",
    "love": "love",
   # "caring": "love",
    #"optimism": "happiness", # opt
    "curiosity": "neutral",
    #"neutral" : "neutral",
    "realization": "neutral",
    "sadness": "sadness",
    "grief": "sadness",
    "remorse": "sadness",
    "disappointment": "sadness", 
    "anger": "anger",
    "annoyance": "anger",
    "fear": "fear/anxiety",
    "nervousness": "fear/anxiety", # opt
   # "disgust": "disgust",
    "confusion": "neutral",
   # "surprise": "surprise",# opt
}

df_mod['emotion'] = df_mod['emotion'].replace(emotion_mapping)

emotion_to_int = {
  # "admiration": 0, 
    "happiness": 1,
    "love": 2,
   # "optimism": 3,
    "neutral": 3,
    "sadness": 4,
    "anger": 5,
    "fear/anxiety": 6,
   # "surprise": 8
    
}

df_mod['emotion_int'] = df_mod['emotion'].map(emotion_to_int)

df_suicide_mod['class_int'] = df_suicide_mod['class'].replace({'suicide': 1, 'non-suicide': 0})



emotion_counts = df_mod['emotion'].value_counts()

print("Number of entries for each emotion:")
print(emotion_counts)

Number of entries for each emotion:
emotion
happiness       38899
anger           19885
neutral         19432
sadness         12774
love             5310
fear/anxiety     3460
Name: count, dtype: int64


In [12]:
# df_mod.to_csv('p_v_data.csv', index = False)
# df_suicide_mod.to_csv('p_v_suicide_data.csv', index = False)

In [13]:
df_mod.head(30)

Unnamed: 0,text,Preprocessed_tokens,emotion,vector,emotion_int
0,That game hurt.,"['that', 'game', 'hurt']",sadness,"[-0.27539062, 0.17261353, -0.07705078, 0.10944...",4
1,>sexuality shouldn’t be a grouping category I...,"[' ', '>', 'sexuality', 'should', 'not', 'be',...",happiness,"[-0.19681089, 0.111585006, 0.002059098, 0.1409...",1
3,Man I love reddit.,"['man', 'I', 'love', 'reddit']",love,"[-0.09999906, 0.12735455, 0.012986403, 0.13516...",2
7,That's crazy; I went to a super [RELIGION] hig...,"['that', 'be', 'crazy', 'I', 'go', 'to', 'a', ...",happiness,"[-0.17494032, 0.11104363, -0.0142862955, 0.144...",1
8,that's adorable asf,"['that', 'be', 'adorable', 'asf']",happiness,"[-0.19988544, 0.16729267, -0.016310472, 0.1263...",1
9,"""Sponge Blurb Pubs Quaw Haha GURR ha AAa!"" fin...","['Sponge', 'Blurb', 'Pubs', 'Quaw', 'Haha', 'G...",happiness,"[-0.15698548, 0.111558534, 0.03517456, 0.09603...",1
11,"I wanted to downvote this, but it's not your f...","['I', 'want', 'to', 'downvote', 'this', 'but',...",sadness,"[-0.18965112, 0.11191391, -0.034405965, 0.1326...",4
13,That is odd.,"['that', 'be', 'odd']",sadness,"[-0.19763184, 0.18621826, -0.043380737, 0.1411...",4
15,"I appreciate it, that's good to know. I hope I...","['I', 'appreciate', 'it', 'that', 'be', 'good'...",happiness,"[-0.14584547, 0.11323252, -0.032575052, 0.1617...",1
17,Well then I’d say you have a pretty good chanc...,"['well', 'then', 'I', '’d', 'say', 'you', 'hav...",neutral,"[-0.15911202, 0.11998915, -0.0011517069, 0.124...",3


# 1st Checkpoint 
Data has been preprocessed and vectorised, and emotions have been converted into labels to work with.

# Model Training - Emotion Classifier

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X = np.array(df_mod['vector'].tolist())
Y = df_mod['emotion_int'].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify = Y, random_state=42)

print(X.shape, X_train.shape, X_test.shape) 

(99760, 300) (79808, 300) (19952, 300)


>Training the Machine Learning Model (Using logistic regression)

In [15]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier

In [16]:
model1 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced')
model1.fit(X_train, Y_train)

Y_pred = model1.predict(X_test)
print("Test data accuracy: ", accuracy_score(Y_test, Y_pred))

Y_pred2 = model1.predict(X_train)
print("Training data accuracy: ", accuracy_score(Y_train, Y_pred2))

Test data accuracy:  0.5926222935044105
Training data accuracy:  0.841018444266239


In [17]:
from xgboost import XGBClassifier

base_models = [
    ('lr', LogisticRegression(max_iter=500, random_state=42, n_jobs=-1)),
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1))
]

meta_model = LogisticRegression(max_iter=500, random_state=42)

stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=3, n_jobs=-1)

stacking_clf.fit(X_train, Y_train)

y_train_pred = stacking_clf.predict(X_train)
y_test_pred = stacking_clf.predict(X_test)

train_accuracy = accuracy_score(Y_train, y_train_pred)
test_accuracy = accuracy_score(Y_test, y_test_pred)

print("Training Accuracy: ", train_accuracy)
print("Testing Accuracy: ", test_accuracy)
print("Classification Report:\n", classification_report(Y_test, y_test_pred))


Training Accuracy:  0.8405798917401764
Testing Accuracy:  0.6139234161988774
Classification Report:
               precision    recall  f1-score   support

           1       0.65      0.82      0.73      7780
           2       0.66      0.38      0.49      1062
           3       0.59      0.54      0.56      3886
           4       0.55      0.41      0.47      2555
           5       0.57      0.52      0.54      3977
           6       0.59      0.33      0.42       692

    accuracy                           0.61     19952
   macro avg       0.60      0.50      0.53     19952
weighted avg       0.61      0.61      0.60     19952



In [18]:
import pickle

with open("emotion_model.pkl", "wb") as file:
    pickle.dump(stacking_clf, file)

# 

In [19]:
sentence3 = "i am happy and adorable"

word_tokens3 = preprocess_tokens(sentence3)
print(word_tokens3)

['I', 'be', 'happy', 'and', 'adorable']


In [20]:
vector_tokens3 = vectorize(word_tokens3, model)
vector_tokens3 = np.array(vector_tokens3)
vector_tokens3 = np.reshape(vector_tokens3, (1, -1))
vector_tokens3.shape

(1, 300)

In [21]:
Y3 = model1.predict(vector_tokens3)
Y3


array([1])

In [22]:
from collections import Counter
print(Counter(Y_train))


Counter({1: 31119, 5: 15908, 3: 15546, 4: 10219, 2: 4248, 6: 2768})


# Suicide detection classifier

>using json module to convert the strings to a list of floats,
as the ast.literal_eval method was giving error, as it comes out some of the entries
were already of list type

In [23]:
# # Fix the formatting by adding commas between numbers
# df_suicide_mod['vector'] = df_suicide_mod['vector'].apply(
#     lambda x: re.sub(r'(?<=\d)\s+(?=[\-\d])', ', ', x.strip())
# )

In [24]:
# df_suicide_mod['vector'] = df_suicide_mod['vector'].apply(ast.literal_eval)

In [25]:
A = np.array(df_suicide_mod['vector'].tolist())
B = df_suicide_mod['class_int'].values

In [26]:
A_train, A_test, B_train, B_test = train_test_split(A, B, test_size=0.2, stratify = B, random_state=34)

print(A.shape, A_train.shape, A_test.shape)

(30000, 300) (24000, 300) (6000, 300)


In [27]:

model2 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model2.fit(A_train, B_train)

B_pred2 = model2.predict(A_train)
print("Training data accuracy: ", accuracy_score(B_train, B_pred2))

B_pred = model2.predict(A_test)
print("Test data accuracy: ", accuracy_score(B_test, B_pred))

Training data accuracy:  0.9999583333333333
Test data accuracy:  0.8055


In [28]:
with open("suicide_model.pkl", "wb") as file:
    pickle.dump(model2, file)

In [29]:
sentence2 = "I dont want to live this life, i want to die"

word_tokens2 = preprocess_tokens(sentence2)
print(word_tokens2)

['I', 'do', 'not', 'want', 'to', 'live', 'this', 'life', 'I', 'want', 'to', 'die']


In [30]:
vector_tokens2 = vectorize(word_tokens2, model)
vector_tokens2 = np.array(vector_tokens2)
vector_tokens2 = np.reshape(vector_tokens2, (1, -1))
vector_tokens2.shape

(1, 300)

In [31]:
Y2 = model2.predict(vector_tokens2)
if Y2 == 1:
    print("Suicide Risk = High")
else:
    print("Suicide Risk = Low")

Suicide Risk = High
