In this version, the oracle can annotate the suicide attempt and suicide planning of one sample at one time.

The oracle should complete the same number of planning-based annotations and attempt-based annotations at one time.

The annotated data generated by the machine learning are saved to a file named `result_file_Name`. The oracle should check the results, after which all annotated data should be moved to file `April_Submissions.xlsx` to enlarge the training dataset.

The oracle can input `12321` to terminate the annotation when asked to input 0 or 1 during the annotation.

In [1]:
pip install keras-tcn --no-dependencies

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
#Libraries in use
import numpy as np
import pandas as pd
import openpyxl
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import concatenate, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from sklearn.model_selection import KFold 
from tcn import TCN, tcn_full_summary





In [3]:
tf.config.list_physical_devices('GPU')

[]

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
root = '/content/drive/MyDrive/ActiveLearningStigma/'
# training dataset
file_Name = root + "Data for machine annotation.xlsx"
# test dataset
test_file_Name = root + "Data from 2019 to 2020.xlsx"
# result file
result_file_Name = 'AnnotatedResult.xlsx'


In [6]:
unclean_Data = pd.read_excel(file_Name,engine='openpyxl')
unclean_Data

Unnamed: 0,Index,id,author,Date,score,num_comments,title,selftext,Outcomes - Self-Harm/Suicide Attempts,Outcomes - Ideation/Planning,Outcomes - Self-Harm/Suicide Attempts.1,Outcomes - Ideation/Planning.1
0,0.0,alyhxj,Msmrme,2019-01-31 23:54:00,1.0,0.0,How long does it take to die in the cold?,Right now outside it’s -6°F feels like -18°F h...,0.0,How long does it take to die in the cold? ...,0.0,1.0
1,1.0,alyglb,imadethistofindhelp,2019-01-31 23:49:22,1.0,3.0,Looking for advice,One of my close friends says that he struggles...,0.0,struggles with thoughts of suicide,0.0,1.0
2,2.0,alyeff,whoagordoo,2019-01-31 23:41:45,1.0,1.0,I got into it with my dad tonight and really d...,My mom isn’t in my life and a large part is du...,0.0,I feel like I need to go,0.0,1.0
3,3.0,alydob,kuma1112,2019-01-31 23:39:08,1.0,0.0,Im... sorry the helper is now going to be dead...,I'm sorry..,0.0,now going to be dead,0.0,1.0
4,4.0,alyc26,HiddenAI,2019-01-31 23:33:48,1.0,1.0,People think I am useless.,"Nobody finds me useful. Recently, a person cal...",0.0,"I want to die, preferably by gassing myself",0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1536,,b7w5em,ed8907,2019-03-31 22:27:26,23.0,6.0,Angel of Death.,**Angel of Death**. I know you're here because...,,,0.0,1.0
1537,,b7w5c7,Hyppocritamus,2019-03-31 22:27:13,0.0,20.0,I cheated on my wife.,"She wants a divorce, and I'm not going to begr...",,,0.0,1.0
1538,,b7w4pn,Anon1947473993,2019-03-31 22:25:26,1.0,3.0,Am going to do it in a couple days.,What is the best pill method w/ alcohol? I hav...,,,0.0,1.0
1539,,b7x12p,morticiabates,2019-03-31 23:59:10,18.0,5.0,My mother makes me want to kill myself,"I'm in college, I work 4 days a week, but bec...",,,0.0,1.0


In [7]:
raw_data = unclean_Data
raw_data = shuffle(raw_data)
# import pandas as pd
# raw_data = pd.read_excel('./Data fo machine annotation.xlsx', index=False)
label = raw_data.loc[:, ['Outcomes - Self-Harm/Suicide Attempts.1', 'Outcomes - Ideation/Planning.1']]
raw_data = raw_data.loc[:, ['title', 'selftext'], ]


test_unclean_Data = pd.read_excel(test_file_Name,engine='openpyxl')
test_data = test_unclean_Data.loc[:, ['title', 'selftext'], ]
test_data

Unnamed: 0,title,selftext
0,How long does it take to die in the cold?,Right now outside it’s -6°F feels like -18°F h...
1,Looking for advice,One of my close friends says that he struggles...
2,I got into it with my dad tonight and really d...,My mom isn’t in my life and a large part is du...
3,Im... sorry the helper is now going to be dead...,I'm sorry..
4,People think I am useless.,"Nobody finds me useful. Recently, a person cal..."
...,...,...
477,Angel of Death.,**Angel of Death**. I know you're here because...
478,I cheated on my wife.,"She wants a divorce, and I'm not going to begr..."
479,Am going to do it in a couple days.,What is the best pill method w/ alcohol? I hav...
480,,


In [8]:

new_columns = raw_data.columns.tolist()
new_columns.insert(2, 'content')
data = raw_data.reindex(columns=new_columns)
# print(data.loc[:, 'title'].isnull().sum().sum())
data.loc[:,'content'] = data.loc[:, 'title'] + ' ' + data.loc[:, 'selftext']
data.fillna(method = 'ffill', axis = 1, inplace=True)
data.drop(['title', 'selftext'], axis = 1, inplace=True)
# data = pd.concat([data,label],axis=1)

test_new_columns = test_data.columns.tolist()
test_new_columns.insert(2, 'content')
testdata = test_data.reindex(columns=test_new_columns)
# print(data.loc[:, 'title'].isnull().sum().sum())
testdata.loc[:,'content'] = testdata.loc[:, 'title'] + ' ' + testdata.loc[:, 'selftext']
testdata.fillna(method = 'ffill', axis = 1, inplace=True)
testdata.drop(['title', 'selftext'], axis = 1, inplace=True)
testdata = testdata[0:479]

In [9]:
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import WordPunctTokenizer
import sklearn
import pandas
import numpy

print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Python: 3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]
NLTK: 3.2.5
Scikit-learn: 1.0.2
Pandas: 1.3.5
Numpy: 1.21.6


In [10]:
text_messages = data['content']
# print(text_messages)
# Replace numbers with 'numbr'
processed = text_messages.str.replace(r'\d+(\.\d+)?', 'numbr')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

test_text_messages = testdata['content']
# print(text_messages)
# Replace numbers with 'numbr'
test_processed = test_text_messages.str.replace(r'\d+(\.\d+)?', 'numbr')

# Replace whitespace between terms with a single space
test_processed = test_processed.str.replace(r'\s+', ' ')

# Remove punctuation
test_processed = test_processed.str.replace(r'[^\w\d\s]', ' ')

# Remove leading and trailing whitespace
test_processed = test_processed.str.replace(r'^\s+|\s+?$', '')
test_processed

  after removing the cwd from sys.path.
  import sys
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


0      How long does it take to die in the cold  Righ...
1      Looking for advice One of my close friends say...
2      I got into it with my dad tonight and really d...
3      Im    sorry the helper is now going to be dead...
4      People think I am useless  Nobody finds me use...
                             ...                        
474    I think a decent job would prevent my suicide ...
475    i ve been making some of my worst decisions I ...
476    Hah  my life is cursed  It s weird looking at ...
477    Angel of Death    Angel of Death    I know you...
478    I cheated on my wife  She wants a divorce  and...
Name: content, Length: 479, dtype: object

In [11]:
# change words to lower case - Hello, HELLO, hello are all the same word
processed = processed.str.lower()
processed

test_processed = test_processed.str.lower()
test_processed

0      how long does it take to die in the cold  righ...
1      looking for advice one of my close friends say...
2      i got into it with my dad tonight and really d...
3      im    sorry the helper is now going to be dead...
4      people think i am useless  nobody finds me use...
                             ...                        
474    i think a decent job would prevent my suicide ...
475    i ve been making some of my worst decisions i ...
476    hah  my life is cursed  it s weird looking at ...
477    angel of death    angel of death    i know you...
478    i cheated on my wife  she wants a divorce  and...
Name: content, Length: 479, dtype: object

In [12]:
from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer

all_words = []
all_bitext = []
tk = WordPunctTokenizer() 

for message in processed:
    
    words = tk.tokenize(message)
    for w in words:
        all_words.append(w)
    for i in range(len(words)-1):
        all_bitext.append(words[i] + words[i+1])

all_words = nltk.FreqDist(all_words)
all_bitext = nltk.FreqDist(all_bitext)

In [13]:
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))
bi_Words = all_bitext.most_common(100)
print('Most common bitext: {}'.format(bi_Words))

Number of words: 8534
Most common words: [('i', 20725), ('to', 9053), ('and', 7264), ('the', 5970), ('it', 5312), ('my', 5147), ('a', 4771), ('t', 3801), ('of', 3748), ('me', 3514), ('that', 3275), ('m', 3230), ('in', 2622), ('is', 2442), ('for', 2422)]
Most common bitext: [('im', 3219), ('dont', 1602), ('andi', 1486), ('ihave', 1180), ('idon', 1179), ('ive', 1166), ('its', 1120), ('wantto', 1072), ('ican', 1021), ('thati', 869), ('iwas', 839), ('buti', 827), ('cant', 816), ('iam', 760), ('iwant', 642), ('ifi', 610), ('ijust', 605), ('ifeel', 591), ('mylife', 573), ('tobe', 521), ('inthe', 520), ('todo', 519), ('feellike', 462), ('iknow', 456), ('ofmy', 446), ('wheni', 440), ('becausei', 432), ('inmy', 422), ('iti', 414), ('mei', 411), ('todie', 395), ('tknow', 393), ('goingto', 383), ('togo', 367), ('doit', 357), ('tothe', 338), ('killmyself', 323), ('havea', 322), ('ina', 317), ('ithink', 305), ('vebeen', 304), ('forthe', 303), ('id', 302), ('likei', 293), ('icould', 291), ('mnot', 2

In [14]:
word_features = []
bitext_features = []
for i in all_words.most_common(2500):
    word_features.append(i[0])
for  i in all_bitext.most_common(300):
    bitext_features.append(i[0])

In [15]:
attempts_label = label['Outcomes - Self-Harm/Suicide Attempts.1']
planning_label = label['Outcomes - Ideation/Planning.1']
print(attempts_label, planning_label)

1298    0.0
715     0.0
1233    0.0
1526    0.0
807     0.0
       ... 
121     1.0
1140    0.0
1327    0.0
1378    0.0
712     0.0
Name: Outcomes - Self-Harm/Suicide Attempts.1, Length: 1541, dtype: float64 1298    1.0
715     1.0
1233    1.0
1526    1.0
807     1.0
       ... 
121     0.0
1140    1.0
1327    1.0
1378    1.0
712     1.0
Name: Outcomes - Ideation/Planning.1, Length: 1541, dtype: float64


In [16]:
lexiconData = pd.read_excel('/content/drive/MyDrive/ActiveLearningStigma/lexicon_Bi_Words.xlsx',engine='openpyxl')

In [17]:
lexicon = [] 
for word in lexiconData['Words']:
  splitWord = word.split(',')
  lexicon.append(tuple(splitWord))
lexicon

[('want', 'die'),
 ('get', 'better'),
 ('want', 'live'),
 ('commit', 'suicid'),
 ('want', 'kill'),
 ('want', 'end'),
 ('live', 'life'),
 ('need', 'help'),
 ('suicid', 'thought'),
 ('self', 'harm'),
 ('end', 'life'),
 ('get', 'wors'),
 ('die', 'want'),
 ('tri', 'kill'),
 ('mental', 'ill'),
 ('suicid', 'attempt'),
 ('pleas', 'help'),
 ('get', 'help'),
 ('like', 'shit'),
 ('feel', 'better'),
 ('attempt', 'suicid'),
 ('life', 'want'),
 ('good', 'enough'),
 ('peopl', 'care'),
 ('wanna', 'die'),
 ('kill', 'self'),
 ('go', 'kill'),
 ('need', 'someon'),
 ('take', 'anymor'),
 ('suicid', 'note')]

In [18]:
metk= MWETokenizer(lexicon)

In [19]:
train_content = processed
train_attempts_label = attempts_label
train_planning_label = planning_label

def find_features(message):
    bitexts = []
    words = metk.tokenize(tk.tokenize(message))
    for i in range(len(words)-1):
        bitexts.append(words[i] + words[i+1])
    features = {}
    for bitext in bitext_features:
        features[bitext] = (bitext in bitexts)
    for word in word_features:
        features[word] = (word in words)
    return features

In [20]:
# Define a function to compute the max length of sequence
def max_length(sequences):
    max_length = 0
    for i, seq in enumerate(sequences):
        length = len(seq)
        if max_length < length:
            max_length = length
    return max_length

In [21]:
#load glove embedding
embeddings_dict = {}
with open("/content/drive/MyDrive/ActiveLearningStigma/glove.42B.300d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

KeyboardInterrupt: ignored

In [None]:
def training_words_in_word2vector(word_to_vec_map, word_to_index):
    '''
    input:
        word_to_vec_map: a word2vec GoogleNews-vectors-negative300.bin model loaded using gensim.models
        word_to_index: word to index mapping from training set
    '''
    
    vocab_size = len(word_to_index) + 1
    count = 0
    # Set each row "idx" of the embedding matrix to be 
    # the word vector representation of the idx'th word of the vocabulary
    for word, idx in word_to_index.items():
        if word in word_to_vec_map:
            count+=1
            
    return print('Found {} words present from {} training vocabulary in the set of pre-trained word vector'.format(count, vocab_size))

In [None]:
trunc_type='post'
padding_type='post'
oov_tok = "<UNK>"

#Tokenize 
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(train_content)

word_index = tokenizer.word_index
training_words_in_word2vector(embeddings_dict, word_index)




In [22]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
gnb = make_pipeline(TfidfVectorizer(),GaussianNB())
gnb2 = GaussianNB()
vector_1=TfidfVectorizer()
mnb = make_pipeline(TfidfVectorizer(),MultinomialNB())


In [23]:
X=vector_1.fit_transform(processed)
y=attempts_label

In [24]:
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.todense(), y, test_size=0.4, random_state=1)
          
          

In [25]:
gnb2.fit(X_train, y_train)



GaussianNB()

In [26]:
y_pred = gnb2.predict(X_test)



In [27]:
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)

Gaussian Naive Bayes model accuracy(in %): 65.96434359805511


In [28]:
from sklearn.metrics import confusion_matrix 
confusion_matrix(y_test, y_pred)

array([[373, 149],
       [ 61,  34]])

In [29]:
y=planning_label

In [30]:
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.todense(), y, test_size=0.4, random_state=1)

In [31]:
gnb2.fit(X_train, y_train)



GaussianNB()

In [32]:
y_pred = gnb2.predict(X_test)



In [33]:
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)

Gaussian Naive Bayes model accuracy(in %): 78.44408427876823


In [34]:
from sklearn.metrics import confusion_matrix 
confusion_matrix(y_test, y_pred)

array([[ 77,  95],
       [ 38, 407]])