<a href="https://colab.research.google.com/github/RavinduKodithuwakku/suicide-thoughts-detection/blob/dev/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive so the dataset and output paths under
# /content/drive/MyDrive used later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from time import time
from collections import Counter

In [3]:
# Shared random seed: passed to train_test_split (random_state) and to Word2Vec (seed).
SEED = 4222
# Number of Word2Vec training epochs.
EPOCHS = 5

In [4]:
# Read the cleaned corpus, encode the labels as integers (suicide -> 1,
# non-suicide -> 0), drop the raw text column and expose the cleaned text
# under the column name "text". Built as one chain so no step mutates in place.
suicide_detection_df = (
    pd.read_csv('/content/drive/MyDrive/fyp/dataset/suicide_detection_final_cleaned.csv', header=0)
    .reset_index(drop=True)
    .replace({"class": {"suicide": 1, "non-suicide": 0}})
    .drop(columns=['text'])
    .rename(columns={"cleaned_text": "text"})
)
suicide_detection_df

Unnamed: 0,class,text
0,1,sex wife threaten suicide recently leave wife ...
1,0,weird not affect compliment come know girl fee...
2,0,finally hear bad year swear fuck god annoying
3,1,need help help cry hard
4,1,end tonight not anymore quit
...,...,...
175969,0,today go sled friend not like pretty big miles...
175970,0,not like rock not go
175971,0,tell friend not lonely deprive buy little nigh...
175972,0,pee probably taste like salty tea drink pee co...


In [5]:
# Split dataset into train (80%) and test (20%) sets, stratified on the label.
# Only two splits are produced here; no separate validation set is created.
train_text, test_text, train_labels, test_labels = train_test_split(suicide_detection_df['text'], suicide_detection_df['class'],
                                                                    random_state=SEED,
                                                                    test_size=0.2,
                                                                    stratify=suicide_detection_df['class'])

word2vec

Building a vocab

In [6]:
# whitespace-tokenise every non-missing training document
tokens_list = [sentence.split() for sentence in train_text.dropna()]
# count how often each token occurs across all training documents
vocab = Counter(word for sentence_tokens in tokens_list for word in sentence_tokens)
# keep only tokens that occur at least `min_occurance` times
min_occurance = 2
tokens = [word for word, count in vocab.items() if count >= min_occurance]
print(len(tokens))

18947


In [7]:
# save list to file
def save_list(lines, filename):
	"""Write the given strings to ``filename``, one per line.

	Args:
		lines: iterable of strings; each item becomes one line of the file.
		filename: path of the file to create or overwrite.

	Returns:
		None. The file contains the items joined by newlines, with no
		trailing newline.
	"""
	# convert lines to a single blob of text
	data = '\n'.join(lines)
	# the context manager closes the handle even if the write raises;
	# the encoding is pinned so the file does not depend on the platform locale
	with open(filename, 'w', encoding='utf-8') as file:
		file.write(data)
 
# save the frequency-filtered tokens to a vocabulary file
# (pass `tokens`, not the raw `vocab` counter, so the min_occurance filter
# computed above actually takes effect)
save_list(tokens, '/content/drive/MyDrive/fyp/vocab.txt')

In [8]:
# load doc into memory
def load_doc(filename):
	"""Read a whole text file and return its contents as one string.

	Args:
		filename: path of the file to read.

	Returns:
		The full text of the file.
	"""
	# the context manager closes the handle even if the read raises;
	# the encoding is pinned so decoding does not depend on the platform locale
	with open(filename, 'r', encoding='utf-8') as file:
		return file.read()

# read the saved vocabulary file back and keep its unique
# whitespace-separated entries for membership tests
vocab_filename = '/content/drive/MyDrive/fyp/vocab.txt'
vocab = set(load_doc(vocab_filename).split())

Removing out-of-vocab words

In [9]:
# clean each line
def clean_line(line, vocab):
  """Tokenise one document and keep only the tokens found in ``vocab``.

  Args:
    line: the document text; ``None`` or a float NaN (a missing pandas
      value) is treated as an empty document.
    vocab: container of allowed tokens, queried with ``in``.

  Returns:
    A one-element list holding the list of kept tokens, so callers can
    extend a list of documents with the result.
  """
  # a missing document has no tokens; return an empty token list instead of
  # failing on .split(), so the output stays aligned with the input rows
  if line is None or (isinstance(line, float) and line != line):
    return [[]]
  # filter out tokens not in vocab
  tokens_clean = [w for w in line.split() if w in vocab]
  return [tokens_clean]

# clean entire dataset
def process_lines(data, vocab):
  """Apply ``clean_line`` to every document in ``data``.

  Args:
    data: iterable of documents, each passed to ``clean_line``.
    vocab: container of allowed tokens, forwarded to ``clean_line``.

  Returns:
    A flat list collecting every item that ``clean_line`` returns, in
    input order.
  """
  return [doc_tokens for doc in data for doc_tokens in clean_line(doc, vocab)]

In [12]:
# Filter both splits down to in-vocabulary tokens; each result is a list of
# token lists, one per document.
# NOTE(review): test_clean is not used by any later cell shown in this notebook.
train_clean = process_lines(train_text, vocab)
test_clean = process_lines(test_text, vocab)


Training the model

In [13]:
# set up the parameters of the model
# gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`, and the old
# names raise TypeError there. Try the current names first and fall back to the
# 3.x names so this cell runs on either major version.
w2v_params = dict(window=10, min_count=1, seed=SEED)
try:
  model = Word2Vec(vector_size=300, epochs=EPOCHS, **w2v_params)
except TypeError:
  model = Word2Vec(size=300, iter=EPOCHS, **w2v_params)
# NOTE(review): per the gensim docs, `seed` alone does not give a fully
# reproducible run; that also needs workers=1 and a fixed PYTHONHASHSEED.
# Left unchanged here because workers=1 slows training.

# build the vocabulary from the tokenised training documents; this also
# initialises the model weights
t = time()
model.build_vocab(train_clean, progress_per=1000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

# training the model
t = time()
model.train(train_clean, total_examples=model.corpus_count, epochs=EPOCHS, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))



Time to build vocab: 0.06 mins
Time to train the model: 0.68 mins


In [16]:
# save the trained word vectors (model.wv) in ASCII (word2vec) format;
# binary=False selects the text variant rather than the binary one
filename = '/content/drive/MyDrive/fyp/embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)