In [1]:

import tensorflow as tf
import pandas as pd
import spacy
from symspellpy import SymSpell, Verbosity
import collections
import importlib.resources as pkg_resources
import unidecode
import contractions as contract
import re
import importlib
import tensorflow as tf

In [2]:
data = pd.read_csv("Suicide_Detection.csv/Suicide_Detection.csv")

In [3]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide
5,11,Honetly idkI dont know what im even doing here...,suicide
6,12,[Trigger warning] Excuse for self inflicted bu...,suicide
7,13,It ends tonight.I can’t do it anymore. \nI quit.,suicide
8,16,"Everyone wants to be ""edgy"" and it's making me...",non-suicide
9,18,My life is over at 20 years oldHello all. I am...,suicide


In [4]:
data.shape

(232074, 3)

In [5]:
df = data.drop(data.columns[0], axis = 1)

In [6]:
df.head(4)

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,Am I weird I don't get affected by compliments...,non-suicide
2,Finally 2020 is almost over... So I can never ...,non-suicide
3,i need helpjust help me im crying so hard,suicide


In [7]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


# Data preprocessing

In [8]:
# Defining methods

nlp = spacy.load("en_core_web_sm") 

vocab = collections.Counter()

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

#dictionary_path = importlib.resources_filename(
#    "symspellpy", "frequency_dictionary_en_82_765.txt")
# Access the dictionary path
dictionary_path = pkg_resources.files("symspellpy").joinpath("frequency_dictionary_en_82_765.txt")
dictionary_path_str = str(dictionary_path)

# Access the bigram path
bigram_path = pkg_resources.files("symspellpy").joinpath("frequency_bigramdictionary_en_243_342.txt")
bigram_path_str = str(bigram_path)

#bigram_path = resources.resource_filename(
#symspellpy", "frequency_bigramdictionary_en_243_342.txt")

sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

True

In [9]:
# Spell Check using Symspell
def fix_spelling(text):
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    correctedtext = suggestions[0].term # get the first suggestion, otherwise returns original text if nothing is corrected 
    return correctedtext 

In [10]:
# Remove some important words from stopwords list 
deselect_stop_words = ['no', 'not']
    
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False

In [11]:

# Remove extra whitespaces from text
def remove_whitespace(text):
    text = text.strip()
    return " ".join(text.split())

In [12]:
# Remove accented characters from text, e.g. café
def remove_accented_chars(text):
    text = unidecode.unidecode(text)
    return text


In [13]:
# Remove URL 
def remove_url(text):
    return re.sub(r'http\S+', '', text)

In [14]:
# Removing symbols and digits
def remove_symbols_digits(text):
    return re.sub('[^a-zA-Z]', ' ', text)

In [15]:
# Removing special characters
def remove_special(text):
    return text.replace("\r", " ").replace("\n", " ").replace("    ", " ").replace('"', '')

In [16]:
# Fix word lengthening (characters are wrongly repeated)
def fix_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

In [17]:
def text_preprocessing(text, accented_chars=True, contractions=True, convert_num=True, 
                       extra_whitespace=True, lemmatization=True, lowercase=True, 
                       url=True, symbols_digits=True, special_chars=True, 
                       stop_words=True, lengthening=True, spelling=True):
    """preprocess text with default option set to true for all steps"""
    if accented_chars == True: # remove accented characters
        text = remove_accented_chars(text)
    if contractions == True: # expand contractions
        text = contract.fix(text)
    if lowercase == True: # convert all characters to lowercase
        text = text.lower()
    if url == True: # remove URLs before removing symbols 
        text = remove_url(text)
    if symbols_digits == True: # remove symbols and digits
        text = remove_symbols_digits(text)
    if special_chars == True: # remove special characters
        text = remove_special(text)
    if extra_whitespace == True: # remove extra whitespaces
        text = remove_whitespace(text)
    if lengthening == True: # fix word lengthening
        text = fix_lengthening(text)
    if spelling == True: # fix spelling
        text = fix_spelling(text)

    doc = nlp(text) # tokenise text

    clean_text = []

    # return text
    
    for token in doc:
        flag = True
        edit = token.text
        # remove stop words
        if stop_words == True and token.is_stop and token.pos_ != 'NUM': 
            flag = False
        # exclude number words
        if convert_num == True and token.pos_ == 'NUM' and flag == True:
            flag = False
        # convert tokens to base form
        elif lemmatization == True and token.lemma_ != "-PRON-" and flag == True:
            edit = token.lemma_
        # append tokens edited and not removed to list 
        if edit != "" and flag == True:
            clean_text.append(edit)  
    return " ".join(clean_text)

In [18]:
# Test functions on a subset of 20 rows
df['cleaned_text'] = df['text'][:20].apply(lambda row: text_preprocessing(row))


In [19]:
df[:20]

Unnamed: 0,text,class,cleaned_text
0,Ex Wife Threatening SuicideRecently I left my ...,suicide,sex wife threaten suicide recently leave wife ...
1,Am I weird I don't get affected by compliments...,non-suicide,weird not affect compliment come know girl fee...
2,Finally 2020 is almost over... So I can never ...,non-suicide,finally hear bad year swear fucking god annoying
3,i need helpjust help me im crying so hard,suicide,need help help cry hard
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide,lose hello adam struggle year afraid past year...
5,Honetly idkI dont know what im even doing here...,suicide,honestly d not know feel like feel unbearably ...
6,[Trigger warning] Excuse for self inflicted bu...,suicide,trigger warn excuse self inflict burn know cri...
7,It ends tonight.I can’t do it anymore. \nI quit.,suicide,end tonight not anymore quit
8,"Everyone wants to be ""edgy"" and it's making me...",non-suicide,want edgy make self conscious feel like not st...
9,My life is over at 20 years oldHello all. I am...,suicide,life year old hello year old bald male hairlin...


In [20]:
# Apply preprocessing to all data
df['cleaned_text'] = df['text'].apply(lambda row: text_preprocessing(row))

In [21]:
# Export cleaned dataset
df.to_csv('suicide_detection_full_cleaned_2.csv', index=False)

In [22]:
df.describe

<bound method NDFrame.describe of                                                      text        class  \
0       Ex Wife Threatening SuicideRecently I left my ...      suicide   
1       Am I weird I don't get affected by compliments...  non-suicide   
2       Finally 2020 is almost over... So I can never ...  non-suicide   
3               i need helpjust help me im crying so hard      suicide   
4       I’m so lostHello, my name is Adam (16) and I’v...      suicide   
...                                                   ...          ...   
232069  If you don't like rock then your not going to ...  non-suicide   
232070  You how you can tell i have so many friends an...  non-suicide   
232071  pee probably tastes like salty tea😏💦‼️ can som...  non-suicide   
232072  The usual stuff you find hereI'm not posting t...      suicide   
232073  I still haven't beaten the first boss in Hollo...  non-suicide   

                                             cleaned_text  
0       sex wife 

In [None]:
#convert the text to vector

In [None]:
# pass the text to embedding 

In [None]:
# pass the embedding to lstm / GRU / Transformer

In [None]:
import tensorflow as tf

In [None]:
tf.config.list_physical_devices()

In [None]:
tf.device('/device:GPU:0')

In [None]:
tf.config.list_physical_devices()