In [None]:
# Mount Google Drive under /content/drive (Colab only); the project directory
# used later in this notebook lives on this mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install -qqq unidecode contractions pyspellchecker wordninja symspellpy

[K     |████████████████████████████████| 235 kB 6.5 MB/s 
[K     |████████████████████████████████| 2.5 MB 50.3 MB/s 
[K     |████████████████████████████████| 541 kB 54.8 MB/s 
[K     |████████████████████████████████| 2.6 MB 42.7 MB/s 
[K     |████████████████████████████████| 106 kB 65.7 MB/s 
[K     |████████████████████████████████| 287 kB 73.2 MB/s 
[K     |████████████████████████████████| 125 kB 69.1 MB/s 
[?25h  Building wheel for wordninja (setup.py) ... [?25l[?25hdone


In [None]:
import os
import pandas as pd
import numpy as np
import spacy
import unidecode
import contractions as contract
import re
import wordninja
import collections
import pkg_resources
from spellchecker import SpellChecker 
from symspellpy import SymSpell, Verbosity

In [None]:
# Switch the working directory to the project folder on the mounted Drive.
# Change the path below to your own directory.
try:
    os.chdir("/content/drive/MyDrive/Suicide/New Data")
except OSError:
    print("Error: Can't change the Current Working Directory")
else:
    print("Directory changed")

Directory changed


In [None]:
# Read the raw dataset from the current working directory, give it a fresh
# 0..n-1 index, and preview the first rows.
df = pd.read_csv('Suicide_dataset.csv').reset_index(drop=True)
df.head()

Unnamed: 0,text,class
0,I recently went through a breakup and she said...,depression
1,"I do not know how to navigate these feelings, ...",depression
2,"So I have been with my bf for 5 months , and h...",depression
3,I am so exhausted of this. Just when I think I...,suicide
4,i could feel the strongest connection and stil...,normal


In [None]:
# Defining methods

# spaCy pipeline used for tokenisation, POS tags, stop-word flags and lemmas.
nlp = spacy.load("en_core_web_sm")
vocab = collections.Counter()  # not updated anywhere in the visible code

# SymSpell spell checker backed by the English frequency dictionaries that
# ship inside the symspellpy package.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

# Both loaders return False (they do not raise) when the file cannot be found.
# Fail loudly here instead of running spell correction against an empty
# dictionary later on.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")
if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2):
    raise FileNotFoundError(f"SymSpell bigram dictionary not found: {bigram_path}")

# Spell Check using Symspell
def fix_spelling(text):
    """Return `text` after SymSpell compound spelling correction.

    Runs ``sym_spell.lookup_compound`` with a maximum edit distance of 2 and
    returns the term of the first suggestion in the resulting list.
    """
    best_suggestion = sym_spell.lookup_compound(text, max_edit_distance=2)[0]
    return best_suggestion.term

# Take "no" and "not" off spaCy's stop-word list so that stop-word removal
# keeps them (presumably because they carry negation).
deselect_stop_words = ['no', 'not']

for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False

# Remove extra whitespaces from text
def remove_whitespace(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    words = text.split()
    return " ".join(words)

# Remove accented characters from text, e.g. café
def remove_accented_chars(text):
    """Return `text` passed through ``unidecode.unidecode`` to strip accents."""
    return unidecode.unidecode(text)

# Remove URL 
def remove_url(text):
    """Strip URLs from `text`.

    A URL is taken to be any run of non-whitespace characters that begins
    with "http" (covering http:// and https://) or with a bare "www."
    prefix at a word boundary. Matching is case-insensitive, so links are
    also removed when this helper runs on text that was not lowercased.

    Only the URL itself is removed; surrounding whitespace is left as is.
    """
    return re.sub(r'http\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)

# Removing symbols and digits
def remove_symbols_digits(text):
    """Replace every character that is not an ASCII letter or whitespace with a space."""
    # Raw string: "\s" inside a plain literal is an invalid escape sequence
    # (DeprecationWarning on older Pythons, SyntaxWarning from Python 3.12).
    return re.sub(r'[^a-zA-Z\s]', ' ', text)
# Removing special characters
def remove_special(text):
    """Turn CR, LF and four-space runs into a single space and drop double quotes.

    The substitutions are applied one after another, in the order listed.
    """
    substitutions = (("\r", " "), ("\n", " "), ("    ", " "), ('"', ''))
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

# Fix word lengthening (characters are wrongly repeated)
def fix_lengthening(text):
    """Shorten any character repeated three or more times in a row to two."""
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

def text_preprocessing(text, accented_chars=True, contractions=True, convert_num=True,
                       extra_whitespace=True, lemmatization=True, lowercase=True,
                       url=True, symbols_digits=True, special_chars=True,
                       stop_words=True, lengthening=True, spelling=True):
    """Clean one piece of text and return the kept tokens joined by spaces.

    Every step is enabled by default and can be switched off with its flag.
    String-level steps run first, in this order, then the text is tokenised
    with spaCy and filtered token by token.

    Args:
        text: The raw text. ``None`` and NaN (how pandas represents an empty
            cell) yield ``""``; any other non-string value is converted with
            ``str()``.
        accented_chars: Apply ``remove_accented_chars``.
        contractions: Expand contractions with ``contract.fix``.
        lowercase: Lowercase the text.
        url: Apply ``remove_url``.
        symbols_digits: Apply ``remove_symbols_digits``.
        special_chars: Apply ``remove_special``.
        extra_whitespace: Apply ``remove_whitespace``.
        lengthening: Apply ``fix_lengthening``.
        spelling: Apply ``fix_spelling``.
        stop_words: Drop tokens spaCy flags as stop words, unless the token
            is tagged ``NUM``.
        convert_num: Drop tokens tagged ``NUM``. Despite the name, nothing is
            converted; number tokens are removed.
        lemmatization: Replace each kept token by its lemma.

    Returns:
        str: The cleaned text; ``""`` when nothing is left.
    """
    # Missing values have nothing to clean; NaN is the only float that is
    # not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    if accented_chars:  # remove accented characters
        text = remove_accented_chars(text)
    if contractions:  # expand contractions
        text = contract.fix(text)
    if lowercase:  # convert all characters to lowercase
        text = text.lower()
    if url:  # remove URLs before removing symbols
        text = remove_url(text)
    if symbols_digits:  # remove symbols and digits
        text = remove_symbols_digits(text)
    if special_chars:  # remove special characters
        text = remove_special(text)
    if extra_whitespace:  # remove extra whitespaces
        text = remove_whitespace(text)
    if lengthening:  # fix word lengthening
        text = fix_lengthening(text)
    if spelling:  # fix spelling
        text = fix_spelling(text)

    doc = nlp(text)  # tokenise text

    clean_text = []
    for token in doc:
        is_number = token.pos_ == 'NUM'
        # Stop words are dropped, except number tokens, which are governed
        # by convert_num alone.
        if stop_words and token.is_stop and not is_number:
            continue
        if convert_num and is_number:
            continue
        word = token.text
        # "-PRON-" is a placeholder lemma for pronouns in spaCy 2.x; keep the
        # surface form instead of emitting the placeholder.
        if lemmatization and token.lemma_ != "-PRON-":
            word = token.lemma_
        if word:
            clean_text.append(word)
    return " ".join(clean_text)

In [None]:
 # Show the column labels exactly as read from the CSV.
 df.columns

Index(['                                                                           text', '    class'], dtype='object')

In [None]:
# Test the pipeline on the first 20 rows only.
# The CSV header pads the text column label with leading spaces, so look the
# column up by its stripped name instead of hard-coding the padding.
text_col = next((col for col in df.columns if str(col).strip() == 'text'), None)
if text_col is None:
    raise KeyError("no 'text' column found in df")

# assign() returns a new frame, so df itself is not left with a 'cleaned_text'
# column that is NaN for every row past the first 20.
df[:20].assign(cleaned_text=lambda subset: subset[text_col].apply(text_preprocessing))

Unnamed: 0,text,class,cleaned_text
0,I recently went through a breakup and she said...,depression,recently go breakup say want friend say try ta...
1,"I do not know how to navigate these feelings, ...",depression,not know navigate feeling not new feeling stre...
2,"So I have been with my bf for 5 months , and h...",depression,month tell depressed week particular happen fe...
3,I am so exhausted of this. Just when I think I...,suicide,exhausted think finally rest think maybe thing...
4,i could feel the strongest connection and stil...,normal,feel strong connection divine self
5,i feel privileged to have the earthly father t...,normal,feel privileged earthly father far great privi...
6,i feel even more hated,normal,feel hated
7,i can tell pms is at work because i feel so we...,normal,tell pms work feel weepy
8,Help me for ideas simple healthy meals to make...,depression,help idea simple healthy meal feel depressed s...
9,it is looming around the corner again. It alwa...,suicide,loom corner come like wave like moon hear sile...


In [None]:
# Apply preprocessing to all data.
# The CSV header pads the text column label with leading spaces, so look the
# column up by its stripped name instead of hard-coding the padding.
text_col = next((col for col in df.columns if str(col).strip() == 'text'), None)
if text_col is None:
    raise KeyError("no 'text' column found in df")

df['cleaned_text'] = df[text_col].apply(text_preprocessing)

In [None]:
# Write the frame to CSV in the current working directory, without the index.
df.to_csv(path_or_buf='suicide_full_cleaned.csv', index=False)