<a href="https://colab.research.google.com/github/WesselBoi/Suicidal-Content-detection/blob/main/data_preprocessing_small.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime; force_remount=True re-mounts even if
# /content/drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Install the text-cleaning dependencies quietly (-qqq), then download the small
# English spaCy model that is loaded later via spacy.load("en_core_web_sm").
!pip install -qqq unidecode contractions pyspellchecker wordninja symspellpy spacy
!python -m spacy download en_core_web_sm


Mounted at /content/drive
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m93.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import collections
import importlib.resources
import os
import re

import contractions as contract
import numpy as np
import pandas as pd
import pkg_resources
import spacy
import unidecode
import wordninja
from spellchecker import SpellChecker
from symspellpy import SymSpell, Verbosity

  import pkg_resources


In [None]:
# Work from the project folder on Drive so that relative paths such as
# 'Data/...' used by later cells resolve against it.
project_dir = "/content/drive/MyDrive/SuicideModel"
os.chdir(project_dir)
print("✅ Directory changed")

✅ Directory changed


In [None]:
# Load the raw dataset. The first CSV column is read as the index and then
# discarded in favour of a fresh 0..n-1 index.
df = pd.read_csv('Data/Suicide_Detection.csv', index_col=0).reset_index(drop=True)
df.head()

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,Am I weird I don't get affected by compliments...,non-suicide
2,Finally 2020 is almost over... So I can never ...,non-suicide
3,i need helpjust help me im crying so hard,suicide
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [None]:
# Sanity check: (rows, columns) of the freshly loaded frame.
df.shape

(232074, 2)

In [None]:
# Shared NLP resources used by the preprocessing helpers defined in this cell.
nlp = spacy.load("en_core_web_sm")
# Token counter; not referenced by the cells visible here, kept for any later use.
vocab = collections.Counter()
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Locate the frequency dictionaries bundled with the symspellpy package.
# importlib.resources replaces the deprecated pkg_resources.resource_filename lookup.
_symspell_data = importlib.resources.files("symspellpy")
dictionary_path = str(_symspell_data / "frequency_dictionary_en_82_765.txt")
bigram_path = str(_symspell_data / "frequency_bigramdictionary_en_243_342.txt")

# load_dictionary / load_bigram_dictionary return False (they do not raise) when the
# file cannot be found. Fail loudly here so spell correction never runs silently
# against an empty dictionary.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")
if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2):
    raise FileNotFoundError(f"SymSpell bigram dictionary not found: {bigram_path}")

def fix_spelling(text):
    """Spell-correct a whole phrase with SymSpell's compound lookup.

    Queries the module-level ``sym_spell`` instance with a maximum edit
    distance of 2 and returns the ``term`` of the first suggestion. If the
    lookup yields no suggestions, ``text`` is returned unchanged.
    """
    candidates = sym_spell.lookup_compound(text, max_edit_distance=2)
    if not candidates:
        return text
    return candidates[0].term

# Clear the stop-word flag on the negations 'no' and 'not' so that the
# stop-word filter in text_preprocessing keeps them.
deselect_stop_words = ['no', 'not']
for w in deselect_stop_words:
    lexeme = nlp.vocab[w]
    lexeme.is_stop = False

def remove_whitespace(text):
    """Trim both ends and squeeze every whitespace run into a single space."""
    # str.split() with no separator already ignores leading/trailing whitespace,
    # so joining its pieces trims and collapses in one step.
    words = text.split()
    return " ".join(words)

def remove_accented_chars(text):
    """Return ``text`` after passing it through ``unidecode.unidecode``."""
    return unidecode.unidecode(text)

def remove_url(text):
    """Delete every run of non-whitespace characters that starts with ``http``.

    The match needs at least one character after ``http``, so a bare ``http``
    followed by whitespace is left in place.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

def remove_symbols_digits(text):
    """Replace every character that is not an ASCII letter or whitespace with a space.

    Digits, punctuation and non-ASCII letters each become one space; whitespace
    characters are left as they are.

    Args:
        text: input string.

    Returns:
        The string with each disallowed character replaced by ``' '``.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning when the cell is compiled.
    return re.sub(r'[^a-zA-Z\s]', ' ', text)

def remove_special(text):
    """Turn CR/LF into spaces, shrink 4-space runs to one space, drop double quotes.

    The substitutions are applied one after another in the listed order, so a
    4-space run created by the CR/LF replacements is itself collapsed.
    """
    substitutions = (
        ("\r", " "),
        ("\n", " "),
        ("    ", " "),
        ('"', ''),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

def fix_lengthening(text):
    """Cap any run of three or more identical characters at exactly two."""
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

def text_preprocessing(text, accented_chars=True, contractions=True, convert_num=True,
                       extra_whitespace=True, lemmatization=True, lowercase=True,
                       url=True, symbols_digits=True, special_chars=True,
                       stop_words=True, lengthening=True, spelling=True):
    """Normalise one text and return its kept tokens joined by single spaces.

    Stage 1 applies the enabled string-level steps in a fixed order; stage 2
    parses the result with the module-level ``nlp`` pipeline and filters or
    lemmatises the tokens.

    Args:
        text: the string to clean.
        accented_chars: apply ``remove_accented_chars``.
        contractions: expand contractions with ``contract.fix``.
        convert_num: despite the name, when True tokens tagged ``NUM`` are
            dropped (nothing is converted).
        extra_whitespace: apply ``remove_whitespace``.
        lemmatization: emit ``token.lemma_`` instead of ``token.text``.
        lowercase: lower-case the text.
        url: apply ``remove_url``.
        symbols_digits: apply ``remove_symbols_digits``.
        special_chars: apply ``remove_special``.
        stop_words: drop tokens flagged ``is_stop`` unless they are tagged ``NUM``.
        lengthening: apply ``fix_lengthening``.
        spelling: apply ``fix_spelling``.

    Returns:
        A single string of the surviving tokens separated by spaces.
    """
    # Stage 1: string-level normalisation. The order matters (for example,
    # lower-casing happens before spell correction) and is kept as is.
    if accented_chars:
        text = remove_accented_chars(text)
    if contractions:
        text = contract.fix(text)
    if lowercase:
        text = text.lower()
    if url:
        text = remove_url(text)
    if symbols_digits:
        text = remove_symbols_digits(text)
    if special_chars:
        text = remove_special(text)
    if extra_whitespace:
        text = remove_whitespace(text)
    if lengthening:
        text = fix_lengthening(text)
    if spelling:
        text = fix_spelling(text)

    # Stage 2: token-level filtering on the parsed text.
    kept = []
    for token in nlp(text):
        is_number = token.pos_ == 'NUM'
        # Stop words are dropped, except numerals, which the next rule handles.
        if stop_words and token.is_stop and not is_number:
            continue
        if convert_num and is_number:
            continue
        # "-PRON-" is presumably the pronoun placeholder lemma of older spaCy
        # releases; such tokens keep their surface form.
        if lemmatization and token.lemma_ != "-PRON-":
            word = token.lemma_
        else:
            word = token.text
        if word != "":
            kept.append(word)
    return " ".join(kept)


  return re.sub('[^a-zA-Z\s]', ' ', text)


In [None]:
def batch_preprocessing(texts, batch_size=100, n_process=2):
    """Clean every text in ``texts`` with ``text_preprocessing`` (default options).

    Earlier this function streamed the raw texts through ``nlp.pipe`` and used
    only ``doc.text`` of each result, which is the input string again, before
    handing it to ``text_preprocessing``, which parses the text itself. Every
    document therefore went through the spaCy pipeline twice and the first
    pass was discarded. That redundant pass is removed; the returned values
    are unchanged.

    Args:
        texts: iterable of strings to clean.
        batch_size: accepted for backward compatibility; currently unused
            because ``text_preprocessing`` parses one text at a time.
        n_process: accepted for backward compatibility; currently unused.

    Returns:
        A list with the cleaned string for each input, in input order.
    """
    return [text_preprocessing(text) for text in texts]

# Clean every row's text with the default options and store the result in a
# new 'cleaned_text' column.
df['cleaned_text'] = batch_preprocessing(df['text'].tolist())


In [None]:
# Sanity check: (rows, columns) after adding the 'cleaned_text' column.
df.shape

(232074, 3)

In [None]:
# Spot-check the cleaned text of the row at position 23000.
df.iloc[23000].cleaned_text

'recently go attempt depth thread question repercussion fail treatment lead reach'

In [None]:
# Preview the first rows with the original and cleaned text side by side.
df.head()

Unnamed: 0,text,class,cleaned_text
0,Ex Wife Threatening SuicideRecently I left my ...,suicide,sex wife threaten suicide recently leave wife ...
1,Am I weird I don't get affected by compliments...,non-suicide,weird not affect compliment come know girl fee...
2,Finally 2020 is almost over... So I can never ...,non-suicide,finally hear bad year swear fuck god annoying
3,i need helpjust help me im crying so hard,suicide,need help help cry hard
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide,lose hello adam struggle year afraid past year...


In [None]:
# Export the cleaned dataset (all current columns of df, including 'cleaned_text').
# index=False keeps the row index out of the CSV.
df.to_csv('Data/suicide_detection_full_cleaned.csv', index=False)