# Preprocessing Text Data

In [1]:
import nltk
import pandas as pd
import numpy as np
import string 
import re
import contractions
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/roberto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/roberto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/roberto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df = pd.read_parquet('../Data/cleaned_suicide_detection.parquet')

stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
def preprocessing(text):
    """Apply preprocessing steps to prepare text for classification"""

    text = contractions.fix(text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[_\W]+', ' ', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

In [5]:
df['text'] = df['text'].apply(preprocessing)

df.sample(5)

Unnamed: 0,text,class
74503,"[rteenagers, please, help, thesis, hey, guy, b...",non-suicide
189213,"[music, suggestion, please, like, everything, ...",non-suicide
153796,"[herei, know, type, post, allowed, come, subre...",suicide
131761,"[downvote, post, reason, know]",non-suicide
27706,"[fuck, hopping, bandwagon, httpsmusictastespac...",non-suicide


In [6]:
tokens_count = df['text'].apply(len)
tokens_count.describe()

count    231979.000000
mean         59.129456
std         100.364899
min           0.000000
25%          13.000000
50%          28.000000
75%          69.000000
max        5717.000000
Name: text, dtype: float64

In [7]:
vectorizer = Word2Vec(
    sentences = df['text'],
    vector_size = 150,
    window = 5,
    min_count = 3,
    sg = 1,
    seed = 13,
    workers = 10
)

def vectorize_doc(doc, model):
    vectors = [model.wv[word] for word in doc if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [8]:
df['text'] = df['text'].apply(lambda tokens: vectorize_doc(tokens, vectorizer))
df['text'] = df['text'].apply(lambda arr: [float(x) for x in np.array(arr, dtype=np.float64)])
df['class'] = df['class'].map({'non-suicide': 0, 'suicide': 1})
df.sample(5)

Unnamed: 0,text,class
31535,"[-0.134375661611557, -0.04869122430682182, 0.1...",0
8156,"[0.02591336891055107, 0.0018674135208129883, 0...",0
92721,"[-0.11975962668657303, 0.010417267680168152, 0...",0
12735,"[-0.03838827833533287, -0.07487594336271286, 0...",0
122482,"[0.08334995806217194, 0.0761137306690216, 0.03...",1


In [10]:
df.to_parquet('../Data/preprocessing_data.parquet')
vectorizer.save('../Outputs/Models/Vectorizer.model')