# Preprocessing Text Data

In [12]:
import nltk
import pandas as pd
import numpy as np
import string 
import re
import contractions
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/roberto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/roberto/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/roberto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
df = pd.read_parquet('../Data/cleaned_suicide_detection.parquet')

stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [5]:
def preprocessing(text):
    """Apply preprocessing steps to prepare text for classification"""

    text = contractions.fix(text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[_\W]+', ' ', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

In [7]:
df['text'] = df['text'].apply(preprocessing)

df.sample(5)

Unnamed: 0,text,class
10066,"[unpopular, opinion, sugar, cooky, better, cho...",non-suicide
124461,"[help, anyone, going, kind, long, letting, kno...",non-suicide
80604,"[came, everyone, talked, mom, school, posted, ...",non-suicide
117600,"[want, okso, wanting, die, like, fucking, lose...",suicide
187188,"[considering, suicidei, full, time, student, k...",suicide


In [10]:
tokens_count = df['text'].apply(len)
tokens_count.describe()

count    231979.000000
mean         59.129456
std         100.364899
min           0.000000
25%          13.000000
50%          28.000000
75%          69.000000
max        5717.000000
Name: text, dtype: float64

In [13]:
vectorizer = Word2Vec(
    sentences = df['text'],
    vector_size = 150,
    window = 5,
    min_count = 3,
    sg = 1,
    seed = 13,
    workers = 10
)

def vectorize_doc(doc, model):
    vectors = [model.wv[word] for word in doc if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [None]:
df['text'] = df['text'].apply(lambda tokens: vectorize_doc(tokens, vectorizer))
df['text'] = df['text'].apply(lambda arr: [float(x) for x in np.array(arr, dtype=np.float64)])
df['class'] = df['class'].map({'non-suicide': 0, 'suicide': 1})
df.sample(5)

Unnamed: 0,text,class
224665,"[0.13732357, -0.016881073, 0.10697746, -0.0657...",1
8828,"[0.07704612, -0.02935385, 0.10011502, -0.15534...",1
159472,"[0.050362483, 0.03871868, 0.104064636, 0.00309...",0
141778,"[0.008742183, -0.08327679, 0.092561945, -0.011...",0
150841,"[0.04087665, 0.023715034, 0.18240535, -0.05154...",0


In [None]:
df.to_parquet('../Data/preprocessing_data.parquet')