In [16]:
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import ngrams

# Fetch the NLTK resources used further down: the Punkt tokenizer data
# (word_tokenize), WordNet (WordNetLemmatizer) and the stopword lists.
# 'punkt_tab' is requested in addition to 'punkt' because NLTK 3.9+ loads the
# tokenizer from that package; without it word_tokenize raises LookupError on
# current NLTK releases. Already-present packages are simply reported as
# up-to-date.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /Users/wojtek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/wojtek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wojtek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Path stem (no extension) of the dataset to preprocess: '.csv' is appended
# when loading and '_preprocessed.csv' when saving.
filename = "../data/train_ready_for_WS"
# Uncomment to process the test file instead; the rare-word filter below is
# skipped whenever the path contains 'test'.
# filename = '../data/test'

In [18]:
# Read the semicolon-separated CSV into a DataFrame; row 0 supplies the
# column names.
data = pd.read_csv(
    f"{filename}.csv",
    sep=';',
    header=0,
)

In [19]:
# Spot-check: essay at index label 91 as loaded, before any preprocessing.
print(data.loc[91, 'essay'])

I was bullied in school, which led to self harm, and eventually a suicide attempt. This article reminded me so much of myself, and I want to do something to put a stop this bullying. As I sit here with tears welling my eyes, now a mother, I can't imagine being in the shoes of these parents who lost their children all because other kids just can't be nice. I am so sad for these kids who saw no other way out of this mess than to just end their lives. Their pain is felt deeply in my soul, and my wish is that somehow, someday, no one else will feel that kind of pain.


In [20]:
# Normalise the essay text: drop digit runs, drop every character that is
# neither a word character nor whitespace, then lowercase.
# The patterns are raw string literals so that \d, \w and \s reach the regex
# engine as written; in a plain literal they are invalid string escapes, which
# emit a SyntaxWarning on Python 3.12+ (DeprecationWarning on earlier 3.x).
# Note that \w includes '_', so underscores survive the punctuation pass.
data['essay'] = (
    data['essay']
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
)

In [21]:
# Spot-check: essay 91 after digit/punctuation removal and lowercasing.
print(data.loc[91, 'essay'])

i was bullied in school which led to self harm and eventually a suicide attempt this article reminded me so much of myself and i want to do something to put a stop this bullying as i sit here with tears welling my eyes now a mother i cant imagine being in the shoes of these parents who lost their children all because other kids just cant be nice i am so sad for these kids who saw no other way out of this mess than to just end their lives their pain is felt deeply in my soul and my wish is that somehow someday no one else will feel that kind of pain


In [22]:
# stopword removal
# `stop` stays a list (as returned by NLTK) for any later use; membership is
# tested against a set so each word costs O(1) instead of a scan of the list.
# NOTE(review): punctuation was stripped before this step, so apostrophe-bearing
# stopwords (e.g. "don't") cannot match tokens such as 'dont' — confirm that
# this ordering is intended.
stop = stopwords.words('english')
stop_lookup = set(stop)
data['essay'] = data['essay'].apply(
    lambda essay: ' '.join(word for word in essay.split() if word not in stop_lookup)
)

In [23]:
# Spot-check: essay 91 after stopword removal.
print(data.loc[91, 'essay'])

bullied school led self harm eventually suicide attempt article reminded much want something put stop bullying sit tears welling eyes mother cant imagine shoes parents lost children kids cant nice sad kids saw way mess end lives pain felt deeply soul wish somehow someday one else feel kind pain


In [24]:
# lemmatization
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Return `text` with every token replaced by its WordNet lemma.

    The text is split with `word_tokenize`, each token is passed to
    `lemmatizer.lemmatize` without a part-of-speech argument (so the
    lemmatizer's default, noun, applies — inflected verbs are left as they
    are), and the results are re-joined with single spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))


data['essay'] = data['essay'].apply(lemmatize_text)

In [25]:
# Spot-check: essay 91 after lemmatization.
print(data.loc[91, 'essay'])

bullied school led self harm eventually suicide attempt article reminded much want something put stop bullying sit tear welling eye mother cant imagine shoe parent lost child kid cant nice sad kid saw way mess end life pain felt deeply soul wish somehow someday one else feel kind pain


In [26]:
# remove words which occur only once
from collections import Counter

# Corpus-wide frequency of every whitespace-separated word across all essays.
cnt = Counter(word for text in data['essay'].values for word in text.split())

# Build the set of words to drop: those seen exactly once in the corpus.
# When the path contains 'test' nothing is dropped. The empty case is an
# explicit set() so the variable has the same type in both branches (a bare
# `{}` would be an empty dict).
if 'test' in filename:
    once_occurred_words = set()
else:
    once_occurred_words = {word for word, count in cnt.items() if count == 1}

# Rebuild each essay without the words selected above.
data['essay'] = data['essay'].apply(
    lambda essay: ' '.join(
        word for word in essay.split() if word not in once_occurred_words
    )
)

In [27]:
# Spot-check: essay 91 after removing words that occur only once.
print(data.loc[91, 'essay'])

bullied school led self harm eventually suicide attempt article reminded much want something put stop bullying sit tear welling eye mother cant imagine shoe parent lost child kid cant nice sad kid saw way mess end life pain felt deeply soul wish somehow someday one else feel kind pain


In [28]:
# 1-gram and bi-gram extraction
# Tokenize the essays into words
data['tokens'] = data['essay'].apply(word_tokenize)


def extract_ngrams(tokens, n):
    """Return every contiguous n-gram of `tokens` as a list of n-tuples."""
    return list(ngrams(tokens, n))


def _ngram_strings(tokens, n):
    """Return the n-grams of `tokens` as space-joined strings."""
    return [' '.join(gram) for gram in extract_ngrams(tokens, n)]


# Each cell of the new columns holds a list of n-gram strings for that essay.
# The lists are stored in the columns directly; wrapping a column in a
# one-column DataFrame and assigning it back would leave it unchanged, so no
# further conversion step is performed.
data['1-gram'] = data['tokens'].apply(lambda tokens: _ngram_strings(tokens, 1))
data['2-gram'] = data['tokens'].apply(lambda tokens: _ngram_strings(tokens, 2))

In [29]:
# Spot-check: bigram list built for essay 91.
print(data.loc[91, '2-gram'])

['bullied school', 'school led', 'led self', 'self harm', 'harm eventually', 'eventually suicide', 'suicide attempt', 'attempt article', 'article reminded', 'reminded much', 'much want', 'want something', 'something put', 'put stop', 'stop bullying', 'bullying sit', 'sit tear', 'tear welling', 'welling eye', 'eye mother', 'mother cant', 'cant imagine', 'imagine shoe', 'shoe parent', 'parent lost', 'lost child', 'child kid', 'kid cant', 'cant nice', 'nice sad', 'sad kid', 'kid saw', 'saw way', 'way mess', 'mess end', 'end life', 'life pain', 'pain felt', 'felt deeply', 'deeply soul', 'soul wish', 'wish somehow', 'somehow someday', 'someday one', 'one else', 'else feel', 'feel kind', 'kind pain']


In [30]:
# save the preprocessed data
# Written next to the input as '<filename>_preprocessed.csv', semicolon-separated,
# without the index column.
# NOTE(review): 'tokens', '1-gram' and '2-gram' hold Python lists; CSV stores
# them as their string repr, so a reader has to parse them back
# (e.g. ast.literal_eval) — confirm the downstream loader does this.
output_path = f'{filename}_preprocessed.csv'
data.to_csv(output_path, sep=';', index=False)