In [82]:
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import ngrams

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/wojtek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/wojtek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wojtek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [71]:
# load the data in csv format using pandas
data = pd.read_csv('../data/train_ready_for_WS.csv', delimiter=';', header=0)

In [72]:
print(data['essay'][91])

Wow, how dare this person hurt a poor flamingo for no reason!? Why would some random dude go to Busch Gardens and abuse a bird who was minding its own business? Poor bird had to be euthanized because of its injuries! The bird was 19 years old! It seems that there may have to be some sort of background check or mental health testing before you are even allowed to visit a place like this.Just unbelievable the way things happen! Soooo SAD!


In [73]:
# remove numbers, punctuation and make everything lowercase
data['essay'] = data['essay'].str.replace('\d+', '', regex=True)
data['essay'] = data['essay'].str.replace('[^\w\s]','', regex=True)
data['essay'] = data['essay'].str.lower()

In [74]:
print(data['essay'][91])

wow how dare this person hurt a poor flamingo for no reason why would some random dude go to busch gardens and abuse a bird who was minding its own business poor bird had to be euthanized because of its injuries the bird was  years old it seems that there may have to be some sort of background check or mental health testing before you are even allowed to visit a place like thisjust unbelievable the way things happen soooo sad


In [75]:
# stopword removal
stop = stopwords.words('english')
data['essay'] = data['essay'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [76]:
print(data['essay'][91])

wow dare person hurt poor flamingo reason would random dude go busch gardens abuse bird minding business poor bird euthanized injuries bird years old seems may sort background check mental health testing even allowed visit place like thisjust unbelievable way things happen soooo sad


In [77]:
# lemmatization
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)
data['essay'] = data['essay'].apply(lemmatize_text)

In [78]:
print(data['essay'][91])

wow dare person hurt poor flamingo reason would random dude go busch garden abuse bird minding business poor bird euthanized injury bird year old seems may sort background check mental health testing even allowed visit place like thisjust unbelievable way thing happen soooo sad


In [79]:
# remove words which occur only once
from collections import Counter
cnt = Counter()
for text in data['essay'].values:
    for word in text.split():
        cnt[word] += 1

# create a set of words that occur only once
once_occurred_words = {word for word, count in cnt.items() if count == 1}

# iterate over the 'essay' column and remove the words that occur only once
data['essay'] = data['essay'].apply(lambda x: ' '.join([word for word in x.split() if word not in once_occurred_words]))

In [80]:
print(data['essay'][91])

wow dare person hurt poor flamingo reason would random dude go busch garden abuse bird minding business poor bird injury bird year old seems may sort background check mental health testing even allowed visit place like unbelievable way thing happen sad


In [83]:
# 1-gram and bi-gram extraction
# Tokenize the essays into words
data['tokens'] = data['essay'].apply(word_tokenize)

# Function to extract n-grams from a list of tokens
def extract_ngrams(tokens, n):
    return list(ngrams(tokens, n))

# Extract 1-grams and 2-grams
data['1-gram'] = data['tokens'].apply(lambda x: extract_ngrams(x, 1))
data['2-gram'] = data['tokens'].apply(lambda x: extract_ngrams(x, 2))

# Convert the n-grams to strings for easier representation
data['1-gram'] = data['1-gram'].apply(lambda x: [' '.join(gram) for gram in x])
data['2-gram'] = data['2-gram'].apply(lambda x: [' '.join(gram) for gram in x])

# Convert the lists of n-grams to DataFrames
data['1-gram'] = pd.DataFrame(data['1-gram'])
data['2-gram'] = pd.DataFrame(data['2-gram'])

In [85]:
print(data['2-gram'][91])

['wow dare', 'dare person', 'person hurt', 'hurt poor', 'poor flamingo', 'flamingo reason', 'reason would', 'would random', 'random dude', 'dude go', 'go busch', 'busch garden', 'garden abuse', 'abuse bird', 'bird minding', 'minding business', 'business poor', 'poor bird', 'bird injury', 'injury bird', 'bird year', 'year old', 'old seems', 'seems may', 'may sort', 'sort background', 'background check', 'check mental', 'mental health', 'health testing', 'testing even', 'even allowed', 'allowed visit', 'visit place', 'place like', 'like unbelievable', 'unbelievable way', 'way thing', 'thing happen', 'happen sad']


In [86]:
# save the preprocessed data
data.to_csv('../data/train_ready_for_WS_preprocessed.csv', sep=';', index=False)