Objective : redo form there : https://datastud.dev/posts/nlp-preprocess

Learn :
* NLTK
* PyTorch
* Spacy

# Import Libraries

In [27]:
import string

from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet', quiet=True)

from torchtext.data import get_tokenizer

from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JeremySCHNEIDER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Custom functions

In [29]:
def remove_stopwords(input_text):
    return [token for token in input_text if token.lower() not in stopwords.words('english')]

def remove_punctuation(input_text):
    return [token for token in input_text if token not in set(string.punctuation)]

def lemmatize(input_text):
    # Instantiate class
    lem = WordNetLemmatizer()
    # Lemmatized text becomes input inside all loop runs
    lemmatized_text = input_text
    # Lemmatize each part of speech
    for part_of_speech in ['n', 'v', 'a', 'r', 's']:
        lemmatized_text = [lem.lemmatize(token, part_of_speech).lower() for token in lemmatized_text]
    return lemmatized_text

# Open files

In [15]:
with open("DATA/the_wind_cries_mary.txt", "r") as f:
    lines = f.readlines()

# NLTK

In [16]:
lines

['After all the jacks are in their boxes\n',
 'And the clowns have all gone to bed\n',
 'You can hear happiness staggering on down the street\n',
 'Footprints dressed in red\n',
 'And the wind whispers Mary \n',
 'A broom is drearily sweeping\n',
 'Up the broken pieces of yesterdays life\n',
 'Somewhere a queen is weeping\n',
 'Somewhere a king has no wife\n',
 'And the wind, it cries Mary \n',
 'The traffic lights they all true blue tomorrow\n',
 'And shine their emptiness down on my bed\n',
 'The tiny island sags downstream\n',
 'Cause the life that lived is, is dead\n',
 'And the wind screams Mary \n',
 'Will the wind ever remember\n',
 'The names it has blown in the past\n',
 "And with his crutch, its old age, and it's wisdom\n",
 'It whispers no, this will be the last\n',
 'And the wind cries Mary ']

In [17]:
lines_tokenized = [word_tokenize(line) for line in lines]
lines_tokenized

[['After', 'all', 'the', 'jacks', 'are', 'in', 'their', 'boxes'],
 ['And', 'the', 'clowns', 'have', 'all', 'gone', 'to', 'bed'],
 ['You',
  'can',
  'hear',
  'happiness',
  'staggering',
  'on',
  'down',
  'the',
  'street'],
 ['Footprints', 'dressed', 'in', 'red'],
 ['And', 'the', 'wind', 'whispers', 'Mary'],
 ['A', 'broom', 'is', 'drearily', 'sweeping'],
 ['Up', 'the', 'broken', 'pieces', 'of', 'yesterdays', 'life'],
 ['Somewhere', 'a', 'queen', 'is', 'weeping'],
 ['Somewhere', 'a', 'king', 'has', 'no', 'wife'],
 ['And', 'the', 'wind', ',', 'it', 'cries', 'Mary'],
 ['The', 'traffic', 'lights', 'they', 'all', 'true', 'blue', 'tomorrow'],
 ['And', 'shine', 'their', 'emptiness', 'down', 'on', 'my', 'bed'],
 ['The', 'tiny', 'island', 'sags', 'downstream'],
 ['Cause', 'the', 'life', 'that', 'lived', 'is', ',', 'is', 'dead'],
 ['And', 'the', 'wind', 'screams', 'Mary'],
 ['Will', 'the', 'wind', 'ever', 'remember'],
 ['The', 'names', 'it', 'has', 'blown', 'in', 'the', 'past'],
 ['And',
  '

## Stopwords

In [18]:
# View stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [21]:
# Apply stopword function
tokens_without_stopwords = [remove_stopwords(line) for line in lines_tokenized]
tokens_without_stopwords

[['jacks', 'boxes'],
 ['clowns', 'gone', 'bed'],
 ['hear', 'happiness', 'staggering', 'street'],
 ['Footprints', 'dressed', 'red'],
 ['wind', 'whispers', 'Mary'],
 ['broom', 'drearily', 'sweeping'],
 ['broken', 'pieces', 'yesterdays', 'life'],
 ['Somewhere', 'queen', 'weeping'],
 ['Somewhere', 'king', 'wife'],
 ['wind', ',', 'cries', 'Mary'],
 ['traffic', 'lights', 'true', 'blue', 'tomorrow'],
 ['shine', 'emptiness', 'bed'],
 ['tiny', 'island', 'sags', 'downstream'],
 ['Cause', 'life', 'lived', ',', 'dead'],
 ['wind', 'screams', 'Mary'],
 ['wind', 'ever', 'remember'],
 ['names', 'blown', 'past'],
 ['crutch', ',', 'old', 'age', ',', "'s", 'wisdom'],
 ['whispers', ',', 'last'],
 ['wind', 'cries', 'Mary']]

In [26]:
# Apply punctuation function
tokens_without_punctuation = [remove_punctuation(line) for line in tokens_without_stopwords]
tokens_without_punctuation

[['jacks', 'boxes'],
 ['clowns', 'gone', 'bed'],
 ['hear', 'happiness', 'staggering', 'street'],
 ['Footprints', 'dressed', 'red'],
 ['wind', 'whispers', 'Mary'],
 ['broom', 'drearily', 'sweeping'],
 ['broken', 'pieces', 'yesterdays', 'life'],
 ['Somewhere', 'queen', 'weeping'],
 ['Somewhere', 'king', 'wife'],
 ['wind', 'cries', 'Mary'],
 ['traffic', 'lights', 'true', 'blue', 'tomorrow'],
 ['shine', 'emptiness', 'bed'],
 ['tiny', 'island', 'sags', 'downstream'],
 ['Cause', 'life', 'lived', 'dead'],
 ['wind', 'screams', 'Mary'],
 ['wind', 'ever', 'remember'],
 ['names', 'blown', 'past'],
 ['crutch', 'old', 'age', "'s", 'wisdom'],
 ['whispers', 'last'],
 ['wind', 'cries', 'Mary']]

## Lemmatization

In [28]:
lem = WordNetLemmatizer()
lem.lemmatize('cries')

'cry'

In [31]:
# Apply lemmatize function
tokens_lemmatized = [lemmatize(line) for line in tokens_without_punctuation]
tokens_lemmatized

[['jack', 'box'],
 ['clown', 'go', 'bed'],
 ['hear', 'happiness', 'stagger', 'street'],
 ['footprints', 'dress', 'red'],
 ['wind', 'whisper', 'mary'],
 ['broom', 'drearily', 'sweep'],
 ['break', 'piece', 'yesterday', 'life'],
 ['somewhere', 'queen', 'weep'],
 ['somewhere', 'king', 'wife'],
 ['wind', 'cry', 'mary'],
 ['traffic', 'light', 'true', 'blue', 'tomorrow'],
 ['shine', 'emptiness', 'bed'],
 ['tiny', 'island', 'sag', 'downstream'],
 ['cause', 'life', 'live', 'dead'],
 ['wind', 'scream', 'mary'],
 ['wind', 'ever', 'remember'],
 ['name', 'blow', 'past'],
 ['crutch', 'old', 'age', "'s", 'wisdom'],
 ['whisper', 'last'],
 ['wind', 'cry', 'mary']]

# PyTorch

In [5]:
pytorch_tokenizer = get_tokenizer("basic_english")
pytorch_tokens = [pytorch_tokenizer(line) for line in lines]
pytorch_tokens

[['after', 'all', 'the', 'jacks', 'are', 'in', 'their', 'boxes'],
 ['and', 'the', 'clowns', 'have', 'all', 'gone', 'to', 'bed'],
 ['you',
  'can',
  'hear',
  'happiness',
  'staggering',
  'on',
  'down',
  'the',
  'street'],
 ['footprints', 'dressed', 'in', 'red'],
 ['and', 'the', 'wind', 'whispers', 'mary'],
 ['a', 'broom', 'is', 'drearily', 'sweeping'],
 ['up', 'the', 'broken', 'pieces', 'of', 'yesterdays', 'life'],
 ['somewhere', 'a', 'queen', 'is', 'weeping'],
 ['somewhere', 'a', 'king', 'has', 'no', 'wife'],
 ['and', 'the', 'wind', ',', 'it', 'cries', 'mary'],
 ['the', 'traffic', 'lights', 'they', 'all', 'true', 'blue', 'tomorrow'],
 ['and', 'shine', 'their', 'emptiness', 'down', 'on', 'my', 'bed'],
 ['the', 'tiny', 'island', 'sags', 'downstream'],
 ['cause', 'the', 'life', 'that', 'lived', 'is', ',', 'is', 'dead'],
 ['and', 'the', 'wind', 'screams', 'mary'],
 ['will', 'the', 'wind', 'ever', 'remember'],
 ['the', 'names', 'it', 'has', 'blown', 'in', 'the', 'past'],
 ['and',
  '

# Spacy

In [6]:
nlp = English()
spacy_tokenizer = Tokenizer(nlp.vocab)
spacy_tokens = [spacy_tokenizer(line) for line in lines]
spacy_tokens # spacy return a doc object, not just list of words. See more here: https://spacy.io/api/doc

[After all the jacks are in their boxes,
 And the clowns have all gone to bed,
 You can hear happiness staggering on down the street,
 Footprints dressed in red,
 And the wind whispers Mary ,
 A broom is drearily sweeping,
 Up the broken pieces of yesterdays life,
 Somewhere a queen is weeping,
 Somewhere a king has no wife,
 And the wind, it cries Mary ,
 The traffic lights they all true blue tomorrow,
 And shine their emptiness down on my bed,
 The tiny island sags downstream,
 Cause the life that lived is, is dead,
 And the wind screams Mary ,
 Will the wind ever remember,
 The names it has blown in the past,
 And with his crutch, its old age, and it's wisdom,
 It whispers no, this will be the last,
 And the wind cries Mary ]

In [7]:
[[token.text for token in line] for line in spacy_tokens]

[['After', 'all', 'the', 'jacks', 'are', 'in', 'their', 'boxes', '\n'],
 ['And', 'the', 'clowns', 'have', 'all', 'gone', 'to', 'bed', '\n'],
 ['You',
  'can',
  'hear',
  'happiness',
  'staggering',
  'on',
  'down',
  'the',
  'street',
  '\n'],
 ['Footprints', 'dressed', 'in', 'red', '\n'],
 ['And', 'the', 'wind', 'whispers', 'Mary', '\n'],
 ['A', 'broom', 'is', 'drearily', 'sweeping', '\n'],
 ['Up', 'the', 'broken', 'pieces', 'of', 'yesterdays', 'life', '\n'],
 ['Somewhere', 'a', 'queen', 'is', 'weeping', '\n'],
 ['Somewhere', 'a', 'king', 'has', 'no', 'wife', '\n'],
 ['And', 'the', 'wind,', 'it', 'cries', 'Mary', '\n'],
 ['The', 'traffic', 'lights', 'they', 'all', 'true', 'blue', 'tomorrow', '\n'],
 ['And', 'shine', 'their', 'emptiness', 'down', 'on', 'my', 'bed', '\n'],
 ['The', 'tiny', 'island', 'sags', 'downstream', '\n'],
 ['Cause', 'the', 'life', 'that', 'lived', 'is,', 'is', 'dead', '\n'],
 ['And', 'the', 'wind', 'screams', 'Mary', '\n'],
 ['Will', 'the', 'wind', 'ever', 're