# NLTK

#### Install NLTK

In [None]:
%%bash
pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#### Download models or corpora

In [None]:
# import nltk
# Launches NLTK's interactive downloader from the shell.
# NOTE: without an interactive terminal the "Downloader>" prompt cannot read input and the
# command ends with the traceback captured below; for a non-interactive download call
# nltk.download('<package>') from Python instead.
!python -m nltk.downloader # shows a window when graphical output available

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> Traceback (most recent call last):


During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.9/dist-packages/nltk/downloader.py", line 2550, in <module>
    downloader.download(
  File "/usr/local/lib/python3.9/dist-packages/nltk/downloader.py", line 763, in download
    self._interactive_download()
  File "/usr/local/lib/python3.9/dist-packages/nltk/downloader.py", line 1115, in _interactive_download
    DownloaderShell(self).run()
  File "/usr/local/lib/pyth

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Tokenization

In [None]:
tweet = "RT @lOR42wsOEFcv3f: I fall too fast, crash too hard, forgive too easily and care too much... :( #amiright"

In [None]:
query = 'fast'

The naive way...

In [None]:
tweet.find(query)

31

In [None]:
tweet.split()

['RT',
 '@lOR42wsOEFcv3f:',
 'I',
 'fall',
 'too',
 'fast,',
 'crash',
 'too',
 'hard,',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much...',
 ':(',
 '#amiright']

In [None]:
[query in tweet.split()]

[False]

Correct tokenization: informed splitting of the text into tokens

In [None]:

nltk.word_tokenize(tweet)

['RT',
 '@',
 'lOR42wsOEFcv3f',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':',
 '(',
 '#',
 'amiright']

In [None]:
[query in nltk.word_tokenize(tweet)]
# query

[True]

In [None]:
nltk.word_tokenize(tweet, language='spanish')

['RT',
 '@',
 'lOR42wsOEFcv3f',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':',
 '(',
 '#',
 'amiright']

More options...

In [None]:
from nltk.tokenize import RegexpTokenizer
# The pattern '[a-zA-Z0-9]*' also matches the empty string, which is why the tokenized
# output contains many '' entries between the alphanumeric runs.
# NOTE(review): '[a-zA-Z0-9]+' would return only the non-empty runs — confirm whether the
# empty tokens are an intentional part of this demonstration.
custom_tokenizer = RegexpTokenizer('[a-zA-Z0-9]*', discard_empty=False)

In [None]:
custom_tokenizer.tokenize(tweet)

['RT',
 '',
 '',
 'lOR42wsOEFcv3f',
 '',
 '',
 'I',
 '',
 'fall',
 '',
 'too',
 '',
 'fast',
 '',
 '',
 'crash',
 '',
 'too',
 '',
 'hard',
 '',
 '',
 'forgive',
 '',
 'too',
 '',
 'easily',
 '',
 'and',
 '',
 'care',
 '',
 'too',
 '',
 'much',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'amiright',
 '']

In [None]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

In [None]:
tweet_tokenizer.tokenize(tweet)

['RT',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(',
 '#amiright']

In [None]:
from nltk.tokenize import MWETokenizer
mwe = MWETokenizer()
mwe.add_mwe(('too', 'fast'))
mwe.tokenize(tweet_tokenizer.tokenize(tweet))

['RT',
 ':',
 'I',
 'fall',
 'too_fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(',
 '#amiright']

In [None]:
# add_mwe() registers ONE multi-word expression per call (a sequence of word strings).
# Passing a tuple of tuples would store a single expression whose "words" are tuples,
# which can never match a list of string tokens, so each expression is added separately.
# Re-adding ('too', 'fast') is harmless: it is already in the lexicon.
for expression in (('too', 'fast'), ('too', 'hard')):
    mwe.add_mwe(expression)

In [None]:
query = 'fast'
query in mwe.tokenize(tweet_tokenizer.tokenize(tweet))

False

### Normalization

In [None]:
tweet.lower()

'rt @lor42wsoefcv3f: i fall too fast, crash too hard, forgive too easily and care too much... :( #amiright'

In [None]:
import re
import string

def normalize_tokens(tokenized_text):
    """Lowercase, accent-fold and filter a sequence of tokens.

    Args:
        tokenized_text: iterable of string tokens (e.g. the output of a tokenizer).

    Returns:
        A list of lowercased tokens consisting only of the letters a-z.
        Hashtags, punctuation, numbers and any token containing another
        character are dropped.
    """
    # Lowercase
    tokens = [t.lower() for t in tokenized_text]
    # Remove hashtags
    tokens = [t for t in tokens if not t.startswith('#')]
    # Remove punctuation
    tokens = [t for t in tokens if t not in string.punctuation]
    # Normalize characters. This must run BEFORE the letters-only filter below:
    # otherwise a token such as 'rápido' is discarded by the filter and the
    # substitution never has anything to act on. Only 'á' is folded here.
    tokens = [re.sub('á', 'a', t) for t in tokens]
    # Keep only letters
    tokens = [t for t in tokens if re.match('^[a-z]+$', t)]

    return tokens

In [None]:
spanish_query = 'muy rápido'
normalize_tokens(tweet_tokenizer.tokenize(spanish_query))

['muy']

In [None]:
!pip install unidecode
import unidecode
unidecode.unidecode(spanish_query)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 KB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


'muy rapido'

In [None]:
normalize_tokens(tweet_tokenizer.tokenize(tweet))

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much']

#### Uniform normalization principle

In [None]:
query = 'TOO fast TOO furious'
tokenized_query = tweet_tokenizer.tokenize(query)
normalized_query = normalize_tokens(tokenized_query)
# normalized_query = tokenized_query
normalized_query

['too', 'fast', 'too', 'furious']

In [None]:
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
# normalized_tweet = normalize_tokens(tweet.split())
normalized_tweet

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much']

In [None]:
common_words = set(normalized_query).intersection(normalized_tweet)
print(common_words)
print(len(common_words), "common word(s)")

{'fast', 'too'}
2 common word(s)


#### Stemming / Lemmatization


In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer


In [None]:
stemmer = PorterStemmer()

[stemmer.stem(t) for t in normalized_tweet]

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgiv',
 'too',
 'easili',
 'and',
 'care',
 'too',
 'much']

In [None]:
stemmer = nltk.LancasterStemmer() # is prone to overstemming
[stemmer.stem(t) for t in normalized_tweet]


['rt',
 'i',
 'fal',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forg',
 'too',
 'easy',
 'and',
 'car',
 'too',
 'much']

In [None]:
stemmer = SnowballStemmer(language='english') # Porter2

[stemmer.stem(t) for t in normalized_tweet]

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgiv',
 'too',
 'easili',
 'and',
 'care',
 'too',
 'much']

In [None]:
print(stemmer.stem("running"))

print(stemmer.stem("runs"))

print(stemmer.stem("ran"))

print(stemmer.stem("darling"))

print(stemmer.stem("are"))

print(stemmer.stem("bring"))

print(stemmer.stem("being"))

print(stemmer.stem("Charles"))


run
run
ran
darl
are
bring
be
charl


In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

[lemmatizer.lemmatize(t) for t in normalized_tweet]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much']

In [None]:
nltk.download('averaged_perceptron_tagger')


tagged_tweet = nltk.pos_tag(normalized_tweet) # More on this next lab...
print(tagged_tweet)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('rt', 'NN'), ('i', 'NN'), ('fall', 'VBP'), ('too', 'RB'), ('fast', 'JJ'), ('crash', 'NN'), ('too', 'RB'), ('hard', 'JJ'), ('forgive', 'JJ'), ('too', 'RB'), ('easily', 'RB'), ('and', 'CC'), ('care', 'VB'), ('too', 'RB'), ('much', 'JJ')]


In [None]:
from nltk.corpus import wordnet as wn
tag_map = {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV, 'N': wn.NOUN}
def get_lemmas(tokenized_text):
    """Lemmatize tokens using their part-of-speech tags.

    Each token is tagged with nltk.pos_tag; the first letter of the tag is
    looked up in the module-level `tag_map` to pick the WordNet POS, falling
    back to noun when the letter is not in the map. The module-level
    `lemmatizer` then produces the lemma.

    Args:
        tokenized_text: list of string tokens.

    Returns:
        A list with one lemma per input token, in the same order.
    """
    lemmas = []
    for word, pos_tag in nltk.pos_tag(tokenized_text):
        wordnet_pos = tag_map.get(pos_tag[0], wn.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    return lemmas


In [None]:
query = "the fastest!"
normalized_query = normalize_tokens(tweet_tokenizer.tokenize(query))
print(normalized_query)

['the', 'fastest']


In [None]:
lemmatized_tweet = get_lemmas(normalized_tweet)
lemmatized_query = get_lemmas(normalized_query)
print(lemmatized_tweet)
print(lemmatized_query)


['rt', 'i', 'fall', 'too', 'fast', 'crash', 'too', 'hard', 'forgive', 'too', 'easily', 'and', 'care', 'too', 'much']
['the', 'fast']


In [None]:
tweet = "I am so fast, I am the fastest!"
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
normalized_tweet


['i', 'am', 'so', 'fast', 'i', 'am', 'the', 'fastest']

In [None]:
[lemmatizer.lemmatize(t) for t in normalized_tweet]


['i', 'am', 'so', 'fast', 'i', 'am', 'the', 'fastest']

In [None]:
get_lemmas(normalized_tweet)

['i', 'be', 'so', 'fast', 'i', 'be', 'the', 'fast']

In [None]:
# `lemmatized_tweet` was last computed for the previous tweet, before `tweet` and
# `normalized_tweet` were redefined, so refresh it from the current tokens before comparing.
lemmatized_tweet = get_lemmas(normalized_tweet)
print("Common words:", set(lemmatized_tweet).intersection(set(lemmatized_query)))

Common words: {'fast'}


#### Stopwords

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
blacklist_words = stopwords.words('english') + ['rt']

In [None]:
cleaned_tweet = [t for t in normalized_tweet if t not in blacklist_words]
print(cleaned_tweet)

['fast', 'fastest']


#### Vocabulary

In [None]:
from collections import Counter

Counter(get_lemmas(normalized_tweet)).most_common(5)

[('i', 2), ('be', 2), ('fast', 2), ('so', 1), ('the', 1)]

In [None]:
tweet = "I am so fast, I am the fastest!"
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
lemmatized_tweet = get_lemmas(normalized_tweet)
print(lemmatized_tweet)

['i', 'be', 'so', 'fast', 'i', 'be', 'the', 'fast']


In [None]:
print(Counter(normalized_tweet))
print(Counter(lemmatized_tweet))

Counter({'i': 2, 'am': 2, 'so': 1, 'fast': 1, 'the': 1, 'fastest': 1})
Counter({'i': 2, 'be': 2, 'fast': 2, 'so': 1, 'the': 1})


#### Sentence segmentation

In [None]:
query = "I am too fast. I am too furious."

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
sent_tokenize(query)

['I am too fast.', 'I am too furious.']

In [None]:
spanish_tokenizer = nltk.data.load('tokenizers/punkt/PY3/spanish.pickle')
spanish_query = 'Soy muy rápido! Estoy muy furioso!'
spanish_tokenizer.tokenize(spanish_query)

['Soy muy rápido!', 'Estoy muy furioso!']

In [None]:
sent_tokenize("J.K. Rowling is rich. I am not as rich as J.K.")

['J.K. Rowling is rich.', 'I am not as rich as J.K.']

In [None]:
from nltk.tokenize import PunktSentenceTokenizer
PunktSentenceTokenizer??

#### Numeral conversion

In [None]:
!pip install word2number
!pip install num2words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5582 sha256=e4d5bbbfaef94378c2ff3296d760969fcfc83e6632e19958f21fe44b896d58ac
  Stored in directory: /root/.cache/pip/wheels/a0/4a/5b/d2f2df5c344ddbecb8bea759872c207ea91d93f57fb54e816e
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting num2words
  Downloading num2words-0.5.12-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 KB[0m [31m4.4 MB/s[0m eta [36m0:

In [None]:
import word2number
from word2number import w2n
w2n.word_to_num("eleven")


11

In [None]:
w2n.word_to_num("twenty three")

23

In [None]:
from num2words import num2words

In [None]:
num2words(12)

'twelve'

In [None]:
num2words(101)


'one hundred and one'

In [None]:
num2words(2020)

'two thousand and twenty'

In [None]:
w2n.word_to_num("Twelve o'clock!")

12

### Exercise (1p)


Find a recent news article online.
Read it into a Python variable (enter it manually or read it from a file).

Write a function that normalizes the text and splits it into tokens. Add flags to customize the different preprocessing choices (which stemmer/lemmatizer to use, whether to lowercase, whether to convert numbers, whether to remove stopwords, ...).

Store the vocabulary of unique tokens found in the text.

Compare the number of unique tokens ("types") with different preprocessing settings.

In [None]:
raw_text = 'The cat (Felis catus) is a domestic species of small carnivorous mammal.[1][2] It is the only domesticated species in the family Felidae and is commonly referred to as the domestic cat or house cat to distinguish it from the wild members of the family.[4] Cats are commonly kept as house pets but can also be farm cats or feral cats; the feral cat ranges freely and avoids human contact.[5] Domestic cats are valued by humans for companionship and their ability to kill rodents. About 60 cat breeds are recognized by various cat registries.[6] The cat is similar in anatomy to the other felid species: it has a strong flexible body, quick reflexes, sharp teeth, and retractable claws adapted to killing small prey. Its night vision and sense of smell are well developed. Cat communication includes vocalizations like meowing, purring, trilling, hissing, growling, and grunting as well as cat-specific body language. Although the cat is a social species, it is a solitary hunter. As a predator, it is crepuscular, i.e. most active at dawn and dusk. It can hear sounds too faint or too high in frequency for human ears, such as those made by mice and other small mammals.[7] It also secretes and perceives pheromones.[8] '

In [None]:
def _filter_tokens(tokens, drop_punctuation, drop_non_alpha, drop_stopwords):
    """Return `tokens` with the enabled filters applied (punctuation, non-alphabetic, stopwords)."""
    if drop_punctuation:
        punctuation_chars = set(string.punctuation)
        # Drop tokens made up solely of punctuation characters; a plain
        # `t not in string.punctuation` is a substring test and lets
        # multi-character tokens such as '...' through.
        tokens = [t for t in tokens if not all(ch in punctuation_chars for ch in t)]

    if drop_non_alpha:
        # Accept both cases so that capitalised words survive when the caller
        # asked for lowercase=False.
        tokens = [t for t in tokens if re.match('^[A-Za-z]+$', t)]

    if drop_stopwords:
        # The stopword list is lowercase; compare case-insensitively and use a
        # set for constant-time membership tests.
        stopword_set = set(blacklist_words)
        tokens = [t for t in tokens if t.lower() not in stopword_set]

    return tokens


def preprocesare(text, lowercase=True, remove_numbers=False, remove_stopwords=True, stemming=None, lemmatization=None, punctuation=True):
    """Normalize a raw text and split it into tokens.

    Args:
        text: the raw input string.
        lowercase: lowercase every token when True.
        remove_numbers: when True, keep only tokens made entirely of ASCII
            letters. Besides numbers this also drops tokens such as
            'cat-specific' or 'i.e'. It defaults to False so that numbers in
            the running text are kept; bracketed references like "[1]" are
            removed regardless (see below).
        remove_stopwords: drop tokens found in the module-level
            `blacklist_words` list when True.
        stemming: None, 'porter' or 'snowball'.
        lemmatization: None or 'wordnet'.
        punctuation: when True, punctuation tokens are REMOVED.

    Returns:
        The list of processed tokens.

    Raises:
        ValueError: if `stemming` or `lemmatization` names an unsupported option.
    """
    # Remove text within parentheses/brackets/braces,
    # e.g. "(Felis catus)" and "[1][2]" in
    # "The cat (Felis catus) is a domestic species of small carnivorous mammal.[1][2]"
    text = re.sub(r'[\(\[\{].*?[\)\]\}]', '', text)

    # Tokenize words
    tokens = nltk.word_tokenize(text, language='english')

    # Lowercase
    if lowercase:
        tokens = [t.lower() for t in tokens]

    # First filtering pass, on the surface forms
    tokens = _filter_tokens(tokens, punctuation, remove_numbers, remove_stopwords)

    # Choose stemmer
    if stemming:
        if stemming == 'porter':
            stemmer = PorterStemmer()
        elif stemming == 'snowball':
            stemmer = SnowballStemmer('english')
        else:
            raise ValueError("Choose valid stemmer!")
        tokens = [stemmer.stem(t) for t in tokens]

    # Choose lemmatizer
    if lemmatization:
        if lemmatization == 'wordnet':
            lemmatizer = WordNetLemmatizer()
        else:
            raise ValueError("Choose valid lemmatizer!")
        tokens = [lemmatizer.lemmatize(t) for t in tokens]

    # Second filtering pass: stemming/lemmatization can turn a token into a
    # stopword (or another filtered form), so the same filters run once more.
    tokens = _filter_tokens(tokens, punctuation, remove_numbers, remove_stopwords)

    return tokens


In [None]:
# Run the same text through three preprocessing configurations and compare the token counts.
preprocessed_text_1 = preprocesare(raw_text, stemming='porter')
preprocessed_text_2 = preprocesare(raw_text, lemmatization='wordnet', remove_numbers=True)
preprocessed_text_3 = preprocesare(raw_text, stemming='porter', remove_stopwords=False)

print(f"Tokens length with stemming and default settings: {len(preprocessed_text_1)}")
print(f"Tokens length with lemmatization and removing numbers: {len(preprocessed_text_2)}")
# The third run leaves remove_numbers at its default (False), so the label must not claim
# that numbers were removed.
print(f"Tokens length with stemming and keeping stopwords: {len(preprocessed_text_3)}")

Tokens length with stemming and default settings: 118
Tokens length with lemmatization and removing numbers: 115
Tokens length with stemming, removing numbers and keeping stopwords: 203
