# NLTK

#### Install NLTK

In [None]:
%%bash
pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#### Download models or corpora

In [None]:
# import nltk
!python -m nltk.downloader # shows a window when graphical output available

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/nltk/downloader.py", line 1113, in _interactive_download
  File "/usr/local/lib/python3.8/dist-packages/nltk/downloader.py", line 1394, in __init__
    top = self.top = Tk()
  File "/usr/lib/python3.8/tkinter/__init__.py", line 2270, in __init__
    self.tk = _tkinter.create(screenName, baseName, className, interactive, wantobjects, useTk, sync, use)
_tkinter.TclError: no display name and no $DISPLAY environment variable

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/py

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Tokenization

In [None]:
tweet = "RT @lOR42wsOEFcv3f: I fall too fast, crash too hard, forgive too easily and care too much... :( #amiright"

In [None]:
query = 'fast'

The naive way...

In [None]:
tweet.find(query)

31

In [None]:
tweet.split()

['RT',
 '@lOR42wsOEFcv3f:',
 'I',
 'fall',
 'too',
 'fast,',
 'crash',
 'too',
 'hard,',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much...',
 ':(',
 '#amiright']

In [None]:
[query in tweet.split()]

[False]

Correct tokenization: informed splitting of the text into tokens

In [None]:
nltk.word_tokenize(tweet)

['RT',
 '@',
 'lOR42wsOEFcv3f',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':',
 '(',
 '#',
 'amiright']

In [None]:
[query in nltk.word_tokenize(tweet)]
# query

[True]

In [None]:
nltk.word_tokenize(tweet, language='spanish')

['RT',
 '@',
 'lOR42wsOEFcv3f',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':',
 '(',
 '#',
 'amiright']

More options...

In [None]:
from nltk.tokenize import RegexpTokenizer
custom_tokenizer = RegexpTokenizer('[a-zA-Z0-9]*', discard_empty=False)

In [None]:
custom_tokenizer.tokenize(tweet)

['RT',
 '',
 '',
 'lOR42wsOEFcv3f',
 '',
 '',
 'I',
 '',
 'fall',
 '',
 'too',
 '',
 'fast',
 '',
 '',
 'crash',
 '',
 'too',
 '',
 'hard',
 '',
 '',
 'forgive',
 '',
 'too',
 '',
 'easily',
 '',
 'and',
 '',
 'care',
 '',
 'too',
 '',
 'much',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'amiright',
 '']

In [None]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

In [None]:
tweet_tokenizer.tokenize(tweet)

['RT',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(',
 '#amiright']

In [None]:
from nltk.tokenize import MWETokenizer
mwe = MWETokenizer()
mwe.add_mwe(('too', 'fast'))
mwe.tokenize(tweet_tokenizer.tokenize(tweet))

['RT',
 ':',
 'I',
 'fall',
 'too_fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(',
 '#amiright']

In [None]:
# add_mwe() registers ONE multi-word expression per call, given as a sequence
# of token strings. Passing a tuple of tuples makes each inner tuple count as
# a single "token", which can never match the string tokens being merged, so
# the expressions are added one at a time instead.
for expression in (('too', 'fast'), ('too', 'hard')):
    mwe.add_mwe(expression)

In [None]:
query = 'fast'
query in mwe.tokenize(tweet_tokenizer.tokenize(tweet))

False

### Normalization

In [None]:
tweet.lower()

'rt @lor42wsoefcv3f: i fall too fast, crash too hard, forgive too easily and care too much... :( #amiright'

In [None]:
import re
import string
import unicodedata

def _strip_accents(token):
    """Return ``token`` with its diacritics removed (e.g. 'rápido' -> 'rapido')."""
    # NFD splits every accented letter into base letter + combining mark(s);
    # the marks are dropped and NFC then re-composes whatever is left.
    decomposed = unicodedata.normalize('NFD', token)
    bare = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return unicodedata.normalize('NFC', bare)

def normalize_tokens(tokenized_text, letters_only=False):
    """Normalize a list of tokens for matching.

    Steps, in order: lowercase every token, drop tokens that start with '#',
    drop punctuation tokens, strip accents, and optionally keep only tokens
    made of the letters a-z.

    Args:
        tokenized_text: Iterable of token strings.
        letters_only: If true, keep only tokens that consist entirely of the
            letters a-z after the other steps. Defaults to False.

    Returns:
        A new list with the normalized tokens, in their original order.
    """
    # Lowercase
    tokens = [t.lower() for t in tokenized_text]
    # Remove hashtags
    tokens = [t for t in tokens if not t.startswith('#')]
    # Remove punctuation. `in` on a str is a substring test, so this drops
    # single punctuation characters (and the empty string) but keeps tokens
    # such as '...' or ':(' that are not a substring of string.punctuation.
    tokens = [t for t in tokens if t not in string.punctuation]
    # Normalize characters: strip every diacritic, not only 'á'
    tokens = [_strip_accents(t) for t in tokens]
    # Keep only letters (optional; runs last so accented words survive)
    if letters_only:
        tokens = [t for t in tokens if re.fullmatch('[a-z]+', t)]

    return tokens

In [None]:
spanish_query = 'muy rápido'
normalize_tokens(tweet_tokenizer.tokenize(spanish_query))

['muy', 'rapido']

In [None]:
!pip install unidecode
import unidecode
unidecode.unidecode(spanish_query)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 KB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


'muy rapido'

In [None]:
normalize_tokens(tweet_tokenizer.tokenize(tweet))

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

#### Uniform normalization principle

In [None]:
query = 'TOO fast TOO furious'
tokenized_query = tweet_tokenizer.tokenize(query)
normalized_query = normalize_tokens(tokenized_query)
# normalized_query = tokenized_query
normalized_query

['too', 'fast', 'too', 'furious']

In [None]:
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
# normalized_tweet = normalize_tokens(tweet.split())
normalized_tweet

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [None]:
common_words = set(normalized_query).intersection(normalized_tweet)
print(common_words)
print(len(common_words), "common word(s)")

{'too', 'fast'}
2 common word(s)


#### Stemming / Lemmatization


In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer


In [None]:
stemmer = PorterStemmer()

[stemmer.stem(t) for t in normalized_tweet]

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgiv',
 'too',
 'easili',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [None]:
stemmer = nltk.LancasterStemmer() # is prone to overstemming
[stemmer.stem(t) for t in normalized_tweet]


['rt',
 'i',
 'fal',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forg',
 'too',
 'easy',
 'and',
 'car',
 'too',
 'much',
 '...',
 ':(']

In [None]:
stemmer = SnowballStemmer(language='english') # Porter2

[stemmer.stem(t) for t in normalized_tweet]

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgiv',
 'too',
 'easili',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [None]:
# Stem a handful of sample words, one result per line, to show where the
# current stemmer succeeds and where it does not.
sample_words = ("running", "runs", "ran", "darling", "are", "bring", "being", "Charles")
for word in sample_words:
    print(stemmer.stem(word))


run
run
ran
darl
are
bring
be
charl


In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

[lemmatizer.lemmatize(t) for t in normalized_tweet]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [None]:
nltk.download('averaged_perceptron_tagger')


tagged_tweet = nltk.pos_tag(normalized_tweet) # More on this next lab...
print(tagged_tweet)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('rt', 'NN'), ('i', 'NN'), ('fall', 'VBP'), ('too', 'RB'), ('fast', 'JJ'), ('crash', 'NN'), ('too', 'RB'), ('hard', 'JJ'), ('forgive', 'JJ'), ('too', 'RB'), ('easily', 'RB'), ('and', 'CC'), ('care', 'VB'), ('too', 'RB'), ('much', 'JJ'), ('...', ':'), (':(', 'NN')]


In [None]:
from nltk.corpus import wordnet as wn
tag_map = {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV, 'N': wn.NOUN}
def get_lemmas(tokenized_text):
    """Lemmatize tokens using their part-of-speech tags.

    The tokens are tagged with ``nltk.pos_tag``; the first letter of each tag
    is looked up in ``tag_map`` to pick the WordNet part of speech (falling
    back to ``wn.NOUN`` when the letter is not in the map), and the
    module-level ``lemmatizer`` is called with that part of speech.

    Args:
        tokenized_text: List of token strings.

    Returns:
        List of lemmas, one per input token, in the same order.
    """
    lemmas = []
    for word, tag in nltk.pos_tag(tokenized_text):
        wordnet_pos = tag_map.get(tag[0], wn.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    return lemmas


In [None]:
query = "the fastest!"
normalized_query = normalize_tokens(tweet_tokenizer.tokenize(query))
print(normalized_query)

['the', 'fastest']


In [None]:
lemmatized_tweet = get_lemmas(normalized_tweet)
lemmatized_query = get_lemmas(normalized_query)
print(lemmatized_tweet)
print(lemmatized_query)


['rt', 'i', 'fall', 'too', 'fast', 'crash', 'too', 'hard', 'forgive', 'too', 'easily', 'and', 'care', 'too', 'much', '...', ':(']
['the', 'fast']


In [None]:
tweet = "I am so fast, I am the fastest!"
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
normalized_tweet


['i', 'am', 'so', 'fast', 'i', 'am', 'the', 'fastest']

In [None]:
[lemmatizer.lemmatize(t) for t in normalized_tweet]


['i', 'am', 'so', 'fast', 'i', 'am', 'the', 'fastest']

In [None]:
get_lemmas(normalized_tweet)

['i', 'be', 'so', 'fast', 'i', 'be', 'the', 'fast']

In [None]:
# NOTE(review): `lemmatized_tweet` has not been reassigned since `tweet` was
# redefined a few cells above, so this still compares the query with the
# lemmas of the earlier tweet -- confirm that is intended, or recompute it
# with get_lemmas(normalized_tweet) first.
print("Common words:", set(lemmatized_tweet).intersection(set(lemmatized_query)))

Common words: {'fast'}


#### Stopwords

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
blacklist_words = stopwords.words('english') + ['rt']

In [None]:
cleaned_tweet = [t for t in normalized_tweet if t not in blacklist_words]
print(cleaned_tweet)

['fast', 'fastest']


#### Vocabulary

In [None]:
from collections import Counter

Counter(get_lemmas(normalized_tweet)).most_common(5)

[('i', 2), ('be', 2), ('fast', 2), ('so', 1), ('the', 1)]

In [None]:
tweet = "I am so fast, I am the fastest!"
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
lemmatized_tweet = get_lemmas(normalized_tweet)
print(lemmatized_tweet)

['i', 'be', 'so', 'fast', 'i', 'be', 'the', 'fast']


In [None]:
print(Counter(normalized_tweet))
print(Counter(lemmatized_tweet))

Counter({'i': 2, 'am': 2, 'so': 1, 'fast': 1, 'the': 1, 'fastest': 1})
Counter({'i': 2, 'be': 2, 'fast': 2, 'so': 1, 'the': 1})


#### Sentence segmentation

In [None]:
query = "I am too fast. I am too furious."

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
sent_tokenize(query)

['I am too fast.', 'I am too furious.']

In [None]:
# Use the public sentence-tokenizer API and let NLTK locate the Spanish Punkt
# model itself, instead of loading a pickle through a hard-coded path that is
# tied to the internal layout of the nltk_data directory.
spanish_query = 'Soy muy rápido! Estoy muy furioso!'
nltk.sent_tokenize(spanish_query, language='spanish')

['Soy muy rápido!', 'Estoy muy furioso!']

In [None]:
sent_tokenize("J.K. Rowling is rich. I am not as rich as J.K.")

['J.K. Rowling is rich.', 'I am not as rich as J.K.']

In [None]:
from nltk.tokenize import PunktSentenceTokenizer
PunktSentenceTokenizer??

#### Numeral conversion

In [None]:
!pip install word2number
!pip install num2words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5582 sha256=0af1bb44a5ddd3f08db33c846318b132b0af2700cd9e3d281259bb4041ebd884
  Stored in directory: /root/.cache/pip/wheels/cb/f3/5a/d88198fdeb46781ddd7e7f2653061af83e7adb2a076d8886d6
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting num2words
  Downloading num2words-0.5.12-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 KB[0m [31m6.9 MB/s[0m eta [36m0:

In [None]:
import word2number
from word2number import w2n
w2n.word_to_num("eleven")


11

In [None]:
w2n.word_to_num("twenty three")

23

In [None]:
from num2words import num2words

In [None]:
num2words(12)

'twelve'

In [None]:
num2words(101)


'one hundred and one'

In [None]:
num2words(2020)

'two thousand and twenty'

In [None]:
w2n.word_to_num("Twelve o'clock!")

12

### Exercise (1p)


Find a recent news article online.
Load it into a Python variable (paste it in manually or read it from a file).

Write a function that normalizes the text and splits it into tokens. Add flags to customize the different preprocessing choices (which stemmer/lemmatizer to use, whether to lowercase, whether to convert numbers, whether to remove stopwords, ...).

Store the vocabulary of unique tokens found in the text.

Compare the number of unique tokens ("types") with different preprocessing settings.

In [None]:
!pip install num2words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting num2words
  Downloading num2words-0.5.12-py3-none-any.whl (125 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/125.2 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6.2
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13723 sha256=caf650518d5b07d2f3148065a2851bef4a9ace15653ce3186e67aa4b86ad98f2
  Stored in directory: /root/.cache/pip/wheels/70/4a/46/1309fc853b8d395e60bafaf1b6df7845bdd82c95fd59dd8d2b
Successfully built docopt
Installing collected packages: docopt, num2words
Successfully insta

In [None]:
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from num2words import num2words

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
text = "sea turtle, any of seven species of marine turtles belonging to the families Dermochelyidae (leatherback sea turtles) and Cheloniidae (green turtles, flatback sea turtles, loggerhead sea turtles, hawksbills, and ridleys). Both families are highly aquatic, and most species only appear on coastal beaches for egg laying; however, the green turtle (Chelonia mydas) occasionally basks in terrestrial environments. Adult sea turtles are mainly denizens of tropical and subtropical seas, but the juveniles of both families occur naturally in more temperate waters.Dermochelyids and cheloniids are distantly related; their divergence from one another took place between 100 million and 150 million years ago. Nevertheless, both groups have streamlined shells, forelimbs modified as flippers that propel their bodies through the water, figure-eight swimming strokes, and large, fully webbed hind feet as rudders. Cheloniids are hard-shelled sea turtles with a bony carapace (top shell) and plastron (bottom shell) with epidermal scutes (scales). In contrast, the leatherback shell of dermochelyids has a greatly reduced bony architecture, and the bones are less firmly articulated; scutes appear in hatchlings, but they are quickly shed, so the bony shell is covered with a thick, leathery skin. Size varies greatly among the seven species; however, commonalities exist in diet and habitat. With some exception, most sea turtles are carnivorous and prefer warm, coastal marine environments. The leatherback sea turtle (Dermochelys coriacea) inhabits pelagic (open ocean) environments. Apparently following the blooms of its jellyfish prey, it moves widely throughout the oceans. The shell lengths of few individuals exceed 1.6 metres (5 feet), although some reportedly reach 2.4 metres (8 feet). Adult and juvenile olive ridleys (Lepidochelys olivacea) are also largely pelagic, but they are known to frequent coastal regions such as bays and estuaries. 
The olive ridley and its relative, the Kemp’s ridley sea turtle (L. kempii), are small with wide rounded shells. As adults, both species have shells about 58–78 cm (23–31 inches) long. Leatherbacks and ridleys are largely carnivorous and consume a wide variety of crustaceans and mollusks. Loggerhead (Caretta caretta) and green (Chelonia mydas) sea turtles have adult shell lengths between 0.9 and 1.2 metres (3 and 4 feet) long. The loggerhead is carnivorous and prefers coastal marine environments. It has the proportionately largest head of the sea turtles; this feature may be an adaptation that increases its jaw strength in order to crush the shells of large mollusks such as whelks. The green turtle is found in warm coastal waters around the world; however, unlike other sea turtles, it is predominantly herbivorous and feeds on algae or marine grasses. The hawksbill sea turtle (Eretmochelys imbricata) is largely tropical and common in coral reef habitats, where it feeds on sponges and a variety of other invertebrates. The flatback sea turtle (Natator depressa) occurs in the seas between Australia and New Guinea; it also feeds on a variety of invertebrates. The shells of adults of both species range from 90 to 100 cm (35 to 39 inches)."

In [None]:
len(text.split())

488

In [None]:
def preprocess(text, remove_punctuation=True, lowercase=True, numbers_to_words_or_remove=None, remove_stopwords=False, stemmer=None, lemmatizer=None):
  """Normalize ``text``, tokenize it and return the set of unique tokens.

  The steps run in the order of the arguments below; each is optional.

  Args:
    text: Raw input string.
    remove_punctuation: If true, delete every character listed in
      ``string.punctuation`` before tokenizing.
    lowercase: If true, lowercase the text.
    numbers_to_words_or_remove: ``"remove"`` deletes every run of digits,
      ``"words"`` spells every run of digits out with ``num2words``; any
      other value leaves numbers untouched.
    remove_stopwords: If true, drop NLTK's English stopwords.
    stemmer: Falsy to skip stemming. An object with a ``stem`` method is used
      as given; any other truthy value (e.g. ``True``) selects
      ``PorterStemmer``.
    lemmatizer: Falsy to skip lemmatization. An object with a ``lemmatize``
      method is used as given; any other truthy value (e.g. ``True``)
      selects ``WordNetLemmatizer``.

  Returns:
    The vocabulary of the processed text as a ``set`` of token strings.
  """
  if remove_punctuation:
    # Characters are deleted, not replaced by spaces, so "figure-eight"
    # becomes "figureeight" and "1.6" becomes "16".
    text = text.translate(str.maketrans('', '', string.punctuation))

  if lowercase:
    text = text.lower()

  if numbers_to_words_or_remove == "remove":
    text = re.sub(r'\d+', '', text)
  elif numbers_to_words_or_remove == "words":
    # Substitute each match in place. A global str.replace() per number would
    # also rewrite that digit string inside longer numbers handled later
    # (e.g. the "5" inside "58").
    text = re.sub(r'\d+', lambda match: num2words(int(match.group())), text)

  tokens = word_tokenize(text)

  if remove_stopwords:
    stop_words = set(stopwords.words('english'))
    # Compare in lowercase so that capitalized stopwords ("The") are also
    # removed when lowercase=False.
    tokens = [token for token in tokens if token.lower() not in stop_words]

  if stemmer:
    # Honor a caller-supplied stemmer; plain flags fall back to Porter.
    active_stemmer = stemmer if hasattr(stemmer, 'stem') else PorterStemmer()
    tokens = [active_stemmer.stem(token) for token in tokens]

  if lemmatizer:
    # Honor a caller-supplied lemmatizer; plain flags fall back to WordNet.
    active_lemmatizer = lemmatizer if hasattr(lemmatizer, 'lemmatize') else WordNetLemmatizer()
    tokens = [active_lemmatizer.lemmatize(token) for token in tokens]

  return set(tokens)



In [None]:
t1 = preprocess(text, remove_punctuation=True, lowercase=True,  numbers_to_words_or_remove='remove', remove_stopwords=False, stemmer=None, lemmatizer=None)
t2 = preprocess(text, remove_punctuation=True, lowercase=True,  numbers_to_words_or_remove='words', remove_stopwords=False, stemmer=None, lemmatizer=None)
t3 = preprocess(text, remove_punctuation=True, lowercase=True,  numbers_to_words_or_remove='remove', remove_stopwords=True, stemmer=None, lemmatizer=None)
t4 = preprocess(text, remove_punctuation=True, lowercase=True,  numbers_to_words_or_remove='remove', remove_stopwords=True, stemmer=True, lemmatizer=None)
t5 = preprocess(text, remove_punctuation=True, lowercase=True,  numbers_to_words_or_remove='remove', remove_stopwords=True, stemmer=None, lemmatizer=True)

In [None]:
print("Basic preprocessing without removing stopwords:", len(t1))
print("Basic preprocessing without removing stopwords but replacing numbers with words:", len(t2))
print("Preprocessing and removing stopwords:", len(t3))
print("Removing stopwords and stemmer:", len(t4))
print("Removing stopwords and lemmatizer:", len(t5))

Basic preprocessing without removing stopwords 240
Basic preprocessing without removing stopwords but replacing numbers with words 254
Preprocessing and removing stopwords 200
Removing stopwords and stemmer 184
Removing stopwords and lemmatizer 189
