
In this notebook we're going to introduce text processing in NLP (Natural Language Processing). This will include:
* Tokenization
* Stop words removal
* Stemming and Lemmatization
* Part-of-Speech Tagging

We will be using several different libraries:

NLTK, TextBlob, spaCy, and Gensim are the main well-known libraries for basic NLP tasks.

# Tokenization

Tokenization is the process of breaking down text into smaller units called tokens, which could be words, phrases, or characters. It's the first step in most NLP tasks, allowing further analysis on individual components of the text.

In [None]:
import nltk
from nltk.tokenize import (
    word_tokenize,
    sent_tokenize,
    TreebankWordTokenizer,
    wordpunct_tokenize,
    TweetTokenizer,
    MWETokenizer,
)

# word_tokenize / sent_tokenize rely on the Punkt sentence models, which are
# not bundled with the nltk package itself. Fetch them up front so the cells
# below run on a fresh environment. Recent NLTK releases look the resource up
# as "punkt_tab", older ones as "punkt"; requesting both keeps the notebook
# working across versions.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)

In [None]:
# English sample sentence, reused by the tokenization, stop-word and POS-tagging
# cells below. It contains contractions ("I'm", "can't") and a two-word place name.
sentence_en = "Hello Where are you? I'm in San Francisco. I can't come." 

# French sample sentence, used to show how each tool behaves on non-English text
sentence_fr = "Bonjour Où es-tu? Je suis à Saint Etienne. Je ne peux pas venir."

In [None]:
# Word-level tokenization with NLTK's word_tokenize, applied to each sample sentence in turn
for _sentence in (sentence_en, sentence_fr):
    print("word_tokenize: ", word_tokenize(_sentence))

In [None]:
# Sentence-level tokenization: split each sample text into its sentences
for _sentence in (sentence_en, sentence_fr):
    print("sent_tokenize: ", sent_tokenize(_sentence))

In [None]:
# Word tokenization with TreebankWordTokenizer; one instance is built and reused for both sentences
tokenizer = TreebankWordTokenizer()
for _sentence in (sentence_en, sentence_fr):
    print("TreebankWordTokenizer: ", tokenizer.tokenize(_sentence))

In [None]:
# Word tokenization with wordpunct_tokenize on both sample sentences
for _sentence in (sentence_en, sentence_fr):
    print("wordpunct_tokenize: ", wordpunct_tokenize(_sentence))

In [None]:
# Word tokenization with TweetTokenizer. match_phone_numbers=True is passed explicitly;
# the extra sample below is there to show its effect on a phone number.
Tweet_tokenizer = TweetTokenizer(match_phone_numbers=True)
another_sentence_phone = "Call me at 123-456-7890"

for _sentence in (sentence_en, sentence_fr, another_sentence_phone):
    print("TweetTokenizer: ", Tweet_tokenizer.tokenize(_sentence))

In [None]:
# Two multi-word-expression tokenizers: one with no expressions registered,
# and one configured with three token pairs and "-" as the separator.
MWEtokenizer = MWETokenizer()
MWEtokenizer_mwes = MWETokenizer(
    mwes=[('San', 'Francisco'), ('I', "'m"), ('Saint', "Etienne")],
    separator="-",
)

# Both tokenizers operate on an already word-tokenized sentence
for _sentence in (sentence_en, sentence_fr):
    print("MWETokenizer without MWE: ", MWEtokenizer.tokenize(word_tokenize(_sentence)))
    print("MWETokenizer with MWE: ", MWEtokenizer_mwes.tokenize(word_tokenize(_sentence)))

## TextBlob

In [None]:
!pip install -U textblob
!python3 -m textblob.download_corpora

In [None]:
from textblob import TextBlob

In [None]:
# Tokenize the English sentence: the word tokens are read from the blob's `.words` attribute
blob = TextBlob(sentence_en)
print("TextBlob: ", blob.words)

In [None]:
# Tokenize the French sentence.
# Note: this rebinds `blob`, so the English blob built from sentence_en is no longer reachable by that name.
blob = TextBlob(sentence_fr)
print("TextBlob for French: ", blob.words)

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import spacy

In [None]:
# Load the English pipeline downloaded above; `nlp` is reused by the
# tokenization, lemmatization and POS-tagging cells further down
nlp = spacy.load("en_core_web_sm")

In [None]:
# Example input texts in English (text2 embeds a URL, also used later by the custom tokenizer)
text1 = "Hello! How are you?"
text2 = "Let's visit www.example.com!"

# Example input text in French
text3 = "Bonjour! Comment ça va?"

In [None]:
# Run each sample text through the spaCy pipeline loaded earlier (`nlp`)
# and list the text of every resulting token
for _text in (text1, text2, text3):
    print("spaCy tokenizer: ", [token.text for token in nlp(_text)])

In [None]:
# Tokenize a batch of texts through nlp.pipe and collect one token list per text
texts = ["Hello!", "How are you?"]
docs = nlp.pipe(texts)

tokens_pipe = []
for _doc in docs:
    tokens_pipe.append([token.text for token in _doc])

print("Pipe Tokenizer:", tokens_pipe)

In [None]:
# Custom Tokenizer in spaCy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

In [None]:
# Create a blank Tokenizer with just the English vocab: no further arguments are
# passed, so its output can be compared with the full pipeline's tokenizer used above
custom_nlp = English()
custom_tokenizer = Tokenizer(custom_nlp.vocab)

In [None]:
# Example custom tokenization: run the bare tokenizer on the URL sample (text2)
# and collect the text of each token for display
doc2 = custom_tokenizer(text2)
tokens2 = [token.text for token in doc2]
print("Custom Tokenizer:", tokens2)

## Gensim

In [None]:
# Tokenization with gensim
# NOTE(review): gensim is imported here, but the "pip install gensim" cell only appears
# later, in the Stop Words section. On an environment without gensim this import fails;
# confirm gensim is preinstalled or move the install cell above this one.
from gensim.utils import tokenize

# English sentence (the result is wrapped in list() before printing)
tokens = tokenize(sentence_en)
print("gensim tokenizer in English: ", list(tokens))

# French sentence, accents kept
tokens = tokenize(sentence_fr)
print("gensim tokenizer in French: ", list(tokens))

# French sentence with accents removed (deacc=True)
tokens = tokenize(sentence_fr, deacc=True)
print("gensim tokenizer in French without accents: ", list(tokens))

# Stop Words

Stopwords are common words (like "and," "the," "in") that carry little semantic value in text analysis. Removing them helps reduce noise and focus on the more meaningful words in a text.

## NLTK

In [None]:
# Download the NLTK stop-word corpus, needed by nltk.corpus.stopwords in the next cells
import nltk
nltk.download('stopwords')

In [None]:
# import stopwords
from nltk.corpus import stopwords

In [None]:
# Inspect NLTK's English stop-word list and its size (the list is loaded once and reused)
_english_stopwords = stopwords.words('english')
print("List of stopwords in English: ", _english_stopwords)
print("Number of stopwords in English: ", len(_english_stopwords))

In [None]:
# Same inspection for another language: NLTK's French stop-word list and its size
_french_stopwords = stopwords.words('french')
print("List of stopwords in French: ", _french_stopwords)
print("Number of stopwords in French: ", len(_french_stopwords))

In [None]:
# Filter the stop words out of both sample sentences.
# Each stop-word list is loaded once, outside the comprehension, and stored as a set:
# calling stopwords.words(...) inside the condition would re-evaluate it for every
# token, and a set gives constant-time membership checks.
_stopwords_en = set(stopwords.words('english'))
_stopwords_fr = set(stopwords.words('french'))

# Tokens are lower-cased only for the comparison; the kept tokens retain their original case
filtered_sentence_en = [word for word in word_tokenize(sentence_en) if word.lower() not in _stopwords_en]
print("Filtered sentence in English: ", filtered_sentence_en)

filtered_sentence_fr = [word for word in word_tokenize(sentence_fr) if word.lower() not in _stopwords_fr]
print("Filtered sentence in French: ", filtered_sentence_fr)

## Spacy

In [None]:
from spacy.lang.en import stop_words

In [None]:
# Read spaCy's English stop words directly from the language module (no pipeline needed)
stopwords_spacy = stop_words.STOP_WORDS
print("List of stopwords in English using spaCy: ", stopwords_spacy)
print("Number of stopwords in English using spaCy: ", len(stopwords_spacy))

In [None]:
# Alternative route: take the stop words from a loaded pipeline's language defaults.
# The English pipeline is loaded (again) here and bound to `nlp`.
nlp = spacy.load("en_core_web_sm")

# Materialize the stop words as a list; this rebinds `stopwords_spacy`
stopwords_spacy = [*nlp.Defaults.stop_words]
print("List of stopwords in English: ", stopwords_spacy)
print("Number of stopwords in English: ", len(stopwords_spacy))

In [None]:
# Download the French model
!python -m spacy download fr_core_news_md

In [None]:
# Load the French pipeline through its installed package
import fr_core_news_md

nlp_fr = fr_core_news_md.load()

# Collect the French stop words declared in the pipeline's language defaults
stopwords_spacy_fr = [*nlp_fr.Defaults.stop_words]
print("List of stopwords in French: ", stopwords_spacy_fr)
print("Number of stopwords in French: ", len(stopwords_spacy_fr))

In [None]:
# Filter the stop words out of both sample sentences using spaCy's stop-word lists.
# The lists are converted to sets once so each membership check is constant-time
# instead of a linear scan of a list per token.
_stopset_en = set(stopwords_spacy)
_stopset_fr = set(stopwords_spacy_fr)

# Tokens are lower-cased only for the comparison; the kept tokens retain their original case
filtered_sentence_spacy_en = [word for word in word_tokenize(sentence_en) if word.lower() not in _stopset_en]
print("Filtered sentence in English: ", filtered_sentence_spacy_en)

filtered_sentence_spacy_fr = [word for word in word_tokenize(sentence_fr) if word.lower() not in _stopset_fr]
print("Filtered sentence in French: ", filtered_sentence_spacy_fr)

## Gensim

In [None]:
!pip install gensim

In [None]:
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS

In [None]:
# Inspect gensim's built-in stop-word collection (STOPWORDS) and its size

print("List of stopwords in gensim: ", STOPWORDS)
print("Number of stopwords in gensim: ", len(STOPWORDS))

In [None]:
# Use gensim to remove stop words: remove_stopwords takes the raw sentence string directly
# (no prior tokenization); the result is shown as the cell's output
remove_stopwords(sentence_en)

# Stemming and Lemmatization

Lemmatization is the process of converting a word to its base or dictionary form (lemma). For example, "running" becomes "run." Lemmatization helps in reducing the different forms of a word to a single form, which aids in standardizing text for analysis.

Stemming is the process of reducing a word to its root or base form, often by stripping away prefixes or suffixes. Unlike lemmatization, which returns the actual dictionary form of a word, stemming usually results in a word stem that may not be a valid word in the language.

## NLTK

In [None]:
# import different stemmers
from nltk.stem import (
    PorterStemmer,
    LancasterStemmer,
    SnowballStemmer,
)

In [None]:
# Word list for the stemming/lemmatization demos: singular/plural pairs plus a verb and its -ing form
words = ["car", "cars", "bus", "buses", "fly", "flies", "run", "running", "city", "cities"]

In [None]:
# Create one instance of each stemmer; SnowballStemmer is given its language explicitly
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer("english")

In [None]:
# Stem the whole word list with each stemmer so the three outputs can be compared line by line
_stemmers = (
    ("Porter Stemmer: ", porter),
    ("Lancaster Stemmer: ", lancaster),
    ("Snowball Stemmer: ", snowball),
)
for _label, _stemmer in _stemmers:
    print(_label, [_stemmer.stem(word) for word in words])

## Spacy

In [None]:
# Reuse the `nlp` pipeline loaded in earlier cells: the words are joined into one
# space-separated string, processed as a single text, and each token's lemma is listed
print("Spacy Lemmatization: ", [token.lemma_ for token in nlp(" ".join(words))])

## TextBlob

In [None]:
# Build a new TextBlob from the space-joined word list and lemmatize its words.
# Note: this rebinds `blob`, which no longer refers to either sample sentence afterwards.
blob = TextBlob(" ".join(words))
print("TextBlob Lemmatization: ", blob.words.lemmatize())

# POS Tagging

Part-of-Speech (POS) Tagging is the process of assigning a part of speech (like noun, verb, adjective) to each word in a sentence. It helps in understanding the grammatical structure and the meaning of the text.

## NLTK

In [None]:
# Download the perceptron POS-tagger model used by nltk.pos_tag.
# Recent NLTK releases look the model up as "averaged_perceptron_tagger_eng",
# older ones as "averaged_perceptron_tagger"; fetching both keeps this cell
# working across versions.
for _resource in ("averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"):
    nltk.download(_resource)

In [None]:
# Tag the English sentence: it is word-tokenized first, then passed to nltk.pos_tag;
# the tagged result is shown as the cell's output
nltk.pos_tag(word_tokenize(sentence_en))

In [None]:
# Tag the French sentence with the same call as for English.
# NOTE(review): no language argument is passed, so this presumably uses the default
# (English) tagger and the tags for French words are likely unreliable. Confirm intent.
nltk.pos_tag(word_tokenize(sentence_fr))

In [None]:
# Build a ConditionalFreqDist from the (word, tag) pairs of the English sentence
CFD = nltk.ConditionalFreqDist(nltk.pos_tag(word_tokenize(sentence_en)))
# Show the frequency table. tabulate() prints the table itself and returns None,
# so wrapping it in print() would add a stray "None" line after the table.
CFD.tabulate()

In [None]:
# Print the documentation of the Penn Treebank POS tags.
# The help data is published as "tagsets" for older NLTK releases and as
# "tagsets_json" for recent ones; fetching both keeps the cell working across versions.
for _resource in ("tagsets", "tagsets_json"):
    nltk.download(_resource)
nltk.help.upenn_tagset()

## Spacy

In [None]:
# POS tagging using spaCy: each token is paired with its coarse-grained tag (token.pos_).
# The English sentence goes through the English pipeline (`nlp`) and the French sentence
# through the French pipeline loaded earlier (`nlp_fr`), so each text is tagged by a
# model for its own language.

print("spaCy POS tagging: ", [(token.text, token.pos_) for token in nlp(sentence_en)])
print("spaCy POS tagging: ", [(token.text, token.pos_) for token in nlp_fr(sentence_fr)])

In [None]:
# Some examples of POS tags in spaCy: spacy.explain is called on two tag names and the results are printed
print("List of POS tags in spaCy: ", spacy.explain("ADJ"))
print("List of POS tags in spaCy: ", spacy.explain("DET"))

## TextBlob

In [None]:
# POS tagging of the English sentence with TextBlob.
# A dedicated blob is built here: the shared `blob` variable was last rebound to the
# stemming word list in the lemmatization section, so reusing it would tag
# "car cars bus ..." instead of the English sentence.
blob_en = TextBlob(sentence_en)
print("TextBlob POS tagging: ", blob_en.tags)

In [None]:
# POS tagging of the French sentence with TextBlob.
# NOTE(review): TextBlob is constructed with its defaults here, which presumably means an
# English tagger, so the tags for French words are likely unreliable. Confirm intent.
blob_fr = TextBlob(sentence_fr)
print("TextBlob POS tagging in French: ", blob_fr.tags)