In this notebook we will demonstrate how to perform tokenization, stemming, lemmatization and POS tagging using libraries like [spaCy](https://spacy.io/) and [NLTK](https://www.nltk.org/).

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import spacy

# Fetch the NLTK data packages this notebook relies on: the Punkt tokenizer
# tables and the English averaged-perceptron POS tagger model (presumably the
# resources behind sent_tokenize/word_tokenize and pos_tag used further down).
# nltk.download() is a no-op when the package is already up to date.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

# Tokenizacja

In [None]:
# Sample text used by the cells that follow
txt = "Need to finalize the demo corpus which will be used for this notebook and it should be done soon !!. It should be done by the ending of this month. But will it? This notebook has been run 4 times !!"


# Sentence and word tokenization with spaCy: load the small English pipeline
# and run it over the sample text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(txt)

# doc.sents yields the detected sentences; keep only their raw text.
sentences_spacy = [sentence.text for sentence in doc.sents]
print(f"Tokenizacja zdań (spaCy): {sentences_spacy}")

# Iterating over the Doc itself yields the individual tokens.
words_spacy = [tok.text for tok in doc]
print(f"Tokenizacja słów (spaCy): {words_spacy}")


Tokenizacja zdań (spaCy): ['Need to finalize the demo corpus which will be used for this notebook and it should be done soon !!.', 'It should be done by the ending of this month.', 'But will it?', 'This notebook has been run 4 times !!']
Tokenizacja słów (spaCy): ['Need', 'to', 'finalize', 'the', 'demo', 'corpus', 'which', 'will', 'be', 'used', 'for', 'this', 'notebook', 'and', 'it', 'should', 'be', 'done', 'soon', '!', '!', '.', 'It', 'should', 'be', 'done', 'by', 'the', 'ending', 'of', 'this', 'month', '.', 'But', 'will', 'it', '?', 'This', 'notebook', 'has', 'been', 'run', '4', 'times', '!', '!']


In [None]:

# Sentence tokenization with NLTK: header and result on separate lines
sentences_nltk = sent_tokenize(txt)
print("Tokenizacja zdań (NLTK):", sentences_nltk, sep="\n")

# Word tokenization with NLTK, printed the same way
words_nltk = word_tokenize(txt)
print("Tokenizacja słów (NLTK):", words_nltk, sep="\n")

Tokenizacja zdań (NLTK):
['Need to finalize the demo corpus which will be used for this notebook and it should be done soon !', '!.', 'It should be done by the ending of this month.', 'But will it?', 'This notebook has been run 4 times !', '!']
Tokenizacja słów (NLTK):
['Need', 'to', 'finalize', 'the', 'demo', 'corpus', 'which', 'will', 'be', 'used', 'for', 'this', 'notebook', 'and', 'it', 'should', 'be', 'done', 'soon', '!', '!', '.', 'It', 'should', 'be', 'done', 'by', 'the', 'ending', 'of', 'this', 'month', '.', 'But', 'will', 'it', '?', 'This', 'notebook', 'has', 'been', 'run', '4', 'times', '!', '!']


# Tagowanie POS

In [None]:
# POS tagging with spaCy: pair each token's text with its part-of-speech
# label (labels such as VERB, NOUN, PUNCT).
pos_tags_spacy = [(tok.text, tok.pos_) for tok in doc]
print(f"Tagowanie POS: {pos_tags_spacy}")

Tagowanie POS: [('Need', 'VERB'), ('to', 'PART'), ('finalize', 'VERB'), ('the', 'DET'), ('demo', 'NOUN'), ('corpus', 'X'), ('which', 'PRON'), ('will', 'AUX'), ('be', 'AUX'), ('used', 'VERB'), ('for', 'ADP'), ('this', 'DET'), ('notebook', 'NOUN'), ('and', 'CCONJ'), ('it', 'PRON'), ('should', 'AUX'), ('be', 'AUX'), ('done', 'VERB'), ('soon', 'ADV'), ('!', 'PUNCT'), ('!', 'PUNCT'), ('.', 'PUNCT'), ('It', 'PRON'), ('should', 'AUX'), ('be', 'AUX'), ('done', 'VERB'), ('by', 'ADP'), ('the', 'DET'), ('ending', 'NOUN'), ('of', 'ADP'), ('this', 'DET'), ('month', 'NOUN'), ('.', 'PUNCT'), ('But', 'CCONJ'), ('will', 'AUX'), ('it', 'PRON'), ('?', 'PUNCT'), ('This', 'DET'), ('notebook', 'NOUN'), ('has', 'AUX'), ('been', 'AUX'), ('run', 'VERB'), ('4', 'NUM'), ('times', 'NOUN'), ('!', 'PUNCT'), ('!', 'PUNCT')]


In [None]:
# POS tagging with NLTK on the NLTK word tokens; the result is a list of
# (token, tag) pairs using tags such as NN, VB, DT.
pos_tags_nltk = pos_tag(words_nltk)
print(f"POS tags: {pos_tags_nltk}")


POS tags: [('Need', 'NN'), ('to', 'TO'), ('finalize', 'VB'), ('the', 'DT'), ('demo', 'NN'), ('corpus', 'NN'), ('which', 'WDT'), ('will', 'MD'), ('be', 'VB'), ('used', 'VBN'), ('for', 'IN'), ('this', 'DT'), ('notebook', 'NN'), ('and', 'CC'), ('it', 'PRP'), ('should', 'MD'), ('be', 'VB'), ('done', 'VBN'), ('soon', 'RB'), ('!', '.'), ('!', '.'), ('.', '.'), ('It', 'PRP'), ('should', 'MD'), ('be', 'VB'), ('done', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('ending', 'VBG'), ('of', 'IN'), ('this', 'DT'), ('month', 'NN'), ('.', '.'), ('But', 'CC'), ('will', 'MD'), ('it', 'PRP'), ('?', '.'), ('This', 'DT'), ('notebook', 'NN'), ('has', 'VBZ'), ('been', 'VBN'), ('run', 'VBN'), ('4', 'CD'), ('times', 'NNS'), ('!', '.'), ('!', '.')]


In [None]:
# upenn_tagset() reads its tag descriptions from an NLTK data package that no
# other cell in this notebook downloads, so fetch it first to keep the cell
# working on a fresh kernel.
# NOTE(review): the package is named 'tagsets_json' in recent NLTK releases
# (the ones that also use 'punkt_tab'); older releases call it 'tagsets' —
# confirm against the installed NLTK version.
nltk.download('tagsets_json')

# Print the description of every tag in the Penn Treebank tagset, i.e. the
# tag family produced by nltk.pos_tag (NN, VB, DT, ...).
nltk.help.upenn_tagset()

# Tematyzacja (Stemming)

In [None]:
# Stemming with NLTK's Porter stemmer, applied to the spaCy word tokens.
# Exercise: try out how other stemmers behave on the same input.
stemmer = PorterStemmer()
stems_nltk = [stemmer.stem(word) for word in words_spacy]
# Label fixed: the original printed "Steaming" instead of "Stemming".
print("Stemming (NLTK):", stems_nltk)

Steaming (NLTK): ['need', 'to', 'final', 'the', 'demo', 'corpu', 'which', 'will', 'be', 'use', 'for', 'thi', 'notebook', 'and', 'it', 'should', 'be', 'done', 'soon', '!', '!', '.', 'it', 'should', 'be', 'done', 'by', 'the', 'end', 'of', 'thi', 'month', '.', 'but', 'will', 'it', '?', 'thi', 'notebook', 'ha', 'been', 'run', '4', 'time', '!', '!']


# Lematyzacja

In [None]:
# Lemmatization with spaCy: collect the lemma of every token in the parsed text
lemmas_spacy = [tok.lemma_ for tok in doc]
print(f"Lematyzacja (spaCy): {lemmas_spacy}")


Lematyzacja (spaCy): ['need', 'to', 'finalize', 'the', 'demo', 'corpus', 'which', 'will', 'be', 'use', 'for', 'this', 'notebook', 'and', 'it', 'should', 'be', 'do', 'soon', '!', '!', '.', 'it', 'should', 'be', 'do', 'by', 'the', 'ending', 'of', 'this', 'month', '.', 'but', 'will', 'it', '?', 'this', 'notebook', 'have', 'be', 'run', '4', 'time', '!', '!']


In [None]:
# Lemmatize a second, short example sentence.
# The spaCy pipeline `nlp` loaded in the tokenization cell is reused instead of
# loading the model again, and the result is stored under its own name so the
# `doc` built from `txt` is not overwritten (re-running earlier cells would
# otherwise silently operate on this sentence).
doc_example = nlp("The children are playing outside.")
print([token.lemma_ for token in doc_example])

['the', 'child', 'be', 'play', 'outside', '.']


# Usuwanie Słów Stopu

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [None]:
# Number of entries in scikit-learn's built-in English stop-word collection
len(ENGLISH_STOP_WORDS)

318

In [None]:
# Make sure the NLTK stop-word corpus is available, then build a set from the
# English list for fast membership checks.
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

# Toy sentence: keep only the words that are not stop words.
words = ["this", "is", "a", "great", "movie"]
filtered_words = [word for word in words if word not in stop_words]
print(filtered_words)  # Expected result: ['great', 'movie']

['great', 'movie']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Usuwanie Cyfr, Interpunkcji, Zmiana na małe litery

In [None]:
# Normalise case: replace txt with its lower-cased version
txt = str.lower(txt)
print(txt)

need to finalize the demo corpus which will be used for this notebook and it should be done soon !!. it should be done by the ending of this month. but will it? this notebook has been run 4 times !!


In [None]:
# Strip every run of digits from txt. Whitespace around a removed number is
# left untouched, so a double space can remain where the number used to be.
import re

digit_runs = re.compile(r'\d+')
txt = digit_runs.sub('', txt)
print(txt)

need to finalize the demo corpus which will be used for this notebook and it should be done soon !!. it should be done by the ending of this month. but will it? this notebook has been run  times !!


In [None]:
# Delete all ASCII punctuation characters from txt via a translation table
# whose third argument lists the characters to drop.
import string

drop_punctuation = str.maketrans('', '', string.punctuation)
txt = txt.translate(drop_punctuation)
print(txt)

need to finalize the demo corpus which will be used for this notebook and it should be done soon  it should be done by the ending of this month but will it this notebook has been run  times 


# Frequency

In [None]:
# Frequency distribution over the NLTK word tokens; punctuation tokens are
# counted as well because words_nltk still contains them.
from nltk.probability import FreqDist
fd_nltk = FreqDist(words_nltk)
# Bare last expression so the notebook displays the distribution
fd_nltk

FreqDist({'!': 4, 'be': 3, 'the': 2, 'will': 2, 'this': 2, 'notebook': 2, 'it': 2, 'should': 2, 'done': 2, '.': 2, ...})

In [None]:
# Same frequency count using only the standard library, this time over the
# spaCy tokens. Counting is case-sensitive ('It' and 'it' are separate keys).
from collections import Counter

fd = Counter()
fd.update(words_spacy)
fd

Counter({'Need': 1,
         'to': 1,
         'finalize': 1,
         'the': 2,
         'demo': 1,
         'corpus': 1,
         'which': 1,
         'will': 2,
         'be': 3,
         'used': 1,
         'for': 1,
         'this': 2,
         'notebook': 2,
         'and': 1,
         'it': 2,
         'should': 2,
         'done': 2,
         'soon': 1,
         '!': 4,
         '.': 2,
         'It': 1,
         'by': 1,
         'ending': 1,
         'of': 1,
         'month': 1,
         'But': 1,
         '?': 1,
         'This': 1,
         'has': 1,
         'been': 1,
         'run': 1,
         '4': 1,
         'times': 1})

# TF-IDF i podobieństwo cosinusowe

In [None]:
# TfidfVectorizer was used here without ever being imported in the notebook
# (only ENGLISH_STOP_WORDS is imported from this module), which raises a
# NameError on a fresh kernel. Import it in the cell that needs it.
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the NLTK word tokens. fit_transform treats every element of
# the list as one document, so each token becomes its own row of the matrix.
# NOTE(review): if sentence-level documents were intended, sentences_nltk
# would be the input to use — confirm the intent before changing it, since
# the similarity cells below depend on this matrix.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(words_nltk)
# Bare last expression so the notebook shows the sparse matrix summary
tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 38 stored elements and shape (46, 27)>

In [None]:
# Show the vocabulary learned by the fitted vectorizer (one entry per matrix column)
vocabulary_terms = vectorizer.get_feature_names_out()
print("\nSłownictwo:", vocabulary_terms, sep="\n")


Słownictwo:
['and' 'be' 'been' 'but' 'by' 'corpus' 'demo' 'done' 'ending' 'finalize'
 'for' 'has' 'it' 'month' 'need' 'notebook' 'of' 'run' 'should' 'soon'
 'the' 'this' 'times' 'to' 'used' 'which' 'will']


In [None]:
# Convert the sparse TF-IDF matrix to a dense array and print it under a header
dense_matrix = tfidf.toarray()
print("\nMacierz TF-IDF:", dense_matrix, sep="\n")


Macierz TF-IDF:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Free-text query to compare against the fitted TF-IDF matrix
query = "primo sort corpus run"
# Project the query into the fitted vectorizer's feature space; transform()
# expects an iterable of documents, hence the one-element list.
query_vector = vectorizer.transform([query])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every row of the TF-IDF matrix and the query
# vector; the result has one column, so flatten it to a 1-D array of scores.
similarity_column = cosine_similarity(tfidf, query_vector)
similarities = similarity_column.flatten()
similarities

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.70710678, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.70710678, 0.        , 0.        , 0.        ,
       0.        ])