# Introduction to the NLTK library

In [9]:
### 1.2 imports required

In [10]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

In [11]:
nltk.__version__

'3.9.2'

In [12]:
### see what is available to download

In [13]:
# NOTE: called with no arguments, this opens the interactive NLTK Downloader
# prompt (the `Downloader>` menu shown in the output below) and waits for
# input, so the cell blocks "Run All" until a command such as `q` is entered.
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------


Downloader>  q


True

In [14]:
### download the required packages (downloading 'all' takes time!); our image already has everything

In [15]:
# Data packages needed by the rest of this notebook.
# `from nltk.book import *` builds text1..text9 from the gutenberg, genesis,
# inaugural, nps_chat, webtext and treebank corpora, so all six must be
# present (gutenberg and genesis alone are not enough on a fresh machine).
REQUIRED_NLTK_PACKAGES = [
    'punkt_tab',                       # word_tokenize
    'stopwords',                       # stopwords.words('english')
    'wordnet',                         # WordNetLemmatizer and the wordnet corpus
    'averaged_perceptron_tagger_eng',  # pos_tag
    'tagsets_json',                    # nltk.help.upenn_tagset
    'gutenberg',                       # nltk.book: text1, text2, text9
    'genesis',                         # nltk.book: text3
    'inaugural',                       # nltk.book: text4
    'nps_chat',                        # nltk.book: text5
    'webtext',                         # nltk.book: text6, text8
    'treebank',                        # nltk.book: text7
]

# A list (not a generator) is built on purpose: every download is attempted
# even if an earlier one fails. The cell displays True only if all succeeded.
all([nltk.download(package) for package in REQUIRED_NLTK_PACKAGES])

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets_json to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package tagsets_json is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package genesis to /home/jupyter/nltk_data...
[nltk_data]   Package gen

True

In [None]:
# Downloads the whole 'all' collection, which takes a long time. The course
# image is said to ship with the data already, so this cell is normally
# unnecessary.
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/jupyter/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/jupyter/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/jupyter/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/jupyter/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/jupyter/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data] 

In [None]:
### import all text samples

In [None]:
# Star import: the names text1-text5 and FreqDist used in the cells below are
# not imported anywhere else in this notebook, so they come from here.
from nltk.book import *

In [None]:
text1

In [None]:
### 1.3 search

In [None]:
text1.concordance("monstrous")

In [None]:
text2.concordance("affection")

In [None]:
text3.concordance("lived")

In [None]:
### similarity

In [None]:
text1.similar("monstrous")

In [None]:
text2.similar("monstrous")

In [None]:
text2.common_contexts(["monstrous", "very"])

In [None]:
### dispersion plot

In [None]:
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

In [None]:
text3.generate()

In [None]:
len(text3)

In [None]:
len(set(text3))

In [None]:
sorted(set(text3))

In [None]:
len(set(text3)) / len(text3)

In [None]:
text3.count("a")

In [None]:
100 * text4.count('a') / len(text4)

In [None]:
text4[173]

In [None]:
text4.index('awaken')

In [None]:
text4[163:183]

In [None]:
fdist1 = FreqDist(text1)
fdist1

In [None]:
fdist1.plot(50)

In [None]:
# Vocabulary of text1, then the words in it that are longer than 15 characters.
V = set(text1)
long_words = list(filter(lambda word: len(word) > 15, V))
sorted(long_words)

In [None]:
# Words in text5 that are longer than 7 characters and occur more than 7 times.
fdist5 = FreqDist(text5)
sorted(
    word
    for word, count in fdist5.items()
    if len(word) > 7 and count > 7
)

In [None]:
text4.collocations()

## Tokenization

In [None]:
text = 'Data science is a multi-disciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from structured and unstructured data.'

In [None]:
words = word_tokenize(text)
words

## Stop words filtration

In [None]:
stop_words = stopwords.words('english')
stop_words

In [None]:
text = "This is a sample sentence, showing off the stop words filtration."
text

In [None]:
text_tokens = word_tokenize(text)
text_tokens

In [None]:
# Drop stop words from the token list.
# The NLTK English stop-word list is lowercase, so each token is lowercased
# for the comparison only; otherwise a capitalised stop word such as "This"
# would slip through. Kept tokens retain their original casing.
stop_word_set = set(stop_words)  # set lookup instead of scanning the list per token
filtered_text = [w for w in text_tokens if w.lower() not in stop_word_set]
filtered_text

## Stemming and Lemmatization

In [None]:
text = 'list lists listed listing listings'
tokens = word_tokenize(text)

In [None]:
tokens 

In [None]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [None]:
# Apply the Porter stemmer to every token.
list(map(stemmer.stem, tokens))

In [None]:
# Lemmatize every token; no part-of-speech argument is passed to lemmatize().
list(map(lemmatizer.lemmatize, tokens))

## Wordnet

In [None]:
from nltk.corpus import wordnet as wn
wn.synsets('motorcar')

In [None]:
wn.synset('car.n.01').lemma_names()

In [None]:
# Print the lemma names of each hypernym of car.n.01.
# The hypernym list is fetched once and iterated directly, instead of being
# looked up again on every loop iteration through an index.
for hypernym in wn.synset('car.n.01').hypernyms():
    print(hypernym.lemma_names())

In [None]:
# Print the lemma names of each hyponym of car.n.01.
# The hyponym list is fetched once and iterated directly, instead of being
# looked up again on every loop iteration through an index.
for hyponym in wn.synset('car.n.01').hyponyms():
    print(hyponym.lemma_names())

## Part of speech tagging

In [None]:
text = 'Bob is great'

In [None]:
words = word_tokenize(text)

In [None]:
from nltk import pos_tag
pos_tag(words)

In [None]:
from nltk.help import upenn_tagset

In [None]:
upenn_tagset('NNP')

In [None]:
upenn_tagset('VBZ')

In [None]:
upenn_tagset('JJ')