In [1]:
import os
import nltk

In [2]:
import regex as re

In [None]:
# nltk.download('punkt')

In [None]:
# nltk.download('averaged_perceptron_tagger')

In [3]:
# List what is present in the NLTK 'corpora' data directory.
corpora_dir = nltk.data.find('corpora')
print(os.listdir(corpora_dir))

['movie_reviews', 'movie_reviews.zip', 'stopwords', 'stopwords.zip', 'wordnet.zip', 'words', 'words.zip']


#### Tokenization: Breaking down text into words (tokens).

In [4]:
from nltk.tokenize import word_tokenize

In [5]:
# Sample sentence for the tokenization demo; the "..." is there to show how
# punctuation is handled.
text = "Hi... today we will learn about tokenization"

In [9]:
text

'Hi... today we will learn about tokenization'

In [6]:
# Split the sample sentence into word and punctuation tokens.
tokens = word_tokenize(text)

In [7]:
tokens

['Hi', '...', 'today', 'we', 'will', 'learn', 'about', 'tokenization']

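##### Is tokenization just splitting the text on spaces in the string?

No. Splitting on spaces keeps punctuation attached to the word (`'Hi...'`), whereas `word_tokenize` separates it into its own token (`'Hi'`, `'...'`).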

In [8]:
# Naive baseline for comparison: cut the sentence at every single space.
text.split(' ')

['Hi...', 'today', 'we', 'will', 'learn', 'about', 'tokenization']

#### Part-of-Speech Tagging (Noun, Verb, Adverb, Adjective)

In [None]:
#nltk.download('averaged_perceptron_tagger')

In [10]:
# Tag every token with its part of speech; yields (token, tag) pairs.
pos_tags = nltk.pos_tag(tokens)

In [11]:
print(pos_tags)

[('Hi', 'NN'), ('...', ':'), ('today', 'NN'), ('we', 'PRP'), ('will', 'MD'), ('learn', 'VB'), ('about', 'IN'), ('tokenization', 'NN')]


#### Stemming and Lemmatization: Reducing words to their base/root form

In [19]:
#nltk.download('wordnet')

In [12]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [13]:
# Reduce every token to its Porter stem.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, tokens))

In [14]:
stemmed

['hi', '...', 'today', 'we', 'will', 'learn', 'about', 'token']

In [15]:
# Look up the WordNet lemma of every token.
# NOTE(review): no part of speech is passed to lemmatize(); presumably NLTK then
# treats each word as a noun, so verbs/adjectives stay unchanged — confirm.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, tokens))

In [16]:
lemmatized

['Hi', '...', 'today', 'we', 'will', 'learn', 'about', 'tokenization']

#### Named Entity Recognition (NER): Identifying named entities (such as people, organizations, and places) in text.

In [None]:
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

In [None]:
# Example sentence for the NER section.
sentence = "Apple is looking at buying U.K. startup for $1 billion"

In [None]:
# Tokenize and POS-tag the NER example sentence.
# A dedicated name is used instead of rebinding `tokens`: the stopword,
# frequency-distribution and n-gram cells further down still read `tokens` and
# expect the tokens of `text`, so overwriting it here would change their
# results on a top-to-bottom run.
ner_tokens = nltk.word_tokenize(sentence)
tags = nltk.pos_tag(ner_tokens)
# NOTE(review): this section stops at POS tags; no entity chunking step
# (e.g. nltk.ne_chunk) is applied to `tags` — confirm whether it is missing.

In [None]:
tags

#### Stopwords: Common words that are often filtered out in text processing.

* Usage: Removing noise from text data.

In [None]:
# Fetch the stopword corpus only when it is not installed yet, so re-running
# the notebook does not contact the network every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # nltk.data.find raises LookupError when the resource cannot be located.
    nltk.download('stopwords')

In [22]:
from nltk.corpus import stopwords

In [23]:
# Keep the English stopwords in a set so the membership test in the filter
# below is a constant-time lookup.
stop_words = set(stopwords.words('english'))

In [24]:
# Drop stopwords from the token list, keeping each token's original casing.
# The stopword list is lowercase, so the comparison is done on the lowercased
# token; otherwise capitalized stopwords such as "We" or "The" would be kept.
filtered_sentence = [w for w in tokens if w.lower() not in stop_words]

In [26]:
tokens

['Hi', '...', 'today', 'we', 'will', 'learn', 'about', 'tokenization']

In [25]:
filtered_sentence

['Hi', '...', 'today', 'learn', 'tokenization']

#### Frequency Distributions: Counting word frequencies.

* Usage: Analyzing word frequency patterns.

In [33]:
from nltk.probability import FreqDist

In [36]:
tokens

['Hi', '...', 'today', 'we', 'will', 'learn', 'about', 'tokenization']

In [34]:
# Count how many times each token occurs.
fdist = FreqDist(tokens)

In [35]:
# Show the two most frequent tokens together with their counts.
top_two = fdist.most_common(2)
print(top_two)

[('Hi', 1), ('...', 1)]


#### n-grams: Generating sequences of N consecutive tokens.

* Usage: Used in text predictions, text generation.

In [27]:
from nltk.util import ngrams

In [28]:
tokens

['Hi', '...', 'today', 'we', 'will', 'learn', 'about', 'tokenization']

In [29]:
# Build every pair of adjacent tokens (n = 2); list() materializes the result
# so it can be printed.
bigrams = list(ngrams(tokens, 2))

In [30]:
print(bigrams)

[('Hi', '...'), ('...', 'today'), ('today', 'we'), ('we', 'will'), ('will', 'learn'), ('learn', 'about'), ('about', 'tokenization')]


In [31]:
# Build every run of three adjacent tokens (n = 3).
trigrams = list(ngrams(tokens,3))

In [32]:
print(trigrams)

[('Hi', '...', 'today'), ('...', 'today', 'we'), ('today', 'we', 'will'), ('we', 'will', 'learn'), ('will', 'learn', 'about'), ('learn', 'about', 'tokenization')]
