In [1]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**Tokenization**

In [2]:
sentence = "books are on the table"
tokens = word_tokenize(sentence)
tokens

['books', 'are', 'on', 'the', 'table']

**StopWords**

In [3]:
from nltk.corpus import stopwords
nltk.download('stopwords')

sentence = "Machine learning is so cool"
stopwords = set(stopwords.words('english'))

word_tokens = word_tokenize(sentence)
filtered_sentence = [w for w in word_tokens if w not in stopwords]

filtered_sentence

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['Machine', 'learning', 'cool']

**Stemming**

In [4]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

sentence = "Machine learning feels so cool"
sentence_tokens = word_tokenize(sentence)

stemmed_sentence = [ps.stem(s) for s in sentence_tokens]
stemmed_sentence

['machin', 'learn', 'feel', 'so', 'cool']

**Lemmatizing**

In [5]:
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')

wnl = WordNetLemmatizer()
lemmatized_sentence = [wnl.lemmatize(s) for s in sentence_tokens]
lemmatized_sentence

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['Machine', 'learning', 'feel', 'so', 'cool']

**POS Tagging**

In [6]:
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
tokens = word_tokenize("And now for something completely different")
tokens_tags = pos_tag(tokens)
tokens_tags

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

**Brown Automatic Tagging**

In [7]:
from nltk.corpus import brown
nltk.download('brown')

brown_sents = brown.sents(categories='news')
brown_tagged_sents = brown.tagged_sents(categories='news')

print(len(brown_sents))
print(len(brown_tagged_sents))

print(brown_sents[0][:5])
print(brown_tagged_sents[0][:5])

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


4623
4623
['The', 'Fulton', 'County', 'Grand', 'Jury']
[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL')]


**Automatic Tagging (Default Tagger)**

In [8]:
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
highest_freq_tag = nltk.FreqDist(tags).max()
highest_freq_tag

'NN'

In [9]:
raw = "I do not like green eggs and ham, I do not like them Sam I am!"
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger(highest_freq_tag)
default_tagger.tag(tokens)

[('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('green', 'NN'),
 ('eggs', 'NN'),
 ('and', 'NN'),
 ('ham', 'NN'),
 (',', 'NN'),
 ('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('them', 'NN'),
 ('Sam', 'NN'),
 ('I', 'NN'),
 ('am', 'NN'),
 ('!', 'NN')]

In [10]:
default_tagger.accuracy(brown_tagged_sents)

0.13089484257215028

**Automatic Tagging (Regular Expression Tagger)**

In [11]:
patterns = [
            (r'.*ing$', 'VBG'),  #gerunds
            (r'.*ed$', 'VBD'),   #simple past
            (r'.*es$', 'VBZ'),   #3rd singular present
            (r'.*ould$', 'MD'),  #modals
            (r'.*\'s$', 'NN$'),  #possessive nouns
            (r'.*s$', 'NNS'),    #plural nouns
            (r'.*^-?[0-9]+(\.[0-9]+)?$', 'CD'),  #cardinal numbers
            (r'.*$', 'NN')
        ]      #nouns (default)

In [12]:
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3][:10])

[('``', 'NN'),
 ('Only', 'NN'),
 ('a', 'NN'),
 ('relative', 'NN'),
 ('handful', 'NN'),
 ('of', 'NN'),
 ('such', 'NN'),
 ('reports', 'NNS'),
 ('was', 'NNS'),
 ('received', 'VBD')]

In [13]:
regexp_tagger.accuracy(brown_tagged_sents)

0.20186168625812995

**Automatic Tagging (N-Gram Tagging)**

In [14]:
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'QL'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [15]:
unigram_tagger.accuracy(brown_tagged_sents)

0.9349006503968017

In [16]:
size = int(len(brown_sents) * 0.9)
training_set = brown_tagged_sents[:size]
testing_set = brown_tagged_sents[size:]
print(testing_set)  
unigram_tagger = nltk.UnigramTagger(training_set)

[[('But', 'CC'), ('in', 'IN'), ('all', 'ABN'), ('its', 'PP$'), ('175', 'CD'), ('years', 'NNS'), (',', ','), ('not', '*'), ('a', 'AT'), ('single', 'AP'), ('Negro', 'NP'), ('student', 'NN'), ('has', 'HVZ'), ('entered', 'VBN'), ('its', 'PP$'), ('classrooms', 'NNS'), ('.', '.')], [('Last', 'AP'), ('week', 'NN'), ('Federal', 'JJ-TL'), ('District', 'NN-TL'), ('Judge', 'NN-TL'), ('William', 'NP'), ('A.', 'NP'), ('Bootle', 'NP'), ('ordered', 'VBD'), ('the', 'AT'), ('university', 'NN'), ('to', 'TO'), ('admit', 'VB'), ('immediately', 'RB'), ('a', 'AT'), ('``', '``'), ('qualified', 'VBN'), ("''", "''"), ('Negro', 'NP'), ('boy', 'NN'), ('and', 'CC'), ('girl', 'NN'), ('.', '.')], ...]


In [17]:
unigram_tagger.accuracy(training_set)

0.9353630649241612

**Automatic Tagging (N-Gram Tagging)**

In [18]:
bigram_tagger = nltk.BigramTagger(training_set)
bigram_tagger.tag(brown_sents[2007])

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'CS'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [19]:
unseen_sent = brown_sents[4203]
bigram_tagger.tag(unseen_sent)

[('The', 'AT'),
 ('population', 'NN'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('Congo', 'NP'),
 ('is', 'BEZ'),
 ('13.5', None),
 ('million', None),
 (',', None),
 ('divided', None),
 ('into', None),
 ('at', None),
 ('least', None),
 ('seven', None),
 ('major', None),
 ('``', None),
 ('culture', None),
 ('clusters', None),
 ("''", None),
 ('and', None),
 ('innumerable', None),
 ('tribes', None),
 ('speaking', None),
 ('400', None),
 ('separate', None),
 ('dialects', None),
 ('.', None)]

In [20]:
bigram_tagger.accuracy(testing_set)

0.10206319146815508

In [21]:
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(training_set, backoff=t0)
t2 = nltk.BigramTagger(training_set, backoff=t1)

t2.accuracy(testing_set)

0.8452108043456593

**Automatic Tagging (Brill's Tagger)**

In [22]:
from nltk.tbl import demo as brill_demo
nltk.download('treebank')
brill_demo.demo()

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


Loading tagged data from treebank... 
Read testing data (200 sents/5251 wds)
Read training data (800 sents/19933 wds)
Read baseline data (800 sents/19933 wds) [reused the training set]
Trained baseline tagger
    Accuracy on test set: 0.8358
Training tbl tagger...
TBL train (fast) (seqs: 800; tokens: 19933; tpls: 24; min score: 3; min acc: None)
Finding initial useful rules...
    Found 12799 useful rules.

           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
  23  23   0   0  | POS->VBZ if Pos:PRP@[-2,-1]
  18  19   1   0  | NN->VB if Pos:-NONE-@[-2] & Pos:TO@[-1]
  14  14   0   0  | VBP->VB if Pos:MD@[-2,-1]
  12  12   0   0  | VBP->VB if Pos:TO@[-1]
  

**Named Entity Recognition (NER)**

In [23]:
# from nltk.tag import StanfordNERTagger
# st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') 
# st.tag('Mustafa Walid is in the University of MSA. He majors in computer science and wishes to work in google')

**ONE HOT ENCODING**

In [24]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from numpy import array

#define example
data = ['cold','cold','warm','cold','hot','hot','warm','cold','warm','hot']
values = array(data)
print(values)
#integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

#binary encode
one_hot_encoder = OneHotEncoder(sparse_output=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
print(one_hot_encoded)

['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']
[0 0 2 0 1 1 2 0 2 1]
[[1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


**Count Vectorizer & TF-IDF**

In [48]:
import pandas as pd

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
train = ['The sky is blue.', 'The sun is bright.']

count_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')

count_wm = count_vectorizer.fit_transform(train)
tfidf_wm = tfidf_vectorizer.fit_transform(train)

In [68]:
count_wm.toarray()

array([[1, 0, 1, 0],
       [0, 1, 0, 1]], dtype=int64)

In [69]:
count_tokens = count_vectorizer.get_feature_names_out()
tfidf_tokens = tfidf_vectorizer.get_feature_names_out()
count_tokens

array(['blue', 'bright', 'sky', 'sun'], dtype=object)

In [70]:
df_count_vect = pd.DataFrame(data=count_wm.toarray(), index=['Doc1', 'Doc2'], columns=count_tokens)
df_count_vect

Unnamed: 0,blue,bright,sky,sun
Doc1,1,0,1,0
Doc2,0,1,0,1


In [71]:
df_tfidf_vect = pd.DataFrame(data=tfidf_wm.toarray(), index=['Doc1', 'Doc2'], columns=tfidf_tokens)
df_tfidf_vect

Unnamed: 0,blue,bright,sky,sun
Doc1,0.707107,0.0,0.707107,0.0
Doc2,0.0,0.707107,0.0,0.707107


In [72]:
print("Count Vectorizer\n")
print(df_count_vect)
print("\nTf-IDF Vectorizer\n")
print(df_tfidf_vect)

Count Vectorizer

      blue  bright  sky  sun
Doc1     1       0    1    0
Doc2     0       1    0    1

Tf-IDF Vectorizer

          blue    bright       sky       sun
Doc1  0.707107  0.000000  0.707107  0.000000
Doc2  0.000000  0.707107  0.000000  0.707107


**N-Gram Reperesentation**

In [77]:
text = ['NLP has changed the world', 'I love NLP', 'NLP is cool', 'NLP is future']
count_vectorizer = CountVectorizer(ngram_range=(2,2), stop_words='english')

#Bigram Count Vectorizer
count_wm = count_vectorizer.fit_transform(text)
count_tokens = count_vectorizer.get_feature_names_out()

df_count_vect = pd.DataFrame(data=count_wm.toarray(), index=['Doc1', 'Doc2','Doc3','Doc4'], columns=count_tokens)
df_count_vect

Unnamed: 0,changed world,love nlp,nlp changed,nlp cool,nlp future
Doc1,1,0,1,0,0
Doc2,0,1,0,0,0
Doc3,0,0,0,1,0
Doc4,0,0,0,0,1
