![Practicum AI Logo image](https://github.com/PracticumAI/practicumai.github.io/blob/main/images/logo/PracticumAI_logo_250x50.png?raw=true)
***
These exercises are adapted from Baig et al. (2020) <i>The Deep Learning Workshop</i> from <a href="https://www.packtpub.com/product/the-deep-learning-workshop/9781839219856">Packt Publishers</a> (Exercises 4.01 - 4.06, page 159).

(15 Minutes: Exercises 4.01 - 4.03)

<div style="padding: 10px;margin-bottom: 20px;border: thin solid #30335D;border-left-width: 10px;background-color: #fff"><strong>Note:</strong> All exercises for the NLP workshop series are included in this notebook as later exercises build on earlier ones.</div>

In [1]:
# Widen the notebook container to 80% of the page so long outputs are easier to read.
# `display` and `HTML` are imported from IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container {width:80% !important;}</style>"))

#### Getting Started with Text Data Handling

In [2]:
raw_txt = """Welcome to the world of Deep Learning for NLP! We're in this together, and we'll learn together. 
NLP is amazing, and Deep Learning makes it even more fun. Let's learn!"""

##### Tokenization

In [3]:
import nltk
# Fetch the 'punkt' data package before tokenizing.
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead — verify
# against the installed NLTK version if tokenization raises a LookupError.
nltk.download('punkt')
from nltk import tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /home/danielmaxwell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
tokenize.sent_tokenize(raw_txt)

['Welcome to the world of Deep Learning for NLP!',
 "We're in this together, and we'll learn together.",
 'NLP is amazing, and Deep Learning makes it even more fun.',
 "Let's learn!"]

In [5]:
txt_sents = tokenize.sent_tokenize(raw_txt)

In [6]:
type(txt_sents), len(txt_sents)

(list, 4)

In [7]:
# Word-tokenize every sentence, producing one list of tokens per sentence.
txt_words = list(map(tokenize.word_tokenize, txt_sents))
type(txt_words), type(txt_words[0])

(list, list)

In [8]:
print(txt_words[:2])

[['Welcome', 'to', 'the', 'world', 'of', 'Deep', 'Learning', 'for', 'NLP', '!'], ['We', "'re", 'in', 'this', 'together', ',', 'and', 'we', "'ll", 'learn', 'together', '.']]


##### Normalizing case

In [9]:
# Optional: lowercases the entire raw string. The next cell lowercases the
# already-split sentences instead, so this cell can be skipped.
raw_txt = raw_txt.lower()

In [10]:
txt_sents = [sent.lower() for sent in txt_sents]
txt_sents

['welcome to the world of deep learning for nlp!',
 "we're in this together, and we'll learn together.",
 'nlp is amazing, and deep learning makes it even more fun.',
 "let's learn!"]

In [11]:
txt_words = [tokenize.word_tokenize(sent) for sent in txt_sents]

In [12]:
print(txt_words[:2])

[['welcome', 'to', 'the', 'world', 'of', 'deep', 'learning', 'for', 'nlp', '!'], ['we', "'re", 'in', 'this', 'together', ',', 'and', 'we', "'ll", 'learn', 'together', '.']]


##### Removing punctuation

In [13]:
from string import punctuation

In [14]:
list_punct = list(punctuation)
print(list_punct)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [15]:
def drop_punct(input_tokens):
    """Return the tokens from ``input_tokens`` that are not punctuation marks.

    A token is dropped only when the whole token equals an entry of the
    notebook-level ``list_punct``; tokens that merely contain punctuation
    (for example "'re") are kept. The original order is preserved.
    """
    kept = []
    for tok in input_tokens:
        if tok in list_punct:
            continue
        kept.append(tok)
    return kept

In [16]:
drop_punct(["let",".","us",".","go","!"])

['let', 'us', 'go']

In [17]:
txt_words_nopunct = [drop_punct(sent) for sent in txt_words]
print(txt_words_nopunct)

[['welcome', 'to', 'the', 'world', 'of', 'deep', 'learning', 'for', 'nlp'], ['we', "'re", 'in', 'this', 'together', 'and', 'we', "'ll", 'learn', 'together'], ['nlp', 'is', 'amazing', 'and', 'deep', 'learning', 'makes', 'it', 'even', 'more', 'fun'], ['let', "'s", 'learn']]


##### Removing stop words

In [18]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/danielmaxwell/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
from nltk.corpus import stopwords
list_stop = stopwords.words("english")
len(list_stop)

179

In [20]:
print(list_stop[:50])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be']


#### Exercise 4.01 (Tokenizing, Case Normalization, Punctuation and Stop Word...) - Page 166

***

<span style="color:steelblue">**(1)**</span>

In [21]:
# Code it!

<span style="color:steelblue">**(2)**</span>

In [22]:
raw_txt = """Welcome to the world of Deep Learning for NLP! We're in this together, and we'll learn together. NLP is amazing, and Deep Learning makes it even more fun. Let's learn!"""

<span style="color:steelblue">**(3)**</span>

In [23]:
# Code it!

<span style="color:steelblue">**(4)**</span>

In [24]:
# Code it!

<span style="color:steelblue">**(5)**</span>

In [25]:
# Code it!

<span style="color:steelblue">**(6)**</span>

In [26]:
# Code it!

<span style="color:steelblue">**(7)**</span>

In [27]:
# Code it!

<span style="color:steelblue">**(8)**</span>

In [28]:
def drop_stop(input_tokens):
    """Return the tokens from ``input_tokens`` that are not in ``stop_final``.

    ``stop_final`` is not defined in this cell; it must already exist in the
    notebook namespace when this function is called. Order is preserved.
    """
    return list(filter(lambda tok: tok not in stop_final, input_tokens))

<span style="color:steelblue">**(9)**</span>

In [29]:
# Code it!

<span style="color:steelblue">**(10)**</span>

In [31]:
# Code it!

***

##### Stemming

In [76]:
from nltk.stem import PorterStemmer

In [77]:
stemmer_p = PorterStemmer()

In [78]:
print(stemmer_p.stem("driving"))

drive


In [79]:
txt = "I mustered all my drive, drove to the driving school!"

In [80]:
# Split the example sentence into tokens, then print the Porter stem of each one.
tokens = tokenize.word_tokenize(txt)
print(list(map(stemmer_p.stem, tokens)))

['I', 'muster', 'all', 'my', 'drive', ',', 'drove', 'to', 'the', 'drive', 'school', '!']


##### Lemmatization

In [81]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahim.baig\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [82]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

In [83]:
lemmatizer.lemmatize("ponies")

'pony'

#### Exercise 4.02 (Stemming Our Data) - Page 172

***

<span style="color:steelblue">**(1)**</span>

In [84]:
# Code it!

<span style="color:steelblue">**(2)**</span>

In [85]:
# Code it!

<span style="color:steelblue">**(3)**</span>

In [9]:
# Code it!

... applying stemmer to all the sentences

<span style="color:steelblue">**(4)**</span>

In [87]:
# Code it!

<span style="color:steelblue">**(5)**</span>

In [8]:
# Code it!

***
##### Downloading Text Corpora using NLTK

In [1]:
import nltk

<div style="padding: 10px;margin-bottom: 20px;border: thin solid #30335D;border-left-width: 10px;background-color: #fff"><strong>Note:</strong> The nltk.download() command, when executed without arguments, does not open the NLTK downloader in a new window, as pictured in the textbook.  The Unix version has a command line interface.  Type 'l' in the NLTK field and hit enter to view the Packages available for install.  As this list is rather long, you will need to hit enter multiple times to scroll through it.  The gutenberg package is in this list.  To install it, type 'd' and hit enter.  Then type 'gutenberg' in the NLTK field and hit enter again.  The software will respond with an installation message.  And finally, type 'q' to exit the downloader.</div>

In [5]:
# Launches the interactive, menu-driven NLTK downloader described in the note above.
# A specific package can instead be fetched without prompts by passing its name,
# e.g. nltk.download('gutenberg').
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------


Downloader>  q


True

In [3]:
alice_raw = nltk.corpus.gutenberg.raw('carroll-alice.txt')

In [4]:
alice_raw[:800]

"[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I. Down the Rabbit-Hole\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into the\nbook her sister was reading, but it had no pictures or conversations in\nit, 'and what is the use of a book,' thought Alice 'without pictures or\nconversation?'\n\nSo she was considering in her own mind (as well as she could, for the\nhot day made her feel very sleepy and stupid), whether the pleasure\nof making a daisy-chain would be worth the trouble of getting up and\npicking the daisies, when suddenly a White Rabbit with pink eyes ran\nclose by her.\n\nThere was nothing so VERY remarkable in that; nor did Alice think it so\nVERY much out of the way to hear the Rabbit"

#### Representation
##### One hot encoding

In [92]:
txt_words_nostop

[['welcome', 'world', 'deep', 'learning', 'nlp'],
 ["'re", 'together', "'ll", 'learn', 'together'],
 ['nlp', 'amazing', 'deep', 'learning', 'makes', 'even', 'fun'],
 ['let', "'s", 'learn']]

#### Exercise 4.03 (Creating One-Hot Encoding for Our Data) - Page 181

***

<span style="color:steelblue">**(1)**</span>

In [1]:
# Code it!

<span style="color:steelblue">**(2)**</span>

In [94]:
# Code it!

<span style="color:steelblue">**(3)**</span>

In [95]:
# Code it!

<span style="color:steelblue">**(4)**</span>

In [96]:
# Code it!

<span style="color:steelblue">**(5)**</span>

In [97]:
# Code it!

***

##### Term Frequencies

In [99]:
from sklearn.feature_extraction.text import CountVectorizer

In [100]:
vectorizer = CountVectorizer(max_features = 5)

In [101]:
vectorizer.fit(txt_sents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=5, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [102]:
vectorizer.vocabulary_

{'deep': 1, 'we': 4, 'together': 3, 'and': 0, 'learn': 2}

In [103]:
txt_dtm = vectorizer.fit_transform(txt_sents)

In [104]:
txt_dtm.toarray()

array([[0, 1, 0, 0, 0],
       [1, 0, 1, 2, 2],
       [1, 1, 0, 0, 0],
       [0, 0, 1, 0, 0]], dtype=int64)

In [105]:
txt_sents

['welcome to the world of deep learning for nlp!',
 "we're in this together, and we'll learn together.",
 'nlp is amazing, and deep learning makes it even more fun.',
 "let's learn!"]

In [106]:
def do_nothing(doc):
    """Identity hook: return ``doc`` exactly as received.

    Passed to CountVectorizer as both preprocessor and tokenizer so that
    input which is already tokenized is used as-is.
    """
    unchanged = doc
    return unchanged

In [107]:
# The documents fitted below are already lists of tokens, so both the
# preprocessing and tokenizing hooks are replaced with the identity function.
# token_pattern=None makes explicit that the default regex is not used when a
# custom tokenizer is supplied (recent scikit-learn versions warn otherwise).
vectorizer = CountVectorizer(max_features  = 5,
                             preprocessor  = do_nothing,
                             tokenizer     = do_nothing,
                             token_pattern = None)

In [108]:
txt_dtm = vectorizer.fit_transform(txt_words_stem)

In [109]:
txt_dtm.toarray()

array([[0, 1, 1, 1, 0],
       [1, 0, 1, 0, 2],
       [0, 1, 1, 1, 0],
       [0, 0, 1, 0, 0]], dtype=int64)

In [110]:
vectorizer.vocabulary_

{'deep': 1, 'learn': 2, 'nlp': 3, 'togeth': 4, "'ll": 0}

In [111]:
txt_words_stem

[['welcom', 'world', 'deep', 'learn', 'nlp'],
 ["'re", 'togeth', "'ll", 'learn', 'togeth'],
 ['nlp', 'amaz', 'deep', 'learn', 'make', 'even', 'fun'],
 ['let', "'s", 'learn']]

#### Exercise 4.04 (Document Term Matrix with TF-IDF) - Page 188

***

<div style="padding: 10px;margin-bottom: 20px;border: thin solid #30335D;border-left-width: 10px;background-color: #fff"><strong>Note:</strong> This exercise is optional.</div>

<span style="color:steelblue">**(1)**</span>

In [112]:
# Code it!

<span style="color:steelblue">**(2)**</span>

In [113]:
# Code it!

<span style="color:steelblue">**(3)**</span>

In [6]:
# Code it!

<span style="color:steelblue">**(4)**</span>

In [5]:
# Code it!

<span style="color:steelblue">**(5)**</span>

In [116]:
# Code it!

<span style="color:steelblue">**(6)**</span>

In [3]:
# Code it!

<span style="color:steelblue">**(7)**</span>

In [4]:
# Code it!

***
### NLP Workshop (2)
#### Training Our Own Embeddings - Page 197

In [3]:
from gensim.models import word2vec

<div style="padding: 10px;margin-bottom: 20px;border: thin solid #30335D;border-left-width: 10px;background-color: #fff"><strong>Note:</strong> The code block below is in the text but is commented out as the text8 dataset is in the data folder.</div>

In [3]:
# Another way of loading the data: fetch text8 through gensim's downloader
# (presumably `import gensim.downloader as api`; `api` is not imported in this notebook).
# If this doesn't work, you can use the local text8 corpus file instead.
# dataset = api.load("text8")

In [4]:
# Load the dataset from the data folder.
dataset = word2vec.Text8Corpus("data/text8")

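To help make results reproducible, set the random seed to 1. Note that fully deterministic Word2Vec training also requires a single worker thread (`workers=1`) and a fixed `PYTHONHASHSEED`.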

<div style="padding: 10px;margin-bottom: 20px;border: thin solid #30335D;border-left-width: 10px;background-color: #fff"><strong>Note:</strong> The text does not import the numpy library so that needs to be done prior to setting the random seed.</div>

In [1]:
import numpy as np

np.random.seed(1)

In [None]:
model = word2vec.Word2Vec(dataset)

In [10]:
print(model.wv["animal"])

[ 0.4287323   0.63332057  1.2456409   1.4752632   2.109443    1.1569548
 -0.02130948  0.0182436  -0.31349027  0.54959863 -0.38544917 -0.6813727
  0.6621224  -0.7377177  -1.359701    1.203772   -0.9285904   1.3970402
 -0.24282815  0.7036325   1.643011   -1.1232854  -0.8321323  -0.6999443
 -0.64125305  1.2003804  -2.5951238   0.9126282   0.40390974  0.34987825
 -0.6876566  -0.5630351  -1.4752318  -0.9742847  -1.2873613   0.364369
 -1.4539522  -0.68561345  3.5552766   0.72201556  1.5026729   0.98386985
 -0.28209195  1.3619014   1.5481789  -1.93158     0.2837808   1.1228682
 -0.26478738 -2.7449822   1.5130725   2.337826    0.990443    0.14571731
  1.7722192   0.2289574   0.91638774 -0.9191398   0.31600663 -1.9637885
  1.6257365   2.750085    0.82116085 -0.09666173 -1.0970197   1.7428256
 -1.2706696  -0.06126465  0.7258795  -2.1243627  -1.4855682   2.3246202
 -1.114259    1.3766924   0.03910503 -0.5288294  -0.11479293 -0.5253076
  0.0630557  -0.49075633  0.02464008  0.9779139  -1.0765476  -

In [11]:
len(model.wv["animal"])

100

In [125]:
model.wv.most_similar("animal")

[('insect', 0.7510448098182678),
 ('animals', 0.7423559427261353),
 ('aquatic', 0.6713050007820129),
 ('ants', 0.665005087852478),
 ('feces', 0.6597224473953247),
 ('humans', 0.6595240235328674),
 ('insects', 0.6531916856765747),
 ('domesticated', 0.639849066734314),
 ('mammal', 0.638308048248291),
 ('herd', 0.6336805820465088)]

In [126]:
model.wv.most_similar("happiness")

[('humanity', 0.7809507846832275),
 ('goodness', 0.7767032384872437),
 ('pleasure', 0.7601181268692017),
 ('mankind', 0.733366072177887),
 ('satisfaction', 0.7316511869430542),
 ('compassion', 0.7286831736564636),
 ('desires', 0.7183192372322083),
 ('feelings', 0.7128568887710571),
 ('perfection', 0.7075753808021545),
 ('salvation', 0.7067484855651855)]

##### Semantic Regularities in Word Embeddings

In [127]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn = 5)

[('queen', 0.6851475238800049),
 ('prince', 0.6383140087127686),
 ('empress', 0.6180450916290283),
 ('princess', 0.6116641759872437),
 ('son', 0.6045335531234741)]

In [128]:
model.wv.most_similar(positive=['uncle', 'woman'], negative=['man'], topn = 5)

[('aunt', 0.8399454951286316),
 ('grandmother', 0.8323032259941101),
 ('wife', 0.8157851099967957),
 ('niece', 0.8152226805686951),
 ('widow', 0.7859092950820923)]

#### Exercise 4.05 (Vectors for Phrases) - Page 201

***

<span style="color:steelblue">**(1)**</span>

In [None]:
# Code it!

<span style="color:steelblue">**(2)**</span>

In [None]:
# Code it!

<span style="color:steelblue">**(3)**</span>

In [None]:
# Code it!

<span style="color:steelblue">**(4)**</span>

In [None]:
# Code it!

<span style="color:steelblue">**(5)**</span>

In [None]:
# Code it!

<span style="color:steelblue">**(6)**</span>

In [7]:
# Code it!

#### Effect of Parameters - 'size' of the Vector

In [135]:
# Retrain with 30-dimensional vectors instead of the default dimensionality.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm against the installed gensim version.
model = word2vec.Word2Vec(dataset, size = 30)

In [136]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn = 5)

[('son', 0.8133434653282166),
 ('empress', 0.8022229671478271),
 ('emperor', 0.7999867796897888),
 ('archbishop', 0.7950774431228638),
 ('constantine', 0.7858606576919556)]

#### Effect of parameters - skipgram vs. CBOW

##### Rare terms - oeuvre

In [137]:
model = word2vec.Word2Vec(dataset)

In [138]:
model.wv.most_similar("oeuvre", topn = 5)

[('seminal', 0.7173739671707153),
 ('baglione', 0.6992780566215515),
 ('wace', 0.6952950954437256),
 ('mockery', 0.6938953399658203),
 ('foxe', 0.687375545501709)]

In [139]:
model_sg = word2vec.Word2Vec(dataset, sg = 1)

In [140]:
model_sg.wv.most_similar("oeuvre", topn = 5)

[('masterful', 0.8323545455932617),
 ('satiric', 0.8200669288635254),
 ('masterwork', 0.815832257270813),
 ('mussorgsky', 0.815514862537384),
 ('librettos', 0.8108195662498474)]

#### Exercise 4.06 (Training Word Vectors on Different Datasets) - Page 205

***

<span style="color:steelblue">**(1)**</span>

In [13]:
# Code it!

<span style="color:steelblue">**(2)**</span>

In [146]:
# Code it!

<span style="color:steelblue">**(3)**</span>

In [11]:
# Code it!

<span style="color:steelblue">**(4)**</span>

In [12]:
# Code it!

##### Using Pre-Trained Word Vectors

In [1]:
from gensim.scripts.glove2word2vec import glove2word2vec

# Paths to the GloVe vectors file and to the converted copy written below.
glove_input_file     = 'data/glove.6B.100d.txt'
word2vec_output_file = 'data/glove.6B.100d.w2vformat.txt'

# Convert the GloVe file to word2vec text format so it can be loaded with
# KeyedVectors.load_word2vec_format in the next cell.
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [2]:
from gensim.models.keyedvectors import KeyedVectors

glove_model = KeyedVectors.load_word2vec_format("data/glove.6B.100d.w2vformat.txt", binary = False)

In [3]:
glove_model.most_similar("money", topn = 5)

[('funds', 0.8508071303367615),
 ('cash', 0.848483681678772),
 ('fund', 0.7594833374023438),
 ('paying', 0.7415367364883423),
 ('pay', 0.7407673001289368)]

In [4]:
glove_model.most_similar(positive=['woman', 'king'], negative=['man'], topn = 5)

[('queen', 0.7698541283607483),
 ('monarch', 0.6843380928039551),
 ('throne', 0.6755735874176025),
 ('daughter', 0.6594556570053101),
 ('princess', 0.6520534753799438)]

##### Bias in Embeddings – A Word of Caution

In [149]:
model.wv.most_similar(positive=['woman', 'doctor'], negative=['man'], topn = 5)

[('child', 0.6149958372116089),
 ('nurse', 0.6090491414070129),
 ('teacher', 0.5878923535346985),
 ('dominatrix', 0.5384681224822998),
 ('detective', 0.5246642231941223)]

In [150]:
model.wv.most_similar(positive=['woman', 'smart'], negative=['man'], topn = 5)

[('pet', 0.6097452640533447),
 ('odie', 0.567996621131897),
 ('lingerie', 0.5643869042396545),
 ('scam', 0.5464061498641968),
 ('thug', 0.5415985584259033)]