# Tokenizing using regular expressions (Regex)

In [44]:
import re
from nltk.tokenize import word_tokenize

text1 = "Eighty-seven miles to go, yet.  Onward!"
# [\w]+, searches only for word characters
# [\w-]+ also includes - in the same word -> See the difference in output
word = re.findall('[\w]+',text1)
word1 = re.findall('[\w-]+', text1)
# Using NLTK library-> Infact our regular expression tokenizer is better. It's not considering the punctuations as tokens, which is what we want
word2 = word_tokenize(text1)
print(word)
print(word1)
print(word2)

['Eighty', 'seven', 'miles', 'to', 'go', 'yet', 'Onward']
['Eighty-seven', 'miles', 'to', 'go', 'yet', 'Onward']
['Eighty-seven', 'miles', 'to', 'go', ',', 'yet', '.', 'Onward', '!']


In [46]:
import re

text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet 
species by building a self-sustaining city on Mars. In 2008, SpaceX’s Falcon 1 became the first privately developed 
liquid-fuel launch vehicle to orbit the Earth."""

# Level - 1
# Using split(), you can use only one separator at a time.
#  Punctuations to the right of word are added to the word, not good.
# Default separator is a space " "
a = text.split() 
print(a)

# For sentence tokenization, separator is '. '
b = text.split('. ')
print(b)

# Level-2
# Word tokenization
# Using regular expressions
# \w means any word character(letters numbers _)
tokens = re.findall("[\w]+", text)
print(tokens)

#Sentence tokenization
# Sentence to be splitted when you see ? ! .
sentences = re.compile('[.!?] ').split(text)
print(sentences)



['Founded', 'in', '2002,', 'SpaceX’s', 'mission', 'is', 'to', 'enable', 'humans', 'to', 'become', 'a', 'spacefaring', 'civilization', 'and', 'a', 'multi-planet', 'species', 'by', 'building', 'a', 'self-sustaining', 'city', 'on', 'Mars.', 'In', '2008,', 'SpaceX’s', 'Falcon', '1', 'became', 'the', 'first', 'privately', 'developed', 'liquid-fuel', 'launch', 'vehicle', 'to', 'orbit', 'the', 'Earth.']
['Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet \nspecies by building a self-sustaining city on Mars', 'In 2008, SpaceX’s Falcon 1 became the first privately developed \nliquid-fuel launch vehicle to orbit the Earth.']
['Founded', 'in', '2002', 'SpaceX', 's', 'mission', 'is', 'to', 'enable', 'humans', 'to', 'become', 'a', 'spacefaring', 'civilization', 'and', 'a', 'multi', 'planet', 'species', 'by', 'building', 'a', 'self', 'sustaining', 'city', 'on', 'Mars', 'In', '2008', 'SpaceX', 's', 'Falcon', '1', 'became', 'the', 'first', 'p

# Using the Natural Language Toolkit (NLTK) for tokenization

In [22]:
# Natural language toolkit
# Tokenizers divides a string into substrings. White Space not considered a token.
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [14]:

text = """Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet 
species by building a self-sustaining city on Mars. In 2008, SpaceX’s Falcon 1 became the first privately developed 
liquid-fuel launch vehicle to orbit the Earth."""

# Word tokenize using the library
# Note that every punctuation is also considered a token.
# Ex: ' , . are taken as tokens, we don't want this.
word = word_tokenize(text)
print(word)
sent = sent_tokenize(text)
print(sent)



['Founded', 'in', '2002', ',', 'SpaceX', '’', 's', 'mission', 'is', 'to', 'enable', 'humans', 'to', 'become', 'a', 'spacefaring', 'civilization', 'and', 'a', 'multi-planet', 'species', 'by', 'building', 'a', 'self-sustaining', 'city', 'on', 'Mars', '.', 'In', '2008', ',', 'SpaceX', '’', 's', 'Falcon', '1', 'became', 'the', 'first', 'privately', 'developed', 'liquid-fuel', 'launch', 'vehicle', 'to', 'orbit', 'the', 'Earth', '.']
['Founded in 2002, SpaceX’s mission is to enable humans to become a spacefaring civilization and a multi-planet \nspecies by building a self-sustaining city on Mars.', 'In 2008, SpaceX’s Falcon 1 became the first privately developed \nliquid-fuel launch vehicle to orbit the Earth.']


# Stemming using NLTK

In [68]:
# Stemming is like the precursor to lemmatization 
# In stemming we just try to chop off suffixes of the word, such that resultant is the simple version of the word
# Ex:  
# Porter's Stemmer algorithm is a popular stemming algorithm.
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize

words = ["program", "programs", "programmer", "programming", "programmers"]
# Note: the stemmed word need not always be a real word!
# Below is a case of over-stemming, all three words are not related to each other, but all three have the same stem from the PS algorithm
words2 = ["universal","universe","university"]
# Below is a case of under-stemming, the three words should have a common stem word due to almost same meaning but they don't from the algorithm
words3 = ["alumnus","almuni","alumnae"]
ps = PorterStemmer()
for w in words:
  print(w," -> ", ps.stem(w))
print("\n")
for w in words2:
  print(w,"->",ps.stem(w))
print("\n")
for w in words3:
  print(w,"->",ps.stem(w))


program  ->  program
programs  ->  program
programmer  ->  programm
programming  ->  program
programmers  ->  programm


universal -> univers
universe -> univers
university -> univers


alumnus -> alumnu
almuni -> almuni
alumnae -> alumna


# Lemmatizing using NLTK

In [63]:
# Lemmatization
# Converting a word to it's 'simple' form ->Links words with similar meaning to one word
# rocks -> rock
# better -> good
# corpora -> corpus

# Lemmatization using NLTK library
# nltk.stem library is used to remove 'morphological affixes' from a word, to leave only the word stem.
# Few difficulties -> ceil is not the stem of ceiling!
from nltk.stem import WordNetLemmatizer

# wnl is now an object pf WordNetLemmatizer
wnl = WordNetLemmatizer()
# WordNet is a lexical database of words-> basically thesaurus-like but also includes relations between words.
# Example: hypernyms -> Y is a hypernym of X, if every X is a (kind of) Y. -> canine is a hypernym of dog.
# arguments are the word and the part of speech it corresponds to,
# n - noun
# a - adjective
# v - verb
# r = adverb
print(wnl.lemmatize("worse",pos="a"))

text2 = "I am building spaceships to go to places, that otherwise would be impossible to go."
word = re.findall('[\w]+', text2)
# word is the tokenization of text2
print(word)

# Default pos = "n" (noun), you need to mention if it's something else.
mod = []
for w in word:
  modw = wnl.lemmatize(w)
  if(modw == w):
    modw = wnl.lemmatize(w,pos="v")
  mod.append(modw)

# word after lemmatization
print(mod)



bad
['I', 'am', 'building', 'spaceships', 'to', 'go', 'to', 'places', 'that', 'otherwise', 'would', 'be', 'impossible', 'to', 'go']
['I', 'be', 'build', 'spaceship', 'to', 'go', 'to', 'place', 'that', 'otherwise', 'would', 'be', 'impossible', 'to', 'go']


# Word Embeddings (also called word vectors)

1. Word embedding is basically associating a word with a vector, each dimension (or) column being a set of measurable characteristics of the word.

2. The closer the vectors are in the 100 (or any other) dimension space, closer are their meanings.

3. Word2vec is an algorithm used to obtain word embeddings (vector representation of words)
4. Glove is an unsupervised learning algorithm to do the same task. 

# Given a pre-trained word embeddings, finding closely related words.

In [107]:
import csv
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec

# Generating word vectors using Word2Vec
rev = pd.read_csv("Reviews.csv",engine="python", error_bad_lines=False)




  exec(code_obj, self.user_global_ns, self.user_ns)
Skipping line 168205: unexpected end of data


In [125]:
# We'll use the Text column of the dataset
# Preparing corpus
# Taking 1000 rows, and appending each text column with new line.
corpus = '\n'.join(rev[:1000]['Text'])
data = []
for i in sent_tokenize(corpus):
  temp = []
  for j in word_tokenize(i):
    temp.append(j.lower())

  data.append(temp)

# Implementing CBOW architecture
model = gensim.models.Word2Vec(data, min_count = 1,size = 100, window = 5, sg=0)
model.save("word2vec.model")

# model = Word2Vec.load("word2vec.model") -> Reloading model later

# mapping each word with a vector from the model above 
word_vectors = model.wv
# Voila! Our word vectors. 
# print(word_vectors['eat'])
similar = model.wv.most_similar('use', topn = 5)
print(similar)

[('who', 0.9999181032180786), ('add', 0.9998929500579834), ('hard', 0.9998905658721924), ('then', 0.9998830556869507), ('out', 0.9998829364776611)]
