Tokenization Functions

1. TreebankWordTokenizer

In [3]:
import nltk

In [4]:
from nltk.tokenize import TreebankWordTokenizer

In [5]:
text = "I'm Soundhiri. I'm Studying B.E CSE..."
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize(text))

['I', "'m", 'Soundhiri.', 'I', "'m", 'Studying', 'B.E', 'CSE', '...']


2. WordPunct Tokenizer

In [6]:
from nltk.tokenize import wordpunct_tokenize

In [7]:
print(wordpunct_tokenize(text))

['I', "'", 'm', 'Soundhiri', '.', 'I', "'", 'm', 'Studying', 'B', '.', 'E', 'CSE', '...']


3. MWE Tokenizer

In [8]:
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
tokeniz = MWETokenizer()
tokeniz.add_mwe(('B.E','CSE'))
print(tokeniz.tokenize(word_tokenize(text)))

['I', "'m", 'Soundhiri', '.', 'I', "'m", 'Studying', 'B.E_CSE', '...']


 Stemming  examples

1. Lancaster Stemming

In [11]:
from nltk.stem import LancasterStemmer

In [13]:
lancaster = LancasterStemmer()
words = ['drinking','drink','drank','eat','eating']
for word in words:
    print(word,"--->",lancaster.stem(word))

drinking ---> drink
drink ---> drink
drank ---> drank
eat ---> eat
eating ---> eat


2. Regexp Stemmer

In [14]:
from nltk.stem import RegexpStemmer

In [15]:
regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)
words = ['massable','was','bees','computers','advisable']
for word in words:
    print(word,"--->",regexp.stem(word))

massable ---> mass
was ---> was
bees ---> bee
computers ---> computer
advisable ---> advis


Lemmatization techniques

1. spaCy Lemmatization

In [16]:
import spacy

In [17]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'the bats saw the cats with best stripes hanging upside down by their feet')
tokens = []
for token in doc:
    tokens.append(token)
print(tokens)

[the, bats, saw, the, cats, with, best, stripes, hanging, upside, down, by, their, feet]


In [18]:
lemmatized_sentence = " ".join([token.lemma_ for token in doc])
print(lemmatized_sentence)

the bat see the cat with good stripe hang upside down by -PRON- foot


2. TreeTagger Lemmatization

In [19]:
!pip install pattern
import pattern
from pattern.en import lemma, lexeme
from pattern.en import parse

Collecting pattern
  Downloading Pattern-3.6.0.tar.gz (22.2 MB)
[K     |████████████████████████████████| 22.2 MB 1.4 MB/s 
Collecting backports.csv
  Downloading backports.csv-1.0.7-py2.py3-none-any.whl (12 kB)
Collecting mysqlclient
  Downloading mysqlclient-2.1.0.tar.gz (87 kB)
[K     |████████████████████████████████| 87 kB 5.3 MB/s 
Collecting feedparser
  Downloading feedparser-6.0.8-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 7.6 MB/s 
[?25hCollecting pdfminer.six
  Downloading pdfminer.six-20220319-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 27.4 MB/s 
Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 29.1 MB/s 
[?25hCollecting cherrypy
  Downloading CherryPy-18.6.1-py2.py3-none-any.whl (419 kB)
[K     |████████████████████████████████| 419 kB 49.2 MB/s 
Collecting portend>=2.1.1
  Downloading portend-3.1.0-py3-none-any.whl (5.3 kB)
Collecti

In [21]:
sentence = "the bats saw the cats with best stripes hanging upside down by their feet"
lemmatized_sentence = " ".join([lemma(word) for word in sentence.split()])
print(lemmatized_sentence)

the bat see the cat with best stripe hang upside down by their feet


In [22]:
all_lemmas_for_each_word = [lexeme(wd) for wd in sentence.split()]
print(all_lemmas_for_each_word)

[['the', 'thes', 'thing', 'thed'], ['bat', 'bats', 'batting', 'batted'], ['see', 'sees', 'seeing', 'saw', 'seen'], ['the', 'thes', 'thing', 'thed'], ['cat', 'cats', 'catting', 'catted'], ['with', 'withs', 'withing', 'withed'], ['best', 'bests', 'besting', 'bested'], ['stripe', 'stripes', 'striping', 'striped'], ['hang', 'hangs', 'hanging', 'hung'], ['upside', 'upsides', 'upsiding', 'upsided'], ['down', 'downs', 'downing', 'downed'], ['by', 'bies', 'bying', 'bied'], ['their', 'theirs', 'theiring', 'theired'], ['feet', 'feets', 'feeting', 'feeted']]


3. Genism

In [23]:
from gensim.utils import lemmatize

In [26]:
sentence = "the bats saw the cats with best stripes hanging upside down by their feet"
 
lemmatized_sentence = [word.decode('utf-8').split('.')[0] for word in lemmatize(sentence)]
 
print(lemmatized_sentence)

['bat/NN', 'see/VB', 'cat/NN', 'best/JJ', 'stripe/NN', 'hang/VB', 'upside/RB', 'foot/NN']
