## Lexical Analysis

In [2]:
import numpy as np
import pandas as pd
## Install spaCy with pip (pip install spacy), then download a language model:
#python -m spacy download en  #for english
#python -m spacy download en_core_web_lg
#python -m spacy download en_core_web_sm
import spacy
import os
import nltk

### Tokenization

In [5]:
# Demo sentences used by every tokenizer below: a tweet-like string (mention,
# hashtags, repeated punctuation, URL), a sentence with an abbreviation, a
# hyphenated word and a price, and one plain sentence.
all_sents = [
    "@uday can't wait for the #nlp notes YAAAAAAY!!! #deeplearning https://udai.gitbook.io/practical-ml/",
    "That U.S.A. poster-print costs $12.40...",
    "I am writing NLP basics.",
]
# Keep the individual names available as well.
demo_sent1, demo_sent2, demo_sent3 = all_sents
print(all_sents)

["@uday can't wait for the #nlp notes YAAAAAAY!!! #deeplearning https://udai.gitbook.io/practical-ml/", 'That U.S.A. poster-print costs $12.40...', 'I am writing NLP basics.']


#### White Space Tokenizer

In [7]:
# Whitespace tokenization: str.split() with no argument splits on any run of
# whitespace (spaces, tabs, newlines) and never yields empty tokens, whereas
# split(' ') splits on single space characters only.
for sent in all_sents:
    print(sent.split())

['@uday', "can't", 'wait', 'for', 'the', '#nlp', 'notes', 'YAAAAAAY!!!', '#deeplearning', 'https://udai.gitbook.io/practical-ml/']
['That', 'U.S.A.', 'poster-print', 'costs', '$12.40...']
['I', 'am', 'writing', 'NLP', 'basics.']


#### NLTK word tokenizer

In [10]:
from nltk.tokenize import word_tokenize

# Run NLTK's word_tokenize over each demo sentence and show the token list.
for sent in all_sents:
    tokens = word_tokenize(sent)
    print(tokens)

['@', 'uday', 'ca', "n't", 'wait', 'for', 'the', '#', 'nlp', 'notes', 'YAAAAAAY', '!', '!', '!', '#', 'deeplearning', 'https', ':', '//udai.gitbook.io/practical-ml/']
['That', 'U.S.A.', 'poster-print', 'costs', '$', '12.40', '...']
['I', 'am', 'writing', 'NLP', 'basics', '.']


#### NLTK Regex Tokenizer

In [12]:
# Verbose-mode (?x) regular expression: whitespace and #-comments inside the
# pattern are ignored, and the alternatives are tried from top to bottom.
# The pattern lines must not start with the "..." continuation prompts of an
# interactive session: inside the string they are three "any character"
# wildcards attached to each alternative.
# NOTE(review): in the last character class, `:-_` is read as a range
# (':' through '_'), which is why '@' is emitted as a token; it is left
# unchanged here so the tokenization stays the same.
pattern = r'''(?x)         # set flag to allow verbose regexps
      (?:[A-Z]\.)+         # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*         # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?   # currency and percentages
    | \.\.\.               # ellipsis
    | [][.,;"'?():-_`]     # these are separate tokens; includes ], [
'''
for sent in all_sents:
    print(nltk.regexp_tokenize(sent, pattern))

['@', 'uday', 'can', "'", 't', 'wait', 'for', 'the', 'nlp', 'notes', 'YAAAAAAY', 'deeplearning', 'https', ':', 'udai', '.', 'gitbook', '.', 'io', 'practical-ml']
['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']
['I', 'am', 'writing', 'NLP', 'basics', '.']


#### spaCy Tokenizer

In [15]:
# Load spaCy's large English pipeline (en_core_web_lg must be downloaded first).
nlp = spacy.load("en_core_web_lg")

# Run each demo sentence through the pipeline and print the text of its tokens.
for sent in all_sents:
    doc = nlp(sent)
    print([tok.text for tok in doc])

['@uday', 'ca', "n't", 'wait', 'for', 'the', '#', 'nlp', 'notes', 'YAAAAAAY', '!', '!', '!', '#', 'deeplearning', 'https://udai.gitbook.io/practical-ml/']
['That', 'U.S.A.', 'poster', '-', 'print', 'costs', '$', '12.40', '...']
['I', 'am', 'writing', 'NLP', 'basics', '.']


## Morphological Analysis

### Lemmatization

#### NLTK WordNet lemmatizer

In [21]:
# NLTK WordNet lemmatizer demo
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Lemmatize three related word forms, one result per line.
for word in ('running', 'runner', 'runners'):
    print(lemmatizer.lemmatize(word))

running
runner
runner


### Stemming

In [24]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Stem three related word forms, one result per line.
for word in ('running', 'runner', 'runners'):
    print(stemmer.stem(word))

run
runner
runner
