# My endeavors to learn nltk as part of the minor project I will be working on 
### corpora - bodies of text (singular: corpus), e.g. medical journals, presidential speeches, etc.
### lexicon - words and their meanings

### NLTK also works with languages other than English!

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# Short sample paragraph; "Mr." contains a period that is not a sentence end
example_text = (
    "Hello Mr. Smith. "
    "It is a good day. "
    "I am pleased to meet you. "
    "I hope happiness comes your way."
)

In [3]:
# Split the example text into a list of sentences and print it
print(sent_tokenize(example_text))

['Hello Mr. Smith.', 'It is a good day.', 'I am pleased to meet you.', 'I hope happiness comes your way.']


In [4]:
# Split the example text into word and punctuation tokens and print the list
print(word_tokenize(example_text))

['Hello', 'Mr.', 'Smith', '.', 'It', 'is', 'a', 'good', 'day', '.', 'I', 'am', 'pleased', 'to', 'meet', 'you', '.', 'I', 'hope', 'happiness', 'comes', 'your', 'way', '.']


In [5]:
# Print every token of the example text on its own line
tokens = word_tokenize(example_text)
for token in tokens:
    print(token)


Hello
Mr.
Smith
.
It
is
a
good
day
.
I
am
pleased
to
meet
you
.
I
hope
happiness
comes
your
way
.


## Stop words: words that we don't care about and don't need in our analysis, such as "a", "the" and "and", as well as words that create general confusion, like sarcastic words

In [6]:
from nltk.corpus import stopwords

In [7]:
# Sample text used to demonstrate stop-word removal
example_sentence = (
    "This is an example sentence for to check how does the stopwords work. "
    "Mind you nltk already has a library of english stopwords. "
    "You need not provide one."
)

In [8]:
stop_words = set(stopwords.words("english"))
# Loads the English stopword list and stores it in a set for membership tests

In [9]:
print(stop_words)
# This is the set of predefined English stopwords loaded from the library

{'ve', 'does', 'very', 'further', 'will', 'after', 'had', 'only', 'm', "aren't", 'this', 'in', 'so', 'few', 'but', 'during', 'she', 'aren', 'has', 'with', 'about', 'they', "don't", 'here', 'until', 'all', 'on', 'for', 'did', "weren't", 'from', 'which', 'before', 'any', 's', 'off', 'too', "won't", 'by', 'itself', 'it', 'am', 'how', 'i', 'both', 'isn', 'them', "shouldn't", 'nor', 'myself', 'shouldn', 'these', 'most', 'mightn', 'we', 'the', 'her', 'have', "you've", 'into', 'as', 'mustn', "that'll", 'ours', "hadn't", "hasn't", 'over', 'd', 'weren', 'at', 'than', 'me', 'up', 'above', "didn't", 'ourselves', 'shan', 'again', 'been', 'below', 'can', 'him', 'are', 'to', 'your', 'there', 'when', 'or', 'more', "mightn't", 'herself', 'was', "wouldn't", 'were', 'those', 'own', 'is', 'between', 'yourself', 'just', 'who', 'what', 'and', 'hasn', "you're", 'while', 'against', 'you', 'themselves', 'being', 'then', 'why', "she's", 'an', 'if', "needn't", 'doing', "wasn't", 'of', 'doesn', 'wouldn', 'not', 

In [10]:
# Tokenize the example sentence into words and punctuation
words = word_tokenize(example_sentence)

In [11]:
# Keep only the tokens that are not stopwords.
# NOTE: the membership test is case-sensitive, so capitalised tokens such as
# 'This' and 'You' are kept even though their lower-case forms are stopwords.
filtered_sentence = []
for token in words:
    if token in stop_words:
        continue
    filtered_sentence.append(token)
print(filtered_sentence)

['This', 'example', 'sentence', 'check', 'stopwords', 'work', '.', 'Mind', 'nltk', 'already', 'library', 'english', 'stopwords', '.', 'You', 'need', 'provide', 'one', '.']


In [12]:
# The same filter written as a one-line list comprehension
filtered_sentence = [token for token in words if token not in stop_words]

In [13]:
# Show the tokens that remain after stop-word removal
print(filtered_sentence)

['This', 'example', 'sentence', 'check', 'stopwords', 'work', '.', 'Mind', 'nltk', 'already', 'library', 'english', 'stopwords', '.', 'You', 'need', 'provide', 'one', '.']


# Stemming: "ride" and "riding" are treated as the same word
### An alternative is WordNet with its synsets, which is more frequently used

In [14]:
from nltk.stem import PorterStemmer

In [15]:
# Stem a handful of made-up variations of "python" with the Porter stemmer
ps = PorterStemmer()
example_words = [
    "python",
    "pythoner",
    "pythonly",
    "pythonista",
    "pythoning",
]
for word in example_words:
    stem = ps.stem(word)
    print(stem)

python
python
pythonli
pythonista
python


In [16]:
# Stem every token of a longer paragraph, one stem per output line
new_text = (
    "Aiming high is our duty, dreaming is also our duty. "
    "Endeavors to learn must not stop. "
    "Desire to grow must grow on. "
    "Community growth becomes important for the development of the nation."
)
words = word_tokenize(new_text)

for token in words:
    stem = ps.stem(token)
    print(stem)

aim
high
is
our
duti
,
dream
is
also
our
duti
.
endeavor
to
learn
must
not
stop
.
desir
to
grow
must
grow
on
.
commun
growth
becom
import
for
the
develop
of
the
nation
.


# Part-of-Speech Tagging

In [None]:
from nltk.corpus import state_union
# State of the Union addresses by various presidents over the past 60 yrs
from nltk.tokenize import PunktSentenceTokenizer
# An unsupervised ML sentence tokenizer; it comes pre-trained, but we can always retrain it

# Corpus files are addressed by their full file id (list them with
# state_union.fileids()); the bare year "2006" does not name a file.
sample_test = state_union.raw("2006-GWBush.txt")

In [19]:

# Dummy text (note: there is no sentence break between "year" and "Marriage")
txt = "Sukanya, Rajib and Naba are my good friends. Sukanya is getting married next yearMarriage is a big step in one’s life. It is both exciting and frightening.   But friendship is a sacred bond between people.  It is a special kind of love between us.  Many of you must have tried searching for a friend   but never found the right one."

# sent_tokenize uses an instance of PunktSentenceTokenizer
# from the nltk.tokenize.punkt module
tokenized = sent_tokenize(txt)

for sentence in tokenized:
    # Find the words and punctuation in the sentence and drop the stop words
    # in one pass (the stop-word check is case-sensitive)
    wordsList = [
        token
        for token in nltk.word_tokenize(sentence)
        if token not in stop_words
    ]

    # Run the part-of-speech (POS) tagger over the remaining tokens
    tagged = nltk.pos_tag(wordsList)

    print(tagged)

[('Sukanya', 'NNP'), (',', ','), ('Rajib', 'NNP'), ('Naba', 'NNP'), ('good', 'JJ'), ('friends', 'NNS'), ('.', '.')]
[('Sukanya', 'NNP'), ('getting', 'VBG'), ('married', 'VBD'), ('next', 'JJ'), ('yearMarriage', 'NN'), ('big', 'JJ'), ('step', 'NN'), ('one', 'CD'), ('’', 'NNP'), ('life', 'NN'), ('.', '.')]
[('It', 'PRP'), ('exciting', 'VBG'), ('frightening', 'VBG'), ('.', '.')]
[('But', 'CC'), ('friendship', 'NN'), ('sacred', 'VBD'), ('bond', 'NN'), ('people', 'NNS'), ('.', '.')]
[('It', 'PRP'), ('special', 'JJ'), ('kind', 'NN'), ('love', 'VB'), ('us', 'PRP'), ('.', '.')]
[('Many', 'JJ'), ('must', 'MD'), ('tried', 'VB'), ('searching', 'VBG'), ('friend', 'NN'), ('never', 'RB'), ('found', 'VBD'), ('right', 'JJ'), ('one', 'CD'), ('.', '.')]


## Chunking

