# NLTK 

In [1]:
from nltk.corpus import brown

## DATA COLLECTION

In [2]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [3]:
data=brown.sents(categories='editorial')#sents gives the document
print(type(data),len(data))

<class 'nltk.corpus.reader.util.ConcatenatedCorpusView'> 2997


In [4]:
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


In [5]:
print(len(data))

2997


# NLP PIPELINE


In [6]:
# DATA COLLECTION
#TOKENISATION,STOPWARDS REMOVAL,STEMMING
#BUILDING A VOCAB
#VECTORISE THE DOCUMENTS
#PERFORMING CLASSIFICATION/CLUSTERING

# 2.TOKENISATION

In [7]:
text="A very plesant day ,the weather was very cool. I went to market to buy some chocolates"

In [8]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [9]:
sents=sent_tokenize(text)

In [10]:
print(sents)

['A very plesant day ,the weather was very cool.', 'I went to market to buy some chocolates']


In [11]:
word_list=word_tokenize(sents[0])

In [12]:
print(word_list)

['A', 'very', 'plesant', 'day', ',', 'the', 'weather', 'was', 'very', 'cool', '.']


In [13]:
word_list=word_tokenize(sents[0].upper())
print(word_list)

['A', 'VERY', 'PLESANT', 'DAY', ',', 'THE', 'WEATHER', 'WAS', 'VERY', 'COOL', '.']


# STOPWORDS REMOVAL

In [14]:
from nltk.corpus import stopwords

In [15]:
sw=set(stopwords.words('english'))

In [16]:
print(sw)

{'few', 'now', 'haven', 'with', "you've", 'into', "hadn't", 'through', 'over', "you'd", 'again', 'being', 'this', 'be', 'themselves', 'yourselves', 'if', 'y', "didn't", 'any', 'off', 'mightn', 'shan', 'for', 'about', 'he', "shan't", 'when', 'while', 'isn', 'himself', 'same', "haven't", 'its', 'have', 'having', 'those', 'both', 'hasn', 'or', 'yours', 'couldn', 'because', 'needn', "don't", 'd', 'had', 'should', 'which', 'wouldn', 'itself', 'once', 'of', 'here', 'you', 'under', 'i', 'as', 'your', 'ain', 'no', "you'll", 'it', 'don', "weren't", 'been', 'above', 'is', 'up', 'will', 'do', 'whom', 'down', "doesn't", 's', 'where', 'who', 'just', 'were', "she's", "needn't", 'o', 'these', 'before', 'some', "should've", 'further', 'other', 'weren', 'out', 'that', 'after', 'below', 'only', 'until', 'very', 'an', "won't", 'mustn', "wasn't", 'too', 'but', 'at', 'to', 'how', 'most', 'theirs', 'am', 'in', 'shouldn', 'the', 'ma', 'aren', 'our', "mustn't", 'all', 'there', 'than', 'nor', 'so', 'their', 'm

In [17]:
print(len(sw))

179


# FILTER THE WORDS

In [18]:
def filter(word_list):
    usefl_wrd=[w for w in word_list if w not in sw]
    return usefl_wrd

In [19]:
usefl_wrd=filter(word_list)
print(usefl_wrd)

['A', 'VERY', 'PLESANT', 'DAY', ',', 'THE', 'WEATHER', 'WAS', 'VERY', 'COOL', '.']


In [20]:
from nltk.tokenize import RegexpTokenizer

In [21]:
tokenizer=RegexpTokenizer("[,.]")

In [22]:
sents="send the 50 documents to abc,def,xyz."
print(tokenizer.tokenize(sents))

[',', ',', '.']


In [23]:
tokenizer=RegexpTokenizer("[a-z0-9]+")

In [24]:
sents="send the 50 documents to abc,def,xyz."
print(tokenizer.tokenize(sents))

['send', 'the', '50', 'documents', 'to', 'abc', 'def', 'xyz']


# STEMMING

In [25]:
# PROCESS THAT TRANSFORM PARTICULAR WORDS INTO ROOT WORDS
#JUMPING,JUMP,JUMPED=>jump
#WAITING, WAITED, WAIT=>wait

In [26]:
text="the quick brown fox was seen jumping over the lazy dog from high wall.Foxes love to jump "

In [27]:
word_list= tokenizer.tokenize(text.lower())
print(word_list)

['the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lazy', 'dog', 'from', 'high', 'wall', 'foxes', 'love', 'to', 'jump']


## types of stemmers

In [28]:
#snowball stemmer multilingual
#porter stemmer
#lancaster stemmer

In [29]:
from nltk.stem.snowball import PorterStemmer,SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [30]:
ps=PorterStemmer()

In [31]:
ps.stem("jumped")

'jump'

In [32]:
ps.stem("waiting")

'wait'

In [33]:
ps.stem("awesome")

'awesom'

In [34]:
ps.stem("prernna")

'prernna'

In [35]:
ls=LancasterStemmer()
ls.stem("awesome")
print(ls.stem("teenager"))
print(ps.stem("teenager"))

teen
teenag


In [36]:
ss=SnowballStemmer('english')
ss.stem('waiting')

'wait'

In [37]:
ss=SnowballStemmer('french')
ss.stem('waiting')

'waiting'

# BAG OF WORDS

In [38]:
corpus=['Indian cricket team will win world cup says captain Virat,world cup will be held at india in next year.',
       'We will win next lokh sabha election ,says Indian PM',
       'The novel Rabindranath tagore won the heart of people',
        'The movie Razzi is an exciting Thriller based upon real story'
       ]

In [39]:
print(corpus)

['Indian cricket team will win world cup says captain Virat,world cup will be held at india in next year.', 'We will win next lokh sabha election ,says Indian PM', 'The novel Rabindranath tagore won the heart of people', 'The movie Razzi is an exciting Thriller based upon real story']


In [41]:
def myTokenizer(sentence):
    words=tokenizer.tokenize(sentence.lower())
    return filter(words)
list_words=myTokenizer(corpus[0])
print(len(list_words))

15


In [42]:
print(myTokenizer(corpus[0]))

['indian', 'cricket', 'team', 'win', 'world', 'cup', 'says', 'captain', 'virat', 'world', 'cup', 'held', 'india', 'next', 'year']


In [44]:
from sklearn.feature_extraction.text import CountVectorizer

In [45]:
cv=CountVectorizer(tokenizer=myTokenizer)

In [50]:
vectorized_corpus=cv.fit_transform(corpus)

In [51]:
print(vectorized_corpus)

  (0, 9)	1
  (0, 2)	1
  (0, 23)	1
  (0, 27)	1
  (0, 28)	2
  (0, 3)	2
  (0, 20)	1
  (0, 1)	1
  (0, 26)	1
  (0, 7)	1
  (0, 8)	1
  (0, 12)	1
  (0, 29)	1
  (1, 9)	1
  (1, 27)	1
  (1, 20)	1
  (1, 12)	1
  (1, 10)	1
  (1, 19)	1
  (1, 4)	1
  (1, 15)	1
  (2, 13)	1
  (2, 16)	1
  (2, 22)	1
  (2, 6)	1
  (2, 14)	1
  (3, 11)	1
  (3, 17)	1
  (3, 5)	1
  (3, 24)	1
  (3, 0)	1
  (3, 25)	1
  (3, 18)	1
  (3, 21)	1


In [52]:
vc=vectorized_corpus.toarray()

In [57]:
print(vc[0])
print(cv.vocabulary_)

[0 1 1 2 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 2 1]
{'indian': 9, 'cricket': 2, 'team': 23, 'win': 27, 'world': 28, 'cup': 3, 'says': 20, 'captain': 1, 'virat': 26, 'held': 7, 'india': 8, 'next': 12, 'year': 29, 'lokh': 10, 'sabha': 19, 'election': 4, 'pm': 15, 'novel': 13, 'rabindranath': 16, 'tagore': 22, 'heart': 6, 'people': 14, 'movie': 11, 'razzi': 17, 'exciting': 5, 'thriller': 24, 'based': 0, 'upon': 25, 'real': 18, 'story': 21}


In [58]:
cv.inverse_transform(vc[0])

[array(['captain', 'cricket', 'cup', 'held', 'india', 'indian', 'next',
        'says', 'team', 'virat', 'win', 'world', 'year'], dtype='<U12')]