In [None]:
import nltk
nltk.download('brown')

In [1]:
from nltk.corpus import brown # a corpus is a collection of texts; the Brown corpus groups its texts into categories
# nltk.corpus offers a lot of other corpora to choose from

In [2]:
brown.categories() # text categories (genres) available in the Brown corpus

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [3]:
brown.words?

[1;31mSignature:[0m [0mbrown[0m[1;33m.[0m[0mwords[0m[1;33m([0m[0mfileids[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0mcategories[0m[1;33m=[0m[1;32mNone[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
:return: the given file(s) as a list of words
    and punctuation symbols.
:rtype: list(str)
[1;31mFile:[0m      c:\users\91865\anaconda3\lib\site-packages\nltk\corpus\reader\tagged.py
[1;31mType:[0m      method


In [4]:
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [5]:
# sentences belonging to the "romance" category
data = brown.sents(categories=["romance"])
data

[['They', 'neither', 'liked', 'nor', 'disliked', 'the', 'Old', 'Man', '.'], ['To', 'them', 'he', 'could', 'have', 'been', 'the', 'broken', 'bell', 'in', 'the', 'church', 'tower', 'which', 'rang', 'before', 'and', 'after', 'Mass', ',', 'and', 'at', 'noon', ',', 'and', 'at', 'six', 'each', 'evening', '--', 'its', 'tone', ',', 'repetitive', ',', 'monotonous', ',', 'never', 'breaking', 'the', 'boredom', 'of', 'the', 'streets', '.'], ...]

In [6]:
len(data)

4431

In [7]:
" ".join(data[0]) # sentence at 0th position

'They neither liked nor disliked the Old Man .'

In [8]:
" ".join(data[45])

'But when he called for his withered , wrinkled sister Rose to care for him and the children , had he guessed that all he would remember of his woman was the memory of her climbing into that streetcar ? ?'

# Tokenization

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize
#nltk.download('punkt')

In [12]:
document = """ It was a very good movie. The cast was amazing and I liked the story.
I went to the movie hall to see it.
"""

sentence = "Code for Cause is too OP kunal@codeforcause.org"

In [13]:
sents = sent_tokenize(document)
print(sents)
len(sents)

[' It was a very good movie.', 'The cast was amazing and I liked the story.', 'I went to the movie hall to see it.']


3

In [14]:
words = word_tokenize(sentence) # also break down special characters
print(words)
print(len(words))

['Code', 'for', 'Cause', 'is', 'too', 'OP', 'kunal', '@', 'codeforcause.org']
9


# Stopword removal

In [15]:
from nltk.corpus import stopwords
#nltk.download('stopwords')

In [16]:
# English stop words: very common words that carry little meaning, so we drop them.
# Membership tests against this set are exact (case-sensitive) string matches.
sw = set(stopwords.words('english')) # a set gives fast `in` lookups
# sw

In [17]:
text = "I am not a very good cricket player".split()
print(text)

['I', 'am', 'not', 'a', 'very', 'good', 'cricket', 'player']


In [18]:
def remove_stoprwords(text, stopwords):
    """Return the tokens of ``text`` that are not stop words, in their original order.

    Args:
        text: iterable of tokens (e.g. a list of words).
        stopwords: container of tokens to discard; any object supporting ``in``.

    Returns:
        A new list holding every token of ``text`` that is absent from
        ``stopwords``. The comparison is an exact match, so it is
        case-sensitive ('I' is kept when only 'i' is a stop word).
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue  # stop word: skip it
        kept.append(token)
    return kept

In [19]:
useful_words = remove_stoprwords(text, sw)
useful_words

['I', 'good', 'cricket', 'player']

### Tokenization using regex

In [20]:
sent = "My email is kunal@codeforcause.org, please don't spam my inbox"

In [21]:
from nltk.tokenize import RegexpTokenizer

In [22]:
# Tokens are runs of letters plus '@' and '.', so an e-mail address stays a single token.
# Side effect: a '.' touching a word stays attached to it, and a lone '.' becomes its own token.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+') # '@' and '.' are needed to keep e-mail addresses intact
# NOTE(review): this tokenizes `sentence` (defined in the earlier tokenization cell), not the
# `sent` string defined just above -- confirm which one was intended.
useful = tokenizer.tokenize(sentence)
print(useful)

['Code', 'for', 'Cause', 'is', 'too', 'OP', 'kunal@codeforcause.org']


# Stemmers

In [23]:
from nltk.stem import SnowballStemmer, PorterStemmer, LancasterStemmer
# nltk provides us: Porter, Snowball, Lancaster stemmers

In [24]:
ps = PorterStemmer()

In [25]:
ps.stem('laughing')

'laugh'

In [26]:
# SnowballStemmer is multilingual: it supports other languages as well, not just English.

In [27]:
corpus = [
    'Dan Morgan told himself he would forget Ann Turner.',
    'Sometimes he woke up in the middle of the night thinking of Ann , and then could not get back to sleep .',
    'His plans and dreams had revolved around her so much and for so long that now he felt as if he had nothing .',
    'He found that if he was tired enough at night , he went to sleep simply because he was too exhausted to stay awake .'
]

# Vocabulary

In [57]:
from sklearn.feature_extraction.text import CountVectorizer

In [58]:
cv = CountVectorizer() # bag-of-words vectorizer: collects the unique words into one vocabulary and counts them per document

In [59]:
vc = cv.fit_transform(corpus)

In [60]:
# NOTE: this cell rebinds `vc`, so it cannot be re-run on its own -- after the first run
# `vc` is already an ndarray, which has no .toarray().
vc = vc.toarray() # convert the fit_transform result into a dense array
print(vc[0]) # count vector of the 1st sentence only
print(cv.vocabulary_) # word -> column index mapping, e.g. 'dan' is at index 9

[0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1]
{'dan': 9, 'morgan': 27, 'told': 47, 'himself': 21, 'he': 19, 'would': 54, 'forget': 15, 'ann': 1, 'turner': 49, 'sometimes': 39, 'woke': 53, 'up': 50, 'in': 24, 'the': 42, 'middle': 26, 'of': 33, 'night': 29, 'thinking': 44, 'and': 0, 'then': 43, 'could': 8, 'not': 30, 'get': 17, 'back': 6, 'to': 46, 'sleep': 37, 'his': 22, 'plans': 34, 'dreams': 10, 'had': 18, 'revolved': 35, 'around': 2, 'her': 20, 'so': 38, 'much': 28, 'for': 14, 'long': 25, 'that': 41, 'now': 32, 'felt': 13, 'as': 3, 'if': 23, 'nothing': 31, 'found': 16, 'was': 51, 'tired': 45, 'enough': 11, 'at': 4, 'went': 52, 'simply': 36, 'because': 7, 'too': 48, 'exhausted': 12, 'stay': 40, 'awake': 5}


In [61]:
print(len(cv.vocabulary_)) # 55 unique words

55


In [62]:
numbers = vc[2]
print(numbers)
print(len(numbers))

[2 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 2 2 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 1 0
 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
55


In [63]:
len(vc[1]) # len is always 55

55

In [64]:
def myTokenizer(document):
    """Lower-case ``document``, split it into tokens and drop the stop words.

    Relies on names defined in earlier cells: the regex ``tokenizer``,
    the stop-word set ``sw`` and the ``remove_stoprwords`` helper.

    Args:
        document: the text to tokenize (a string).

    Returns:
        The list of tokens left after stop-word removal.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)
    return remove_stoprwords(tokens, sw)

In [65]:
myTokenizer('this is a random text')

['random', 'text']

In [66]:
cv = CountVectorizer(tokenizer=myTokenizer) # use our custom tokenizer (regex split + stop-word removal) instead of the default one

In [67]:
vc = cv.fit_transform(corpus).toarray()
print(vc)

[[0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1]
 [1 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0]
 [1 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 1 0 0]]


In [68]:
len(vc[0])

33

In [69]:
cv.vocabulary_

{'dan': 6,
 'morgan': 16,
 'told': 28,
 'would': 32,
 'forget': 11,
 'ann': 1,
 'turner.': 29,
 'sometimes': 24,
 'woke': 31,
 'middle': 15,
 'night': 18,
 'thinking': 26,
 'could': 5,
 'get': 13,
 'back': 4,
 'sleep': 23,
 '.': 0,
 'plans': 20,
 'dreams': 7,
 'revolved': 21,
 'around': 2,
 'much': 17,
 'long': 14,
 'felt': 10,
 'nothing': 19,
 'found': 12,
 'tired': 27,
 'enough': 8,
 'went': 30,
 'simply': 22,
 'exhausted': 9,
 'stay': 25,
 'awake': 3}

In [74]:
# transform() reuses the vocabulary learned by the last fit; fit_transform() relearns it from its input.
# NOTE(review): the recorded output (5) and the 5-word vocabulary shown in the next cell match a
# fit_transform([sent]) run, not this transform() call on the 33-word vocabulary -- re-run from a
# fresh kernel to confirm.
len(cv.transform([sent]).toarray()[0]) # see the difference between fit_transform and transform

5

In [75]:
cv.vocabulary_

{'email': 0, 'kunal@codeforcause.org': 2, 'please': 3, 'spam': 4, 'inbox': 1}