## Introduction to Natural Language Processing

In [1]:
#Corpus is a collection of text

from nltk.corpus import brown

In [2]:
print(brown.categories())
print(len(brown.categories()))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [3]:
data = brown.sents(categories = 'fiction') # fetch the sentences of one particular category
# sents() returns the list of sentences in the given category; each sentence is itself a list of words

In [4]:
' '.join(data[1]) #It joins all the words in a list

'Scotty did not go back to school .'

## Bag of Words Pipeline
- Get the Data/Corpus
- Tokenisation, Stopword Removal
- Stemming / Lemmatisation
- Building a Vocab
- Vectorization
- Classification

It is used to convert text into numbers so that it can be fed to a classifier

### Tokenisation & Stopword Removal

In [5]:
document = """It was a very plesant day. The weather was cool and there were light showers.
I went to the market to buy some fruits."""

sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [6]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [7]:
sents = sent_tokenize(document)
print(sents)
print(len(sents))

['It was a very plesant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']
3


In [8]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek@cb.com']

In [9]:
words = word_tokenize(sentence)

In [10]:
words

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek',
 '@',
 'cb.com']

### Stopwords

In [11]:
from nltk.corpus import stopwords

sw = set(stopwords.words('english')) ## English stopword list, stored as a set for membership tests

In [12]:
print(sw)

{'when', 'those', "shan't", 'both', "hadn't", 'against', "you'd", 'he', 'these', 'it', 'isn', 'be', 'was', 'about', 'doesn', 'through', 'because', 're', "isn't", 'our', 'having', 'own', 'ain', "it's", 'over', 'should', "couldn't", 'myself', "you've", 'if', "should've", 'on', 'she', 'themselves', "aren't", "hasn't", 'can', "mightn't", 's', 'didn', 'any', 'had', 'himself', 'above', 'y', 'we', 'herself', 'some', 'only', 'than', 'between', 'm', 'the', 'wasn', "weren't", 'and', 'into', 'needn', 'its', 'this', 'yourself', 'at', 'mightn', 'couldn', 'ourselves', 'where', 'does', 'do', 'for', 'am', 'just', 'with', 'they', 'will', 'their', 'yours', 'so', 'are', 'of', 'few', 'which', 'what', 'll', 'ma', 'wouldn', 'weren', 'you', 'after', "that'll", 'from', 'not', 'while', 'is', 'there', 'nor', "shouldn't", 'or', 'out', 'shan', "she's", 'in', 'under', "mustn't", 'whom', 'again', 'why', 'o', 'been', 'up', 'shouldn', 'a', 'my', 'who', 'during', 'theirs', 'below', 'most', 't', 'further', 'once', 'by'

In [13]:
def remove_stopwords(text,stopwords):
    """Return the items of `text` that are not in `stopwords`, in their original order.

    `text` is iterated item by item (pass a list of words, not a raw string);
    `stopwords` is any container supporting the `in` operator.
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [14]:
text = "i am not bothered about her very much".split() # Pass a list of words: a plain string would be iterated character by character
useful_text = remove_stopwords(text,sw)
print(useful_text)

['bothered', 'much']


In [15]:
'not' in sw

True

### Tokenisation using Regular Expression

In [16]:
sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [19]:
from nltk.tokenize import RegexpTokenizer

In [22]:
tokenizer = RegexpTokenizer('[a-zA-Z@.]+') # a token is any run of letters, '@' or '.'; digits and commas are not matched
useful_text = tokenizer.tokenize(sentence) ## Split the sentence into the substrings that match the pattern given to RegexpTokenizer

In [23]:
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'prateek@cb.com']

### Stemming
- Process that transforms particular words(verbs, plurals) into their radical form
- Preserve the semantics of the sentence without increasing the number of unique tokens
- Example - jumps, jumping, jumped, jump ==> jump

In [24]:
text = """Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6ft feet high wall"""

##### Stemmer in NLTK
- Snowball Stemmer - Multilingual Stemmer (supports several languages besides English)
- Porter Stemmer
- Lancaster Stemmer

In [26]:
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [27]:
ps =  PorterStemmer()

In [28]:
ps.stem('jumping')

'jump'

In [29]:
ps.stem('jump')

'jump'

In [30]:
ps.stem('lovely')

'love'

In [31]:
ps.stem('love')

'love'

In [32]:
#SnowballStemmer
ss = SnowballStemmer('english')

In [33]:
ss.stem('lovely')

'love'

In [35]:
ss.stem('jumping')

'jump'

In [36]:
#Lemmatisation
from nltk.stem import WordNetLemmatizer #Works almost like stemming

In [37]:
wn = WordNetLemmatizer()
# Per the NLTK docs, lemmatize() treats the word as a noun unless a part of speech is given
# (pos defaults to 'n'), which is why 'jumping' comes back unchanged here.
# Passing pos='v' (verb) is the way to reduce it to 'jump'.
wn.lemmatize('jumping')

'jumping'

### Building a Vocab & Vectorization 

In [85]:
corpus = [
    'Indian cricket team will win World Cup, says Capt. Virat Kholi, World cup will be held at Sri Lanka',
    'We will win next Lok Sabha Election, says confidant Indian PM',
    'The nobel laurate won the hearts of the people',
    'The movie Raazi is an exciting Indian spy thriller based upon a real story'
]

In [97]:
from sklearn.feature_extraction.text import CountVectorizer

In [87]:
cv = CountVectorizer()

In [88]:
vectorized_corpus =cv.fit_transform(corpus)

In [89]:
vectorized_corpus = vectorized_corpus.toarray()

In [90]:
vectorized_corpus[0]
len(vectorized_corpus[0])

41

In [69]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'win': 38, 'world': 40, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kholi': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'next': 19, 'lok': 17, 'sabha': 26, 'election': 8, 'confidant': 5, 'pmthe': 23, 'nobel': 20, 'laurate': 16, 'won': 39, 'the': 32, 'hearts': 10, 'of': 21, 'peoplethe': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [70]:
len(cv.vocabulary_.keys())

41

In [92]:
#Reverse Mapping

numbers = vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0],
      dtype=int64)

In [95]:
# inverse_transform expects a 2-D (n_samples, n_features) input, so the single
# count vector is reshaped into one row; recent scikit-learn releases reject a
# bare 1-D array. The result is a list with one array of terms per row.
s = cv.inverse_transform(numbers.reshape(1, -1))
print(s)

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9')]


### Vectorisation with Stopword Removal

In [99]:
def myTokenizer(document):
    """Lower-case `document`, tokenise it, and drop stopwords.

    Relies on the notebook-level `tokenizer` (a RegexpTokenizer) and the
    stopword collection `sw` defined in earlier cells.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)

    # Filter the tokens against the stopword collection
    return remove_stopwords(tokens, sw)

In [100]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [102]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [103]:
print(vectorized_corpus)

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0]]


In [104]:
print(len(vectorized_corpus[0]))

32


In [105]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kholi', 'lanka',
        'says', 'sri', 'team', 'virat', 'win', 'world'], dtype='<U9'),
 array(['confidant', 'election', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story', 'thriller', 'upon'], dtype='<U9')]

In [106]:
# For test data

test_corpus = [
    'Indian cricket rocks!'
]

In [108]:
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [109]:
cv.vocabulary_

{'indian': 9,
 'cricket': 3,
 'team': 26,
 'win': 30,
 'world': 31,
 'cup': 4,
 'says': 22,
 'capt.': 1,
 'virat': 29,
 'kholi': 10,
 'held': 8,
 'sri': 24,
 'lanka': 11,
 'next': 15,
 'lok': 13,
 'sabha': 21,
 'election': 5,
 'confidant': 2,
 'pm': 18,
 'nobel': 16,
 'laurate': 12,
 'hearts': 7,
 'people': 17,
 'movie': 14,
 'raazi': 19,
 'exciting': 6,
 'spy': 23,
 'thriller': 27,
 'based': 0,
 'upon': 28,
 'real': 20,
 'story': 25}

### More Ways to Create Features
- Unigram - takes every word as a feature(so far we did this)
- Bigrams - take two consecutive words as a single feature (e.g. "not good")
- Trigrams
- n-grams
- TF-IDF Normalisation

In [117]:
sent_1 = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]
sent_3 = ["this is not good movie"]

In [121]:
cv = CountVectorizer(ngram_range=(1,3))

**ngram_range**
- (1,1) - Unigram
- (2,2) - Bigram
- (3,3) - Trigram
- (1,3) - Combination of unigrams, bigrams and trigrams

In [122]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]], dtype=int64)

In [123]:
cv.vocabulary_

{'this': 20,
 'is': 9,
 'good': 6,
 'movie': 14,
 'this is': 21,
 'is good': 10,
 'good movie': 7,
 'this is good': 22,
 'is good movie': 11,
 'but': 3,
 'actor': 0,
 'not': 17,
 'present': 19,
 'movie but': 15,
 'but actor': 4,
 'actor is': 1,
 'is not': 12,
 'not present': 18,
 'good movie but': 8,
 'movie but actor': 16,
 'but actor is': 5,
 'actor is not': 2,
 'is not present': 13}

###  Tf - idf Normalisation
- Avoid features that occur very often, because they carry less information
- Information decreases as the number of occurrences increases across different types of documents
- So we define another quantity - tf-idf (term frequency - inverse document frequency) - which associates a weight with every term

**Tf** - term frequency - how many times a word occurs in a document

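**idf** - inverse document frequency - down-weights terms that are present in many documents:
$$\mathrm{idf}(t) = \log\frac{N}{\mathrm{df}(t)}$$
where $N$ is the number of documents and $\mathrm{df}(t)$ is the number of documents that contain term $t$ (scikit-learn's `TfidfVectorizer` uses a smoothed variant of this by default).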

**tf.idf** - the product of the two; it tells how good a word is for identifying a document

In [124]:
sent_1 = "this is good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"

corpus = [sent_1,sent_2,sent_3]

In [125]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [126]:
tfidf = TfidfVectorizer()

In [129]:
vc = tfidf.fit_transform(corpus)

In [130]:
print(vc)

  (0, 4)	0.4633342717458061
  (0, 1)	0.5966272352795762
  (0, 0)	0.4633342717458061
  (0, 2)	0.4633342717458061
  (1, 4)	0.4128585720620119
  (1, 0)	0.4128585720620119
  (1, 2)	0.4128585720620119
  (1, 5)	0.6990303272568005
  (2, 4)	0.3645443967613799
  (2, 1)	0.4694172843223779
  (2, 0)	0.3645443967613799
  (2, 2)	0.3645443967613799
  (2, 3)	0.6172273175654565


In [132]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}