In [1]:
# Build a binary bag-of-words model: with binary=True each feature records
# whether a term is present in a document, not how many times it occurs.
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(binary=True)
corpus = [
    "Tessaract is good optical character recognition engine  ",
    "optical character recognition is significant",
]
# fit() learns the vocabulary from the corpus and gives every term a column index.
vect.fit(corpus)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [2]:
# vocabulary_ is the mapping learned by fit(): term -> column index.
vocab = vect.vocabulary_

In [3]:
# List the learned vocabulary alphabetically as "term:index". The number is the
# term's column position in the vectorized matrix, NOT how often the term occurs.
for term, column in sorted(vocab.items()):
    print("{}:{}".format(term, column))

character:0
engine:1
good:2
is:3
optical:4
recognition:5
significant:6
tessaract:7


In [4]:
# Vectorize a new sentence with the fitted CountVectorizer. Words that are not
# in the learned vocabulary (e.g. "illusion") have no column and are ignored.
sentence_vector = vect.transform(["This is a good optical illusion"])
print(sentence_vector.toarray())

[[0 0 1 1 1 0 0 0]]


In [5]:
# Vectorize the training corpus itself: one row per document, one column per term.
corpus_matrix = vect.transform(corpus)
print(corpus_matrix.toarray())

[[1 1 1 1 1 1 0 1]
 [1 0 0 1 1 1 1 0]]


### Cosine Similarity

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Vectorize two sentences with the fitted CountVectorizer and compare them.
# Each input is a 2D array holding a single row (one document), and the
# result is a 1x1 matrix containing their cosine similarity.
first_vector = vect.transform(
    ["Google Cloud Vision is a character recognition engine"]
).toarray()
second_vector = vect.transform(
    ["OCR is an optical character recognition engine"]
).toarray()
similarity = cosine_similarity(first_vector, second_vector)

In [8]:
# Show the 1x1 cosine-similarity matrix for the two sentences.
print(similarity)

[[0.89442719]]


### Additional tasks performed on own examples
### 1. Exploring TfidfVectorizer

In [9]:
# TfidfVectorizer converts a collection of raw documents into a matrix of TF-IDF (term frequency-inverse document frequency) features
from sklearn.feature_extraction.text import TfidfVectorizer 


In [10]:
# binary=False (the default) keeps the raw term counts as the tf part of tf-idf.
# binary=True would clip every non-zero count to 1 before weighting; either way
# the output is real-valued tf-idf weights, not 0/1 values.
vect1 = TfidfVectorizer(binary=False)
# Same two documents as in the CountVectorizer example, so the outputs are comparable.
corpus = [
    "Tessaract is good optical character recognition engine  ",
    "optical character recognition is significant",
]
vect1.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [11]:
# Term -> column index mapping learned by the TF-IDF vectorizer.
vocab1 = vect1.vocabulary_

In [12]:
# Print every vocabulary term with its column index, in alphabetical order.
for term in sorted(vocab1):
    print("{}:{}".format(term, vocab1[term]))

character:0
engine:1
good:2
is:3
optical:4
recognition:5
significant:6
tessaract:7


In [13]:
# TF-IDF weights for the corpus: one row per document, one column per term.
tfidf_matrix = vect1.transform(corpus)
print(tfidf_matrix.toarray())

[[0.3174044  0.44610081 0.44610081 0.3174044  0.3174044  0.3174044
  0.         0.44610081]
 [0.4090901  0.         0.         0.4090901  0.4090901  0.4090901
  0.57496187 0.        ]]


### The values above are TF-IDF weights; they differ from the CountVectorizer output, which (with `binary=True`) contains only 0/1 values

### 2. Exploring cosine similarity

In [14]:
# Compare two sentences using the fitted CountVectorizer. None of the words in
# the second sentence are in the learned vocabulary, so its vector is all zeros
# and the cosine similarity is 0.
similarity1 = cosine_similarity(
    vect.transform(["Good morning"]).toarray(),
    vect.transform(["Welcome to SNLP class"]).toarray(),
)
print(similarity1)

[[0.]]


### The two statements have no word in common (none of the words in the second statement are even in the fitted vocabulary, so its vector is all zeros), hence the similarity is 0

In [15]:
# Compare a passage with itself. Only the word "is" from this passage is in the
# fitted vocabulary, so the vector has a single non-zero entry; any non-zero
# vector compared with itself has a cosine similarity of 1.
passage = """ One of the first things required for natural language processing (NLP) tasks is a corpus. 
In linguistics and NLP, corpus (literally Latin for body) refers to a collection of texts."""
passage_vector = vect.transform([passage]).toarray()
similarity2 = cosine_similarity(passage_vector, passage_vector)
print(similarity2)

[[1.]]


### In the example above both statements are identical, hence the similarity is 1