In [10]:
# from normalization import normalize_corpus
# from utils import build_feature_matrix
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
# Toy corpus: a small collection of documents used to explore the ideas.
toy_corpus = [
    'The sky is blue',
    'The sky is blue and beautiful',
    'Look at the bright blue sky!',
    'Python is a great Programming language',
    'Python and Java are popular Programming languages',
    'Among Programming languages, both Python and Java are the most used in Analytics',
    'The fox is quicker than the lazy dog',
    'The dog is smarter than the fox',
    'The dog, fox and cat are good friends',
]

# Query documents whose similarity to the corpus documents will be measured.
query_docs = [
    'The fox is definitely smarter than the dog',
    'Java is a static typed programming language unlike Python',
    'I love to relax under the beautiful blue sky!',
]

In [6]:
def normalize_corpus(corpus, lemmatize=True, only_text_chars=False, tokenize=False):
    """Return a new list holding the lower-cased version of every document.

    Args:
        corpus: iterable of strings (one string per document).
        lemmatize: accepted for call compatibility; not used by this body.
        only_text_chars: accepted for call compatibility; not used by this body.
        tokenize: accepted for call compatibility; not used by this body.

    Returns:
        list of str: the documents in their original order, lower-cased.
    """
    # Lower-casing is the only normalization step performed here.
    return [document.lower() for document in corpus]

In [11]:
# We improve our feature matrix builder with 3 additional optional parameters
# This allows us to extract not only word features, but also n-gram features
# We can also set the minimum and maximum frequencies to be considered as valid
# NB: All these are simply passed on to sklearn's Vectorizer classes
def build_feature_matrix(documents, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    """Fit a bag-of-words style vectorizer on ``documents`` and vectorize them.

    Args:
        documents: iterable of strings to fit the vectorizer on.
        feature_type: one of 'binary', 'frequency' or 'tfidf'; matched
            case-insensitively and with surrounding whitespace ignored.
        ngram_range: passed through to the vectorizer as ``ngram_range``.
        min_df: passed through to the vectorizer as ``min_df``.
        max_df: passed through to the vectorizer as ``max_df``.

    Returns:
        tuple: ``(vectorizer, feature_matrix)`` where ``vectorizer`` is the
        fitted CountVectorizer/TfidfVectorizer and ``feature_matrix`` is the
        result of ``fit_transform(documents)`` cast to float.

    Raises:
        ValueError: if ``feature_type`` is not one of the supported values.
            (ValueError is a subclass of Exception, so callers that caught
            the previous generic Exception keep working.)
    """
    feature_type = feature_type.lower().strip()

    # Validate first so a bad argument fails before any vectorizer is built.
    if feature_type not in ('binary', 'frequency', 'tfidf'):
        raise ValueError("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    # Options common to every vectorizer variant.
    shared_options = dict(min_df=min_df, max_df=max_df, ngram_range=ngram_range)

    if feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(**shared_options)
    else:
        # 'binary' records presence/absence, 'frequency' records raw counts.
        vectorizer = CountVectorizer(binary=(feature_type == 'binary'),
                                     **shared_options)

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix

In [19]:
def _row_l2_norms(matrix):
    """Return the Euclidean norm of every row of a sparse or dense 2-D matrix."""
    if hasattr(matrix, 'multiply'):
        # scipy sparse matrix: square element-wise without densifying it.
        squared_sums = matrix.multiply(matrix).sum(axis=1)
    else:
        dense = np.asarray(matrix, dtype=float)
        squared_sums = (dense * dense).sum(axis=1)
    return np.sqrt(np.asarray(squared_sums, dtype=float)).ravel()


def compute_cosine_similarity(doc_features, corpus_features,
                              top_n=3):
    """Rank corpus documents by cosine similarity to a query document.

    The dot product between the query and each corpus row is divided by the
    product of their Euclidean norms, so the score is a true cosine
    similarity even when the feature vectors are not already unit length.
    For rows that are already L2-normalised the division is by (about) 1 and
    the score equals the plain dot product.

    Args:
        doc_features: 2-D feature matrix (scipy sparse or array-like) whose
            first row is the query document; only row 0 is used.
        corpus_features: 2-D feature matrix (scipy sparse or array-like) with
            one row per corpus document and the same number of columns as
            ``doc_features``.
        top_n: number of best-matching documents to return.

    Returns:
        list of ``(row_index, score)`` tuples ordered from most to least
        similar, with ``score`` rounded to 3 decimals. A zero query vector or
        a zero corpus row yields a score of 0.0 instead of a division error.
    """
    # Extract the query row as a flat dense vector.
    query = doc_features[0]
    if hasattr(query, 'toarray'):
        query = query.toarray()
    query = np.asarray(query, dtype=float).ravel()

    if not hasattr(corpus_features, 'dot'):
        corpus_features = np.asarray(corpus_features, dtype=float)

    # Use the matrix's own .dot(): np.dot is not aware of scipy sparse types.
    dot_products = np.asarray(corpus_features.dot(query), dtype=float).ravel()

    # Normalise by the vector lengths; entries with a zero denominator stay 0.
    denominators = _row_l2_norms(corpus_features) * np.linalg.norm(query)
    similarity = np.divide(dot_products, denominators,
                           out=np.zeros_like(dot_products),
                           where=denominators > 0)

    # Get docs with highest similarity scores
    top_docs = similarity.argsort()[::-1][:top_n]
    return [(index, round(similarity[index], 3)) for index in top_docs]


In [7]:
# Normalize the toy corpus; the notebook's normalize_corpus only lower-cases
# each document and does not use the lemmatize flag.
norm_corpus = normalize_corpus(toy_corpus, lemmatize=False)

In [8]:
# Display the normalized corpus to check the result of the normalization step.
norm_corpus

['the sky is blue',
 'the sky is blue and beautiful',
 'look at the bright blue sky!',
 'python is a great programming language',
 'python and java are popular programming languages',
 'among programming languages, both python and java are the most used in analytics',
 'the fox is quicker than the lazy dog',
 'the dog is smarter than the fox',
 'the dog, fox and cat are good friends']

In [12]:
# Fit a unigram TF-IDF vectorizer on the normalized corpus and keep both the
# fitted vectorizer and the resulting document feature matrix.
tfidf_vectorizer, tfidf_features = build_feature_matrix(
    norm_corpus,
    feature_type='tfidf',
    ngram_range=(1, 1),
    min_df=0.0,
    max_df=1.0,
)

In [16]:
# Shape of the fitted feature matrix (one row per corpus document).
tfidf_features.shape

(9, 32)

In [20]:
# Normalize the query documents with the same settings used for the corpus
# (lemmatize=False) so that both sides are preprocessed identically.
norm_query_docs = normalize_corpus(query_docs, lemmatize=False)
# Reuse the vectorizer fitted on the corpus so the query vectors share the
# corpus vocabulary and feature columns.
query_docs_tfidf = tfidf_vectorizer.transform(norm_query_docs)

# For every query document, report the two most similar corpus documents.
for index, doc in enumerate(query_docs):

    doc_tfidf = query_docs_tfidf[index]
    top_similar_docs = compute_cosine_similarity(doc_tfidf,
                                                 tfidf_features,
                                                 top_n=2)
    print('Document', index + 1, ':', doc)
    print('Top', len(top_similar_docs), 'similar docs:')
    print('-' * 40)
    for doc_index, sim_score in top_similar_docs:
        # Show 1-based document numbers together with the original
        # (un-normalized) corpus text.
        print('Doc num: {} Similarity Score: {}\nDoc: {}'.format(doc_index + 1,
                                                                 sim_score,
                                                                 toy_corpus[doc_index]))
        print('-' * 40)
    print()

Document 1 : The fox is definitely smarter than the dog
Top 2 similar docs:
----------------------------------------
Doc num: 8 Similarity Score: 1.0
Doc: The dog is smarter than the fox
----------------------------------------
Doc num: 7 Similarity Score: 0.671
Doc: The fox is quicker than the lazy dog
----------------------------------------

Document 2 : Java is a static typed programming language unlike Python
Top 2 similar docs:
----------------------------------------
Doc num: 4 Similarity Score: 0.739
Doc: Python is a great Programming language
----------------------------------------
Doc num: 5 Similarity Score: 0.48
Doc: Python and Java are popular Programming languages
----------------------------------------

Document 3 : I love to relax under the beautiful blue sky!
Top 2 similar docs:
----------------------------------------
Doc num: 2 Similarity Score: 0.867
Doc: The sky is blue and beautiful
----------------------------------------
Doc num: 1 Similarity Score: 0.67
Doc: 