# Document similarity using a document-term matrix (DTM), TF-IDF and cosine similarity


In [1]:
import numpy as np
import pandas as pd

from nltk import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
import re

from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# import plotting libraries
from mpl_toolkits.mplot3d.axes3d import Axes3D

import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
%matplotlib inline 

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

from sklearn.preprocessing import normalize

In [2]:
# Small example corpus: four short text documents that are vectorized and
# compared with cosine similarity in the cells that follow.
documents = [
    "Machine learning is great coding love coding is tennis tennis great",
    "MachineS learning is great coding love coding",
    "Soccer is fun to watch",
    "Tennis has practice machine love playing tennis"
]

In [3]:
# Shared NLTK Porter stemmer instance; stemming_tokenizer() calls
# porter.stem() on every word it extracts.
porter = PorterStemmer()

In [4]:
# Use NLTK's PorterStemmer
def stemming_tokenizer(str_input):
    """Split a text string into lower-cased, Porter-stemmed tokens.

    Every character that is not an ASCII letter, a digit or a hyphen is
    replaced by a space, the result is lower-cased and split on whitespace,
    and each resulting word is passed through the module-level ``porter``
    stemmer.

    Args:
        str_input: The raw text of one document.

    Returns:
        A list with one stemmed token per word, in order of appearance.
    """
    # Blank out punctuation and other symbols; hyphens are kept on purpose
    # by the character class, so hyphenated words stay in one piece.
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    tokens = cleaned.lower().split()

    # Reduce each token to its Porter stem.
    return list(map(porter.stem, tokens))

## Using CountVectorizer

In [5]:
# Prepare the bag-of-words (BOW) model and the document-term matrix (DTM).
# Tokens come from stemming_tokenizer, and scikit-learn's built-in English
# stop-word list is applied.
# NOTE(review): the stop-word list is presumably matched against the already
# stemmed tokens, which would explain the stop_words warning this cell emits
# -- confirm against the scikit-learn docs.
count_vectorizer = CountVectorizer(tokenizer = stemming_tokenizer, 
                                   stop_words= 'english')

# Learn the vocabulary from `documents` and build the matrix of raw term
# counts: one row per document, one column per vocabulary term.
count_matrix     = count_vectorizer.fit_transform(documents)

  'stop_words.' % sorted(inconsistent))


In [7]:
# Show the document-term matrix as a dense array: each row is a document,
# each column holds the count of one vocabulary term.
print(count_matrix.toarray())

[[2 0 2 0 1 1 1 0 0 0 2 0]
 [2 0 1 0 1 1 1 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 1 0 1]
 [0 0 0 1 0 1 1 1 1 0 2 0]]


In [8]:
# Cosine similarity of the first document against every document, using
# scikit-learn's cosine_similarity. The 0:1 slice keeps row 0 two-dimensional,
# so the result is a single row with one score per document.
cosine_similarity(count_matrix[0:1], count_matrix)

array([[1.        , 0.82158384, 0.        , 0.51639778]])

In [11]:
# Pairwise cosine similarity between all documents, using scikit-learn's
# cosine_similarity on the raw-count matrix; entry [i, j] compares
# document i with document j.
cosine_similarity(count_matrix)

array([[1.        , 0.82158384, 0.        , 0.51639778],
       [0.82158384, 1.        , 0.        , 0.23570226],
       [0.        , 0.        , 1.        , 0.        ],
       [0.51639778, 0.23570226, 0.        , 1.        ]])

## Using TfidfVectorizer

In [6]:
# Prepare the bag-of-words (BOW) model and the document-term matrix (DTM),
# this time with TF-IDF weights instead of raw counts. Tokenizer and
# stop-word settings match the CountVectorizer cell.
tfidf_vectorizer = TfidfVectorizer(tokenizer=stemming_tokenizer, 
                                   stop_words='english')

# Learn the vocabulary from `documents` and build the TF-IDF weighted matrix:
# one row per document, one column per vocabulary term.
tfidf_matrix     = tfidf_vectorizer.fit_transform(documents)

In [7]:
# Show the TF-IDF weighted document-term matrix as a dense array: each row is
# a document, each column holds the weight of one vocabulary term.
print(tfidf_matrix.toarray())

[[0.52868518 0.         0.52868518 0.         0.26434259 0.21400811
  0.21400811 0.         0.         0.         0.52868518 0.        ]
 [0.73968323 0.         0.36984162 0.         0.36984162 0.29941866
  0.29941866 0.         0.         0.         0.         0.        ]
 [0.         0.57735027 0.         0.         0.         0.
  0.         0.         0.         0.57735027 0.         0.57735027]
 [0.         0.         0.         0.39837187 0.         0.2542756
  0.2542756  0.39837187 0.39837187 0.         0.62816192 0.        ]]


In [8]:
# BOW vocabulary learned by the vectorizer: maps each stemmed term to its
# column index in the TF-IDF matrix.
tfidf_vectorizer.vocabulary_

{'machin': 6,
 'learn': 4,
 'great': 2,
 'code': 0,
 'love': 5,
 'tenni': 10,
 'soccer': 9,
 'fun': 1,
 'watch': 11,
 'ha': 3,
 'practic': 8,
 'play': 7}

In [9]:
# Cosine similarity of the first document against every document, using
# scikit-learn's cosine_similarity on the TF-IDF vectors. The 0:1 slice keeps
# row 0 two-dimensional, so the result is a single row of scores.
cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)

array([[1.        , 0.81251028, 0.        , 0.44093398]])

In [16]:
# Pairwise cosine similarity between all documents, using scikit-learn's
# cosine_similarity on the TF-IDF matrix; entry [i, j] compares
# document i with document j.
cosine_similarity(tfidf_matrix)

array([[1.        , 0.81251028, 0.        , 0.44093398],
       [0.81251028, 1.        , 0.        , 0.15226972],
       [0.        , 0.        , 1.        , 0.        ],
       [0.44093398, 0.15226972, 0.        , 1.        ]])

- Note: repeating the same word in documents may increase their similarity, but TF-IDF weighting limits that effect beyond a certain point.