In [11]:
import numpy
import scipy
import scipy.sparse
import sklearn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
import sklearn.metrics.pairwise
import string
import collections

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
def process_text(text, stem=True):
    """Strip punctuation from ``text``, tokenize it and optionally stem the tokens.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    stem : bool, default True
        When true, every token is replaced by its Porter stem.

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize`` (stemmed when ``stem`` is true).
    """
    # str.translate() needs a translation *table*. Passing string.punctuation
    # directly makes Python index that string by code point, which removes no
    # punctuation at all and instead rewrites control characters (e.g. "\n",
    # "\t") into punctuation marks. Build a table that deletes each
    # punctuation character instead.
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)

    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]

    return tokens

In [16]:
# Number of leading descriptions kept for the rest of the analysis.
N_DESCRIPTIONS = 100

descriptions = []

with open('coco_val.txt', encoding="utf8") as f:
    for line in f:
        # Lowercase first, then turn the literal "[comma]" marker into a space.
        # This must happen before the character filter below, which would
        # otherwise strip only the brackets and leave the word "comma" behind.
        text = line.lower().replace("[comma]", " ")
        # Keep only the characters 0-9 and a-z; every other character becomes
        # a space. Done in a single pass over the line rather than one
        # str.replace() call per offending character.
        text = ''.join(
            ch if ("0" <= ch <= "9" or "a" <= ch <= "z") else " "
            for ch in text
        )
        # Collapse runs of whitespace and trim both ends.
        text = ' '.join(text.split())
        descriptions.append(text)
dataSet = numpy.array(descriptions[:N_DESCRIPTIONS])

In [17]:
# Build TF-IDF features for the cleaned descriptions, tokenizing (and stemming)
# each one with process_text.
# NOTE(review): the built-in 'english' stop-word list is not stemmed while the
# tokens are, so some stop words may not be filtered -- confirm this is acceptable.
vectorizer = TfidfVectorizer(tokenizer=process_text, stop_words='english')
TfIdf_dataSet = vectorizer.fit_transform(dataSet)
print("What our Tf-Idf looks like: ")
print()
print(TfIdf_dataSet[0:1])

# The fitted term -> column-index mapping. The private _validate_vocabulary()
# helper used previously returns nothing, so vectorVocab was always None.
vectorVocab = vectorizer.vocabulary_

What our Tf-Idf looks like: 

  (0, 35)	0.44471458766073974
  (0, 93)	0.2992971973799435
  (0, 76)	0.44471458766073974
  (0, 206)	0.3454517762690821
  (0, 147)	0.44471458766073974
  (0, 222)	0.44471458766073974


In [18]:
# Pairwise cosine similarity between the rows of the TF-IDF matrix. With a single
# argument the matrix is compared against itself, giving a square, symmetric
# result whose entry [i, j] is the similarity of description i and description j.
cosineSimilarity = sklearn.metrics.pairwise.cosine_similarity(TfIdf_dataSet)
# NOTE: a later cell overwrites this matrix's diagonal in place
# (numpy.fill_diagonal), so the values printed here are the unmodified ones.
print(cosineSimilarity)

[[1.         0.22252321 0.19205792 ... 0.         0.         0.        ]
 [0.22252321 1.         0.33283069 ... 0.         0.         0.        ]
 [0.19205792 0.33283069 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.16957961 0.        ]
 [0.         0.         0.         ... 0.16957961 1.         0.13207091]
 [0.         0.         0.         ... 0.         0.13207091 1.        ]]


In [20]:
# Overwrite the diagonal (self-similarity) with 1.1 -- presumably so that every
# description is ranked strictly first in its own row, even when another
# description has similarity 1.0. NOTE: this mutates cosineSimilarity in place.
numpy.fill_diagonal(cosineSimilarity, 1.1)

# argsort orders ascending, so sort the negated matrix to get, for each row, the
# column indices in order of decreasing similarity. (Computed once; the second,
# identical argsort call was redundant.)
cosineSimilaritySorted = numpy.argsort(-cosineSimilarity, axis=1)
print(cosineSimilaritySorted)

# First five ranked indices per row; column 0 is the row's own index because of
# the 1.1 diagonal set above.
top5similar = cosineSimilaritySorted[:, :5]
print()
print(top5similar)

[[ 0  4  1 ... 33 31 99]
 [ 1  2 12 ... 33 31 99]
 [ 2  3  4 ... 33 31 99]
 ...
 [97 95 98 ... 26 24 99]
 [98 96 25 ... 26 50 49]
 [99 94 93 ... 25 23 49]]

[[ 0  4  1  2  3]
 [ 1  2 12  3  4]
 [ 2  3  4  1 12]
 [ 3  2  4  1 12]
 [ 4  2  3  1  0]
 [ 5  6  7  9  8]
 [ 6  7  5  9  8]
 [ 7  6  5 91  9]
 [ 8  9 20  6 15]
 [ 9  8 18 82 80]
 [10 11 53 12 14]
 [11 10 12 14 13]
 [12 13 14  1 11]
 [13 12 14 11 10]
 [14 12 13 11 10]
 [15 30 32 18 49]
 [16 15 19 18 58]
 [17 19 15 24 18]
 [18 15  9 44 58]
 [19 17 16 15 53]
 [20 23 21 40 22]
 [21 20 41 23 40]
 [22 23 20 24 21]
 [23 22 20 24 21]
 [24 23 22 15 18]
 [25 27 65 29 98]
 [26 28 29 65 27]
 [27 25 65 26 28]
 [28 26 29 65 27]
 [29 26 28 65 69]
 [30 32 15 40 44]
 [31 82 83 87 15]
 [32 30 15 40 44]
 [33 43 34 15 24]
 [34 43 44 15 33]
 [35 36 39 38 37]
 [36 35 39 38 37]
 [37 39 36 35 38]
 [38 39 36 35 37]
 [39 38 36 35 37]
 [40 20 30 32 15]
 [41 21 51 52 47]
 [42 44 43 30 40]
 [43 44 34 42 15]
 [44 43 42 30 34]
 [45 46 47 49 48]
 [46 47 45 49 4