In [1]:
import numpy
import sklearn
import sklearn.metrics.pairwise
import string

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Fetch the NLTK data packages this notebook relies on. Per the captured
# output, a package that is already present is reported as up-to-date.
import nltk
# Stopword lists; presumably the data behind nltk.corpus.stopwords — TODO confirm.
nltk.download('stopwords')
# 'punkt' tokenizer data; presumably required by word_tokenize — TODO confirm.
# As the cell's last expression, this call's return value is what the cell displays.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/u990505/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/u990505/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
def process_text(text, stem=True):
    """Tokenize ``text`` after deleting ASCII punctuation, optionally stemming.

    Args:
        text: The string to tokenize.
        stem: When true (the default), every token is passed through a
            ``PorterStemmer``.

    Returns:
        The list of tokens produced by ``word_tokenize``, stemmed when
        ``stem`` is true.
    """
    # str.translate expects a translation table. Passing string.punctuation
    # directly makes Python 3 index that string by code point, which leaves
    # punctuation untouched and remaps control characters (tab, newline, ...)
    # to punctuation symbols. Build a proper "delete these characters" table.
    punctuation_table = str.maketrans("", "", string.punctuation)
    tokens = word_tokenize(text.translate(punctuation_table))

    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]

    return tokens

In [4]:
# Characters that survive cleaning: ASCII lowercase letters and digits.
_KEPT_CHARS = frozenset(string.ascii_lowercase + string.digits)


def clean_description(line):
    """Normalize one raw description line.

    Lowercases the line, replaces the literal "[comma]" placeholder with a
    space, turns every character other than a-z / 0-9 into a space, and
    collapses runs of whitespace into single spaces.

    Args:
        line: One line of text as read from the descriptions file.

    Returns:
        The cleaned, single-spaced string (empty if nothing survives).
    """
    text = line.lower().replace("[comma]", " ")
    # Single pass over the characters instead of one str.replace call per
    # character, which rescans the whole line every time.
    text = "".join(ch if ch in _KEPT_CHARS else " " for ch in text)
    return " ".join(text.split())


with open('descriptions.txt', encoding="utf8") as f:
    descriptions = [clean_description(line) for line in f]
dataSet = numpy.array(descriptions)

In [5]:
# Fit a TF-IDF model on the cleaned descriptions, dropping English stop words,
# and keep the resulting document-term matrix.
# NOTE(review): process_text defined earlier in this notebook is not passed as
# the tokenizer here, so the vectorizer's default tokenization is used.
vectorizer = TfidfVectorizer(stop_words='english')
TfIdf_dataSet = vectorizer.fit_transform(dataSet)
print("What our Tf-Idf looks like: ")
print()
print(TfIdf_dataSet[0:1])  # first document's row only

# Use the public fitted attribute: vocabulary_ maps each term to its column
# index. The private _validate_vocabulary() helper returns nothing, so the
# previous assignment left vectorVocab as None.
vectorVocab = vectorizer.vocabulary_

What our Tf-Idf looks like: 

  (0, 3085)	0.180162937562
  (0, 1351)	0.108587000666
  (0, 3257)	0.300162482365
  (0, 2585)	0.34043262183
  (0, 2139)	0.113064162116
  (0, 4024)	0.218541466991
  (0, 1981)	0.255811941541
  (0, 3626)	0.316791499169
  (0, 1152)	0.243157539988
  (0, 4070)	0.268466343093
  (0, 1073)	0.304137097616
  (0, 2120)	0.268466343093
  (0, 1687)	0.0661908689456
  (0, 1200)	0.263831802395
  (0, 1473)	0.316791499169
  (0, 1117)	0.164803210907
  (0, 4092)	0.161675882415


In [6]:
# Cosine similarity of every TF-IDF row against every other row; with Y left
# as None the rows of X are compared with each other.
cosineSimilarity = sklearn.metrics.pairwise.cosine_similarity(
    X=TfIdf_dataSet,
    Y=None,
)
print(cosineSimilarity)

[[ 1.          0.06419899  0.01454109 ...,  0.00629138  0.06862054
   0.07042177]
 [ 0.06419899  1.          0.06743771 ...,  0.04726818  0.06877843
   0.04734656]
 [ 0.01454109  0.06743771  1.         ...,  0.00565678  0.065617
   0.02817174]
 ..., 
 [ 0.00629138  0.04726818  0.00565678 ...,  1.          0.00482428
   0.00497692]
 [ 0.06862054  0.06877843  0.065617   ...,  0.00482428  1.          0.07985904]
 [ 0.07042177  0.04734656  0.02817174 ...,  0.00497692  0.07985904  1.        ]]


In [7]:
# Push each document's self-similarity above any other entry in its row so the
# document itself is always ranked first. Cosine similarities of TF-IDF rows
# are at most 1, so 1.1 is strictly larger than any real score.
# Note: this overwrites the diagonal of cosineSimilarity in place.
numpy.fill_diagonal(cosineSimilarity, 1.1)

# Column indices of each row ordered from most to least similar (argsort is
# ascending, hence the negation). Computed once; the ranking was previously
# recomputed by an identical duplicated line.
cosineSimilaritySorted = numpy.argsort(-cosineSimilarity, axis=1)
print(cosineSimilaritySorted)

# Keep the five best-ranked indices per row; column 0 is the document itself.
top5similar = cosineSimilaritySorted[:, 0:5]
print()
print(top5similar)

[[   0 1454   65 ..., 1179  883  317]
 [   1  556  549 ...,  883  857  853]
 [   2  342    4 ..., 1007  373 1303]
 ..., 
 [1477 1372  210 ...,  687 1011  575]
 [1478  967  706 ...,  865  275 1365]
 [1479 1341 1144 ...,  676 1178  690]]

[[   0 1454   65   66  406]
 [   1  556  549 1373  944]
 [   2  342    4  288  379]
 ..., 
 [1477 1372  210  902  681]
 [1478  967  706  669 1084]
 [1479 1341 1144  500  773]]


In [8]:
# Write the top-5 index table to disk as comma-separated integers,
# one row per document.
numpy.savetxt(
    "results.csv",
    top5similar.astype(int),
    delimiter=",",
    fmt='%i',
)