In [1]:
import numpy
import sklearn
import sklearn.metrics.pairwise
from sklearn.metrics.pairwise import pairwise_distances
import string
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [2]:
# Load one free-text description per line and normalise it to lowercase
# ASCII letters/digits separated by single spaces.
descriptions = []

# The only characters that survive cleaning: 'a'-'z' and '0'-'9'.
# Everything else (punctuation, whitespace, non-ASCII) is turned into a space.
allowedChars = frozenset(string.ascii_lowercase + string.digits)

with open('descriptions.txt', encoding = "utf8") as f:
    for line in f:
        # Lowercase first, then blank out the literal "[comma]" placeholder
        # so its letters do not leak into the vocabulary as the word "comma".
        text = line.lower().replace("[comma]", " ")
        # Single pass over the line instead of one full-string replace per
        # offending character (which rescanned the line each time).
        text = ''.join(ch if ch in allowedChars else ' ' for ch in text)
        # split()/join collapses runs of spaces and trims both ends.
        descriptions.append(' '.join(text.split()))

# Row i of dataSet is the cleaned text of line i of the file (blank lines
# are kept as empty strings so row indices stay aligned with the file).
dataSet = numpy.array(descriptions)

In [3]:
# Fit a TF-IDF model on the cleaned descriptions, using scikit-learn's
# built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words="english")
TfIdf_dataSet = vectorizer.fit_transform(dataSet)

# Preview the first document's row of the fitted matrix.
print("What our Tf-Idf looks like: ", end="\n\n")
print(TfIdf_dataSet[:1])

What our Tf-Idf looks like: 

  (0, 3085)	0.18016293756155294
  (0, 1351)	0.10858700066557808
  (0, 3257)	0.3001624823648848
  (0, 2585)	0.34043262183046635
  (0, 2139)	0.11306416211646501
  (0, 4024)	0.21854146699056293
  (0, 1981)	0.25581194154091114
  (0, 3626)	0.31679149916873117
  (0, 1152)	0.24315753998842082
  (0, 4070)	0.2684663430934014
  (0, 1073)	0.30413709761624086
  (0, 2120)	0.2684663430934014
  (0, 1687)	0.06619086894558779
  (0, 1200)	0.26383180239523174
  (0, 1473)	0.31679149916873117
  (0, 1117)	0.16480321090672406
  (0, 4092)	0.16167588241502212


In [4]:
# NOTE(review): these two statements repeat, verbatim, the vectorizer
# construction and fit from the preceding cell (In [3]); running them
# rebuilds `vectorizer` and `TfIdf_dataSet` from the same `dataSet`.
vectorizer = TfidfVectorizer(stop_words='english')
TfIdf_dataSet = vectorizer.fit_transform(dataSet)

In [9]:
# Latent Semantic Analysis: project the TF-IDF matrix onto 300 components.
# TruncatedSVD's default solver is randomized, so the seed is pinned to make
# the projection (and the neighbour ranking derived from it) reproducible
# across kernel restarts.
LSA = TruncatedSVD(n_components=300, random_state=0)
LSAData = LSA.fit_transform(TfIdf_dataSet)
# Sanity check: prints (number of descriptions, number of components).
print(LSAData.shape)

(1480, 300)


In [10]:
# Cosine similarity between every pair of rows of the LSA projection.
cosineSimilarity = sklearn.metrics.pairwise.cosine_similarity(LSAData)

# Overwrite the diagonal with 1.1 -- above the cosine maximum of 1 -- so that
# every row ranks itself first in the ordering below.
numpy.fill_diagonal(cosineSimilarity, 1.1)

# argsort is ascending; negating the matrix yields most-similar-first indices.
cosineSimilaritySorted = numpy.argsort(-cosineSimilarity, axis=1)

# First five columns: the row's own index followed by its four nearest rows.
top5similar = cosineSimilaritySorted[:, :5]

print()
print(top5similar)


[[   0 1454   66 1085  406]
 [   1  556  549 1373  206]
 [   2  342  379  811    4]
 ...
 [1477 1372  530  392  681]
 [1478  967  669  706 1084]
 [1479 1341  352 1144  504]]


In [11]:
# Spot-check: show description 0 and description 66 one after the other
# so the pair can be compared by eye.
print(dataSet[0], dataSet[66], sep="\n")

round face short and overweight likes to wear jeans and sweaters drinks wine at dinner short liberal overweight short hair eats at whole foods does not work our very much
plain freckles short hair boring eyes likes earrings hair parted normally probably slightly overweight but not sure about that she likes twilight she isn t married she is sad she is slightly overweight


In [12]:
# Write the top-5 index table to disk as comma-separated integers,
# one output row per row of `top5similar`.
numpy.savetxt(
    "results4.csv",
    top5similar.astype(int),
    fmt='%i',
    delimiter=",",
)