In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import Tokenizer, CountVectorizer, MinHashLSH
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Obtain the SparkContext explicitly instead of relying on a kernel-injected `sc`;
# getOrCreate() returns the already-running context when one exists.
sc = SparkContext.getOrCreate()

# wholeTextFiles yields one (file path, file content) record per file in the directory.
data = sc.wholeTextFiles('/user/ncn251/cookbook_text')

In [3]:
# Build the DataFrame through an explicit SparkSession: RDD.toDF() is only attached
# to RDDs after a session has been created, so calling it directly depends on the
# kernel having created one beforehand. getOrCreate() reuses an existing session.
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(data, ['filename', 'content'])

In [4]:
# Lower-case each document's text and split it on whitespace into a token list.
tokenizer = Tokenizer().setInputCol("content").setOutputCol("words")
wordsDf = tokenizer.transform(df)

In [5]:
# Upper bound on the number of distinct terms kept in the vocabulary.
vocabSize = 10 ** 6

# Term-count vectoriser over the token lists.
Vector = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=vocabSize,
    minDF=10,  # drop terms that appear in fewer than 10 documents
)

In [6]:
# Learn the vocabulary from the tokenised corpus, then turn every document into a
# term-count vector, keeping only the columns needed downstream.
model = Vector.fit(wordsDf)
countsDf = model.transform(wordsDf)
vectorizedDf = countsDf.select('filename', 'features')


In [7]:
# MinHash draws random hash functions, so pin the seed explicitly to keep the hash
# values (and the similarity join built on them) reproducible from run to run.
mh = MinHashLSH(inputCol="features", outputCol="hashValues", seed=42)

# NOTE: this rebinds `model` from the fitted CountVectorizer to the fitted LSH model;
# the similarity-join cell calls approxSimilarityJoin on this name.
model = mh.fit(vectorizedDf)

# Preview the hash values attached to each document.
model.transform(vectorizedDf).show()

+--------------------+--------------------+------------------+
|            filename|            features|        hashValues|
+--------------------+--------------------+------------------+
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...|[[-2.038051916E9]]|
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...| [[-2.03725558E9]]|
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...|[[-2.037521916E9]]|
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...| [[-2.03725558E9]]|
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...|[[-2.038051916E9]]|
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...| [[-2.03725558E9]]|
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...|[[-2.038051916E9]]|
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...|[[-2.038051916E9]]|
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...|[[-2.037788252E9]]|
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...|[[-2.037521916E9]]|
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...|[[-2.036991916E9]]|
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...|[[-2.037521916E9]]|
|hdfs://dumbo/user...|(15582,[0,1,2,3,4...|[[-2.0380519

In [8]:
# MinHashLSH works with Jaccard *distance*: a pair is returned when its distance is
# below the threshold, so 0.8 keeps pairs with Jaccard similarity above 0.2.
threshold = 0.8

# Joining the dataset with itself returns every document matched with itself and
# each pair in both orders. Keeping only rows where the left filename sorts before
# the right one leaves exactly one row per distinct pair.
output = model.approxSimilarityJoin(vectorizedDf, vectorizedDf, threshold).filter(
    col('datasetA.filename') < col('datasetB.filename')
)

In [9]:
# Inspect the join result's top-level columns: the two matched rows (datasetA,
# datasetB) and the distance between them (distCol).
output.columns

['datasetA', 'datasetB', 'distCol']

In [10]:
# Reduce each matched pair to just the two file paths.
first_file = col('datasetA.filename').alias('filename1')
second_file = col('datasetB.filename').alias('filename2')
similarityDf = output.select(first_file, second_file)

In [11]:
# Collect the matched pairs to the driver as a pandas DataFrame, then write them
# to a local CSV file with a header row.
similarityPdf = similarityDf.toPandas()
similarityPdf.to_csv('similarity.csv', header=True)