In [46]:
import io
import zipfile

from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import CountVectorizer
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vector, Vectors

In [51]:
# Location of the zip archive(s) handed to sc.binaryFiles further down in this cell.
# NOTE(review): hard-coded absolute per-user path; consider making it configurable.
path = "/user/ncn251/cookbook_text1.zip"

def zip_extract(x):
    """Return the raw contents of every file stored in one zip archive.

    Args:
        x: A (name, payload) pair such as the elements produced by
            ``sc.binaryFiles``; only ``x[1]``, the archive's bytes, is used.

    Returns:
        A list of byte strings, one per regular file in the archive, in the
        order reported by ``ZipFile.namelist()``.
    """
    # The context manager closes the archive even if reading a member fails.
    with zipfile.ZipFile(io.BytesIO(x[1]), "r") as archive:
        return [
            archive.read(member)
            for member in archive.namelist()
            # Directory entries carry no data and would only add empty documents.
            if not member.endswith("/")
        ]

# Read each archive under `path` as a (filename, raw bytes) pair.
zips = sc.binaryFiles(path, 100)

# flatMap yields one RDD element per archived file and keeps the extraction on
# the executors; the previous map(...).collect() + parallelize round-trip pulled
# everything to the driver and produced one *list* per archive instead.
zipData = zips.flatMap(zip_extract).repartition(100)

# Pair every document with a numeric id and tokenize it on single spaces.
# The payload is bytes, so it is decoded before splitting on a text separator.
# NOTE(review): assumes the archived files are UTF-8 text; undecodable bytes
# are replaced rather than raising. Confirm the corpus encoding.
data = zipData.zipWithIndex().map(
    lambda doc: Row(idd=doc[1], words=doc[0].decode("utf-8", "replace").split(" "))
)


In [52]:
# Turn the (idd, words) rows into a DataFrame and count term occurrences per
# document.
docDF = spark.createDataFrame(data)

# Named `countVectorizer` rather than `Vector` so the `Vector` class imported
# from pyspark.mllib.linalg is not shadowed for the rest of the notebook.
countVectorizer = CountVectorizer(inputCol="words", outputCol="vectors")

# `model` holds the fitted vocabulary; `result` adds the "vectors" column.
model = countVectorizer.fit(docDF)
result = model.transform(docDF)


In [53]:
def _to_mllib_pair(row):
    """Return [document id, mllib vector] for one (idd, vectors) row."""
    doc_id, ml_vector = row[0], row[1]
    return [doc_id, Vectors.fromML(ml_vector)]


# mllib's LDA consumes an RDD of [id, vector] pairs; cache it because training
# iterates over the corpus repeatedly.
id_and_vectors = result.select("idd", "vectors").rdd
corpus = id_and_vectors.map(_to_mllib_pair).cache()
corpus

PythonRDD[793] at RDD at PythonRDD.scala:48

In [54]:
# Cluster the documents into three topics using LDA (online optimizer).
# A fixed seed is passed so the randomized training starts from the same
# state on every run instead of producing different topics each time.
ldaModel = LDA.train(corpus, k=3, maxIterations=100, seed=42, optimizer='online')

# Topic-term weight matrix of the fitted model.
topics = ldaModel.topicsMatrix()

# Vocabulary learned by the fitted CountVectorizer model; LDA term indices
# are positions in this list.
vocabArray = model.vocabulary

In [55]:
# How many top-weighted terms to keep for each topic.
wordNumbers = 100

# Ask the model for its per-topic term descriptions, then distribute that
# list as an RDD so it can be mapped over later.
topicDescriptions = ldaModel.describeTopics(maxTermsPerTopic=wordNumbers)
topicIndices = sc.parallelize(topicDescriptions)


In [56]:
def topic_render(topic, vocabulary=None, max_terms=None):
    """Translate a topic's term indices into the corresponding words.

    Args:
        topic: A sequence whose first element is the list of term indices
            for one topic (any further elements are ignored).
        vocabulary: Index-to-word lookup. Defaults to the notebook-level
            ``vocabArray``.
        max_terms: Maximum number of words to return. Defaults to the
            notebook-level ``wordNumbers``.

    Returns:
        A list with the words for at most ``max_terms`` of the topic's terms,
        in the order the indices appear.
    """
    if vocabulary is None:
        vocabulary = vocabArray
    if max_terms is None:
        max_terms = wordNumbers
    term_ids = topic[0]
    # Slicing instead of indexing range(max_terms) avoids an IndexError when
    # the topic holds fewer terms than requested (e.g. a small vocabulary).
    return [vocabulary[term_id] for term_id in term_ids[:max_terms]]


In [57]:
# Resolve each topic's term indices to words and bring the lists to the driver.
rendered_topics = topicIndices.map(topic_render)
topics_final = rendered_topics.collect()

In [61]:
# Show each topic's word list under a "Topic<N>:" heading, followed by a
# blank-line separator.
for topic_id, topic_terms in enumerate(topics_final):
    print("Topic" + str(topic_id) + ":")
    print(topic_terms)
    print('\n')

Topic0:
['.', 'he', 'his', 'was', 'I', 'had', 'her', 'she', 'my', 'said', 'were', '', 'old', 'their', 'him', 'S', 'the', 'who', 'He', 'man', 'me', 'that', 'how', 'young', 'Zuñi', 'our', 'shall', 'we', 'to', 'came', 'did', 'Mary', 'what', 'thought', 'you', 'would', 'see', 'went', 'could', 'thou', 'people', 'him,', 'woman', 'but', 'down', 'told', 'tell', '"I', 'yet', 'She', 'house', 'of', 'toward', 'women', 'men', 'he,', 'thy', 'began', 'poor', 'ancient', 'might', 'And', 'youth', 'us', 'took', 'and', 'for', 'grew', 'heard', 'looked', 'like', 'saw', 'sat', 'boy', 'am', 'at', 'great', 'go', 'corn', 'man,', 'know', 'himself', 'girl', 'forth', 'father', 'gave', 'ever', 'knew', 'asked', 'deer', 'home', 'hunter', 'So', 'ye', 'Time.', '"', 'me,', 'they', 'brought', 'by']


Topic1:
['', '&#160;', '1', '-', '"', '&#224;', 'la', '--', 'de', '2', 'AND', 'No.', '.....', 'OF', 'or', 'of,', 'for', '3', 'A', 'Cream', 'To', 'Fruit', 'TO', 'WITH', '4', 'FOR', '1/2', 'Sauce', 'au', 'with', 'See', 'LA', 'B