In [1]:
import findspark
findspark.init()

import pyspark
from pyspark import SparkConf, SparkContext


from pyspark.sql import SparkSession, SQLContext, Row
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType
from pyspark.sql.functions import udf, row_number,column
from pyspark.sql.window import Window


In [2]:
import numpy as np
from itertools import islice

In [3]:
from pyspark.ml.linalg import Vector, Vectors, VectorUDT,SparseVector
from pyspark.ml.feature import CountVectorizer,StopWordsRemover, HashingTF, IDF, Tokenizer

from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.feature import Word2Vec, IDF, HashingTF
from pyspark.mllib.linalg import Vector, Vectors, VectorUDT,SparseVector

In [4]:
sc = pyspark.SparkContext.getOrCreate()

In [5]:
sc

In [6]:
sc.stop()

In [7]:
num_rows_to_show = 20
text_file = 'data/listings.csv'

In [8]:
sc = SparkContext()
spark = SparkSession(sc)

In [9]:
df = spark.read.csv(text_file, inferSchema=True, header=True)
corpus = df.select("id", "name").dropna(subset="name")

In [10]:
tokenizer = Tokenizer(inputCol="name", outputCol="words")
docDF = tokenizer.transform(corpus)

In [11]:
Vector = CountVectorizer(inputCol="words", outputCol="vectors", minDF=5.0)
model = Vector.fit(docDF)
result = model.transform(docDF)

In [12]:
result.show(10, False)

+-----+-------------------------------------------------+---------------------------------------------------------+--------------------------------------------------------------------+
|id   |name                                             |words                                                    |vectors                                                             |
+-----+-------------------------------------------------+---------------------------------------------------------+--------------------------------------------------------------------+
|2818 |Quiet Garden View Room & Super Fast WiFi         |[quiet, garden, view, room, &, super, fast, wifi]        |(1196,[8,9,17,31,51,141,237,1167],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|20168|100%Centre-Studio 1 Private Floor/Bathroom       |[100%centre-studio, 1, private, floor/bathroom]          |(1196,[20,103],[1.0,1.0])                                           |
|25428|Lovely apt in City Centre (Jordaan)              |[lovely, apt, in, 

In [13]:
def get_words_vectors(e):
    a = list(e.indices) # [0:4]
    return Vectors.dense(a) # str(type(e.values)) str(a)#

def vector_length(l):
    return len(l)

# my_udf = udf(my_udf_map, StringType())
# my_udf = udf(my_udf_map, ArrayType(FloatType()))

my_udf = udf(get_words_vectors, VectorUDT())
count_vector_len = udf(vector_length, IntegerType())


result2 = result.withColumn('vectors2', my_udf(result.vectors))
result2 = result2.withColumn('v_len', count_vector_len(result2.vectors2))
result2 = result2.filter(result2['v_len'] > 0)

In [14]:
result2.select("id","vectors2","v_len").show(10,False)

+-----+-------------------------------------------+-----+
|id   |vectors2                                   |v_len|
+-----+-------------------------------------------+-----+
|2818 |[8.0,9.0,17.0,31.0,51.0,141.0,237.0,1167.0]|8    |
|20168|[20.0,103.0]                               |2    |
|25428|[1.0,6.0,11.0,25.0,32.0,491.0]             |6    |
|27886|[1.0,22.0,52.0,67.0,111.0,134.0,811.0]     |7    |
|28658|[6.0,8.0,10.0,11.0,15.0,302.0]             |6    |
|28871|[8.0,57.0,105.0]                           |3    |
|29051|[8.0,57.0,309.0]                           |3    |
|31080|[0.0,45.0,53.0,85.0,1157.0]                |5    |
|38266|[1.0,4.0,7.0,42.0,48.0,51.0,84.0]          |7    |
|41125|[0.0,2.0,14.0,243.0]                       |4    |
+-----+-------------------------------------------+-----+
only showing top 10 rows



In [15]:
w = Window().orderBy(column("id"))
result3 = result2.withColumn("id", row_number().over(w)).select("id", "vectors2").rdd.map(lambda x: [int(x[0]), x[1]])

In [16]:
result3.toDF().show()

+---+--------------------+
| _1|                  _2|
+---+--------------------+
|  1|[265.0,682.0,763....|
|  2|[265.0,682.0,970....|
|  3|[16.0,45.0,62.0,8...|
|  4|[13.0,14.0,32.0,5...|
|  5|[26.0,28.0,682.0,...|
|  6|[28.0,386.0,682.0...|
|  7|[26.0,682.0,733.0...|
|  8|[0.0,1.0,28.0,303...|
|  9|[1.0,12.0,51.0,10...|
| 10|[28.0,265.0,682.0...|
| 11|[28.0,386.0,682.0...|
| 12|[0.0,4.0,6.0,11.0...|
| 13|       [34.0,1175.0]|
| 14|[3.0,17.0,29.0,49...|
| 15|[2.0,79.0,80.0,14...|
| 16|[0.0,1.0,30.0,245.0]|
| 17|[0.0,1.0,15.0,150.0]|
| 18|[141.0,351.0,596....|
| 19|[4.0,6.0,11.0,13....|
| 20|[1.0,29.0,30.0,40...|
+---+--------------------+
only showing top 20 rows



In [17]:
result3.toDF().printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: vector (nullable = true)



In [18]:
#Train the LDA model

seed=1
num_topics = 5
max_iterations = 4

# num_topics, maxIterations=max_iterations, seed=seed

ldaModel = LDA.train(result3, k=num_topics, maxIterations=max_iterations, seed=seed)

In [19]:
# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize())
      + " words):")


Learned topics (as distributions over vocab of 4 words):


In [20]:
vocab_size = ldaModel.vocabSize()
topics = ldaModel.describeTopics()
#topics = ldaModel.topicsMatrix()

In [21]:
topics[1]

([4, 5, 3, 6],
 [0.1863323292922061,
  0.17667451228557676,
  0.15902971201358637,
  0.1473319326677002])

In [22]:
for key,topic in enumerate(topics):
    print("Topic #" + str(key+1) + ":")
    
    for key2, word in enumerate(topic[0]):
        print(str(word),": ",round(topic[1][key2],3),sep="")
    print("")
#     for word in range(0, vocab_size):
#         print(" " + str(topics[word][topic]))

Topic #1:
5: 0.195
4: 0.192
3: 0.166
6: 0.153

Topic #2:
4: 0.186
5: 0.177
3: 0.159
6: 0.147

Topic #3:
4: 0.198
5: 0.188
3: 0.169
6: 0.138

Topic #4:
4: 0.203
5: 0.19
3: 0.154
6: 0.15

Topic #5:
5: 0.193
4: 0.189
6: 0.153
3: 0.149



In [23]:
docDF.show(10, False)

+-----+-------------------------------------------------+---------------------------------------------------------+
|id   |name                                             |words                                                    |
+-----+-------------------------------------------------+---------------------------------------------------------+
|2818 |Quiet Garden View Room & Super Fast WiFi         |[quiet, garden, view, room, &, super, fast, wifi]        |
|20168|100%Centre-Studio 1 Private Floor/Bathroom       |[100%centre-studio, 1, private, floor/bathroom]          |
|25428|Lovely apt in City Centre (Jordaan)              |[lovely, apt, in, city, centre, (jordaan)]               |
|27886|Romantic, stylish B&B houseboat in canal district|[romantic,, stylish, b&b, houseboat, in, canal, district]|
|28658|Cosy guest room near city centre -1              |[cosy, guest, room, near, city, centre, -1]              |
|28871|Comfortable double room                          |[comfortable, d

In [53]:
import time
from pyspark.ml.feature import IDF

from pyspark.mllib.linalg import Vector as oldVector, Vectors as oldVectors

from pyspark.ml.linalg import Vector as newVector, Vectors as newVectors

In [91]:
print(time.strftime('%m%d%Y %H:%M:%S'))

cv = CountVectorizer(inputCol="words", outputCol="raw_features", vocabSize=5000, minDF=10.0)
cvmodel = cv.fit(docDF)

print(time.strftime('%m%d%Y %H:%M:%S'))

07042019 15:56:02
07042019 15:56:02


In [92]:
print(time.strftime('%m%d%Y %H:%M:%S'))
result_cv = cvmodel.transform(docDF)
print(time.strftime('%m%d%Y %H:%M:%S'))

07042019 15:56:03
07042019 15:56:03


In [93]:
result_cv.show(1)

+----+--------------------+--------------------+--------------------+
|  id|                name|               words|        raw_features|
+----+--------------------+--------------------+--------------------+
|2818|Quiet Garden View...|[quiet, garden, v...|(714,[8,9,17,31,5...|
+----+--------------------+--------------------+--------------------+
only showing top 1 row



In [94]:
result_cv = result_cv.drop("name")

In [95]:
rs = result_cv.rdd.map(lambda x: (x[1], x[0], oldVectors.fromML(x[2])))

In [96]:
rs_df = rs.toDF(['tweet_words', 'index', 'raw_features'])

In [97]:
rs.take(1)

[(['quiet', 'garden', 'view', 'room', '&', 'super', 'fast', 'wifi'],
  '2818',
  SparseVector(714, {8: 1.0, 9: 1.0, 17: 1.0, 31: 1.0, 51: 1.0, 141: 1.0, 237: 1.0}))]

In [99]:
rs_df.show(10)

+--------------------+-----+--------------------+
|         tweet_words|index|        raw_features|
+--------------------+-----+--------------------+
|[quiet, garden, v...| 2818|(714,[8,9,17,31,5...|
|[100%centre-studi...|20168|(714,[20,103],[1....|
|[lovely, apt, in,...|25428|(714,[1,6,11,25,3...|
|[romantic,, styli...|27886|(714,[1,22,52,67,...|
|[cosy, guest, roo...|28658|(714,[6,8,10,11,1...|
|[comfortable, dou...|28871|(714,[8,57,105],[...|
|[comfortable, sin...|29051|(714,[8,57,309],[...|
|[2-story, apartme...|31080|(714,[0,45,53,85]...|
|[nice, and, quiet...|38266|(714,[1,4,7,42,48...|
|[amsterdam, cente...|41125|(714,[0,2,14,242]...|
+--------------------+-----+--------------------+
only showing top 10 rows



In [101]:
w = Window().orderBy(column("index"))

rs_df = rs_df.withColumn("index", row_number().over(w))
#.select("tweet_words", "index", "raw_features")
#.rdd.map(lambda x: [x[0], int(x[1]), x[2]])


#rs_df = rs_df.toDF()

In [102]:
rs_df.show(10)

+--------------------+-----+--------------------+
|         tweet_words|index|        raw_features|
+--------------------+-----+--------------------+
|          [27987182]|    1|         (714,[],[])|
|          [21686664]|    2|         (714,[],[])|
|          [21686664]|    3|         (714,[],[])|
|          [17607060]|    4|         (714,[],[])|
|           [5630387]|    5|         (714,[],[])|
|[yays, bickersgra...|    6|(714,[265,697],[1...|
|[yays, bickersgra...|    7|(714,[265,697],[1...|
|[roof, terrace, c...|    8|(714,[16,45,62,84...|
|[amazing, apt, ne...|    9|(714,[13,14,32,58...|
|[yays, oostenburg...|   10|(714,[26,28,697],...|
+--------------------+-----+--------------------+
only showing top 10 rows



In [103]:
print(time.strftime('%m%d%Y %H:%M:%S'))

idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)
print(time.strftime('%m%d%Y %H:%M:%S'))

07042019 15:56:36
07042019 15:56:37


In [104]:
# Run the LDA Topic Modeler

# Note the time before and after is printed in order to find out how much time it takes to process x number of records

print(time.strftime('%m%d%Y %H:%M:%S'))
num_topics = 10
max_iterations = 20
lda_model = LDA.train(rs_df['index', 'raw_features'].rdd.map(list), k=num_topics, maxIterations=max_iterations)
print(time.strftime('%m%d%Y %H:%M:%S'))

07042019 15:56:38
07042019 15:56:51


In [105]:
vocabArray = cvmodel.vocabulary

In [106]:
wordNumbers = 20
topicIndices = sc.parallelize(lda_model.describeTopics(maxTermsPerTopic = wordNumbers))

In [107]:
def topic_render(topic):
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
        result.append(term)
    return result

In [108]:
print(time.strftime('%m%d%Y %H:%M:%S'))
topics_final = topicIndices.map(lambda topic:
                               topic_render(topic)).collect()
print(time.strftime('%m%d%Y %H:%M:%S'))

07042019 15:57:24
07042019 15:57:24


In [109]:
# Display topics

for topic in range(len(topics_final)):
    print("Topic" + str(topic) + ":")
    for term in topics_final[topic]:
        print(term)
    print('\n')

Topic0:
apartment
in
amsterdam
with
the
spacious
city
and
room
near
&
centre
house
to
cosy
center
garden
cozy
modern
of


Topic1:
apartment
in
amsterdam
with
the
spacious
city
and
&
room
near
centre
house
to
center
cosy
cozy
garden
of
private


Topic2:
apartment
in
amsterdam
with
the
spacious
city
and
&
near
room
centre
house
to
cosy
center
cozy
of
garden
modern


Topic3:
apartment
in
amsterdam
with
the
spacious
city
and
room
near
&
centre
house
to
center
cosy
cozy
garden
of
modern


Topic4:
apartment
in
amsterdam
with
the
spacious
city
and
room
&
centre
near
house
to
center
cosy
cozy
garden
of
modern


Topic5:
apartment
in
amsterdam
with
the
spacious
city
and
&
room
near
centre
house
to
cosy
center
garden
cozy
private
of


Topic6:
apartment
in
amsterdam
with
the
spacious
city
and
room
&
near
centre
house
to
center
garden
cozy
cosy
of
modern


Topic7:
apartment
in
amsterdam
with
the
spacious
city
and
near
&
room
centre
house
to
center
cosy
garden
cozy
modern
private


Topic8:
apartment

### below working with floats approach
https://stackoverflow.com/questions/42051184/latent-dirichlet-allocation-lda-in-spark

In [19]:
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

# Load and parse the data
data = sc.textFile("./sample_lda_data.txt")
parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# Index documents with unique IDs
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()



In [25]:
corpus.toDF().show(10,False)

+---+---------------------------------------------+
|_1 |_2                                           |
+---+---------------------------------------------+
|0  |[1.0,2.0,6.0,0.0,2.0,3.0,1.0,1.0,0.0,0.0,3.0]|
|1  |[1.0,3.0,0.0,1.0,3.0,0.0,0.0,2.0,0.0,0.0,1.0]|
|2  |[1.0,4.0,1.0,0.0,0.0,4.0,9.0,0.0,1.0,2.0,0.0]|
|3  |[2.0,1.0,0.0,3.0,0.0,0.0,5.0,0.0,2.0,3.0,9.0]|
|4  |[3.0,1.0,1.0,9.0,3.0,0.0,2.0,0.0,0.0,1.0,3.0]|
|5  |[4.0,2.0,0.0,3.0,4.0,5.0,1.0,1.0,1.0,4.0,0.0]|
|6  |[2.0,1.0,0.0,3.0,0.0,0.0,5.0,0.0,2.0,2.0,9.0]|
|7  |[1.0,1.0,1.0,9.0,2.0,1.0,2.0,0.0,0.0,1.0,3.0]|
|8  |[4.0,4.0,0.0,3.0,4.0,2.0,1.0,3.0,0.0,0.0,0.0]|
|9  |[2.0,8.0,2.0,0.0,3.0,0.0,2.0,0.0,2.0,7.0,2.0]|
+---+---------------------------------------------+
only showing top 10 rows



In [21]:
corpus.toDF().printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: vector (nullable = true)



In [22]:
corpus.toDF().show(10)
# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize())
      + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

+---+--------------------+
| _1|                  _2|
+---+--------------------+
|  0|[1.0,2.0,6.0,0.0,...|
|  1|[1.0,3.0,0.0,1.0,...|
|  2|[1.0,4.0,1.0,0.0,...|
|  3|[2.0,1.0,0.0,3.0,...|
|  4|[3.0,1.0,1.0,9.0,...|
|  5|[4.0,2.0,0.0,3.0,...|
|  6|[2.0,1.0,0.0,3.0,...|
|  7|[1.0,1.0,1.0,9.0,...|
|  8|[4.0,4.0,0.0,3.0,...|
|  9|[2.0,8.0,2.0,0.0,...|
+---+--------------------+
only showing top 10 rows

Learned topics (as distributions over vocab of 11 words):
Topic 0:
 3.4375543252729233
 5.627205295421971
 5.1071077404596945
 28.5811901504307
 3.021725594081267
 2.8325826294464127
 13.268104957996346
 0.7738939744618656
 2.2705047662424396
 4.447564961062405
 18.512210539204546
Topic 1:
 11.516720318003484
 11.765497162662928
 4.493294264073986
 3.1875705472492806
 5.7212631071986335
 5.806064540980248
 13.024024782014733
 2.0097314808376345
 4.36399447641905
 12.462556616039091
 11.476154999526333
Topic 2:
 11.045725356723594
 11.6072975419151
 2.399597995466321
 8.23123930232002
 16.2

In [169]:
tokenizer = Tokenizer(inputCol="name", outputCol="raw_words")

In [170]:
wordsData = tokenizer.transform(result) #corpus

In [171]:
locale = spark._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))

StopWordsRemover.loadDefaultStopWords("english")

remover = StopWordsRemover(inputCol="raw_words", outputCol="words2")
wordsData = remover.transform(wordsData)

In [172]:
hashingTF = HashingTF(inputCol="words2", outputCol="rawFeatures", numFeatures=10000)
featurizedData = hashingTF.transform(wordsData)

In [173]:
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)
idfModel = idf.fit(featurizedData)

tfidf = idfModel.transform(featurizedData)

In [183]:
tfidf.select("features").show(10, False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                                            |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|(10000,[494,1692,1789,2659,7293,8048,8562,9263],[2.445567235227968,2.8094747144167127,3.6888794541139363,5.2799682278798405,3.247046701834897,5.983267779903803,2.46837734783679,8.083328608786376])|
|(10000,[4235,4744,8704,9743],[7.102499355774649,4.745189363091357,2.909063890908317,8.776475789346321])                                                                                             |
|(100

In [36]:
# topics = ldaModel.topicsMatrix()
# vocabArray = model.vocabulary

In [None]:
# wordNumbers = 10  # number of words per topic
# topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers))

# def topic_render(topic):  # specify vector id of words to actual words
#     terms = topic[0]
#     result = []
#     for i in range(wordNumbers):
#         term = vocabArray[terms[i]]
#         result.append(term)
#     return result

# topics_final = topicIndices.map(lambda topic: topic_render(topic)).collect()

# for topic in range(len(topics_final)):
#     print ("Topic" + str(topic) + ":")
#     for term in topics_final[topic]:
#         print (term)
#     print ('\n')