In [1]:
import findspark
findspark.init()

import pyspark
from pyspark import SparkConf, SparkContext


from pyspark.sql import SparkSession, SQLContext, Row
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType
from pyspark.sql.functions import udf, row_number,column
from pyspark.sql.window import Window


In [2]:
import numpy as np
from itertools import islice

In [3]:
from pyspark.ml.linalg import Vector, Vectors, VectorUDT,SparseVector
from pyspark.ml.feature import CountVectorizer,StopWordsRemover, HashingTF, IDF, Tokenizer

from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.feature import Word2Vec, IDF, HashingTF
from pyspark.mllib.linalg import Vector, Vectors, VectorUDT,SparseVector

In [4]:
sc = pyspark.SparkContext.getOrCreate()

In [5]:
sc

In [7]:
sc.stop()

In [8]:
num_rows_to_show = 20
text_file = 'data/listings.csv'

In [9]:
sc = SparkContext()
spark = SparkSession(sc)

In [10]:
df = spark.read.csv(text_file, inferSchema=True, header=True)
corpus = df.select("id", "name").dropna(subset="name")

In [11]:
tokenizer = Tokenizer(inputCol="name", outputCol="words")
docDF = tokenizer.transform(corpus)

In [12]:
Vector = CountVectorizer(inputCol="words", outputCol="vectors", minDF=5.0)
model = Vector.fit(docDF)
result = model.transform(docDF)

In [13]:
result.show(10, False)

+-----+-------------------------------------------------+---------------------------------------------------------+--------------------------------------------------------------------+
|id   |name                                             |words                                                    |vectors                                                             |
+-----+-------------------------------------------------+---------------------------------------------------------+--------------------------------------------------------------------+
|2818 |Quiet Garden View Room & Super Fast WiFi         |[quiet, garden, view, room, &, super, fast, wifi]        |(1196,[8,9,17,31,51,141,237,1167],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|20168|100%Centre-Studio 1 Private Floor/Bathroom       |[100%centre-studio, 1, private, floor/bathroom]          |(1196,[20,103],[1.0,1.0])                                           |
|25428|Lovely apt in City Centre (Jordaan)              |[lovely, apt, in, 

In [13]:
def get_words_vectors(e):
    a = list(e.indices) # [0:4]
    return Vectors.dense(a) # str(type(e.values)) str(a)#

def vector_length(l):
    return len(l)

# my_udf = udf(my_udf_map, StringType())
# my_udf = udf(my_udf_map, ArrayType(FloatType()))

my_udf = udf(get_words_vectors, VectorUDT())
count_vector_len = udf(vector_length, IntegerType())


result2 = result.withColumn('vectors2', my_udf(result.vectors))
result2 = result2.withColumn('v_len', count_vector_len(result2.vectors2))
result2 = result2.filter(result2['v_len'] > 0)

In [14]:
result2.select("id","vectors2","v_len").show(10,False)

+-----+-------------------------------------------+-----+
|id   |vectors2                                   |v_len|
+-----+-------------------------------------------+-----+
|2818 |[8.0,9.0,17.0,31.0,51.0,141.0,237.0,1167.0]|8    |
|20168|[20.0,103.0]                               |2    |
|25428|[1.0,6.0,11.0,25.0,32.0,491.0]             |6    |
|27886|[1.0,22.0,52.0,67.0,111.0,134.0,811.0]     |7    |
|28658|[6.0,8.0,10.0,11.0,15.0,302.0]             |6    |
|28871|[8.0,57.0,105.0]                           |3    |
|29051|[8.0,57.0,309.0]                           |3    |
|31080|[0.0,45.0,53.0,85.0,1157.0]                |5    |
|38266|[1.0,4.0,7.0,42.0,48.0,51.0,84.0]          |7    |
|41125|[0.0,2.0,14.0,243.0]                       |4    |
+-----+-------------------------------------------+-----+
only showing top 10 rows



In [15]:
w = Window().orderBy(column("id"))
result3 = result2.withColumn("id", row_number().over(w)).select("id", "vectors2").rdd.map(lambda x: [int(x[0]), x[1]])

In [16]:
result3.toDF().show()

+---+--------------------+
| _1|                  _2|
+---+--------------------+
|  1|[265.0,682.0,763....|
|  2|[265.0,682.0,970....|
|  3|[16.0,45.0,62.0,8...|
|  4|[13.0,14.0,32.0,5...|
|  5|[26.0,28.0,682.0,...|
|  6|[28.0,386.0,682.0...|
|  7|[26.0,682.0,733.0...|
|  8|[0.0,1.0,28.0,303...|
|  9|[1.0,12.0,51.0,10...|
| 10|[28.0,265.0,682.0...|
| 11|[28.0,386.0,682.0...|
| 12|[0.0,4.0,6.0,11.0...|
| 13|       [34.0,1175.0]|
| 14|[3.0,17.0,29.0,49...|
| 15|[2.0,79.0,80.0,14...|
| 16|[0.0,1.0,30.0,245.0]|
| 17|[0.0,1.0,15.0,150.0]|
| 18|[141.0,351.0,596....|
| 19|[4.0,6.0,11.0,13....|
| 20|[1.0,29.0,30.0,40...|
+---+--------------------+
only showing top 20 rows



In [17]:
result3.toDF().printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: vector (nullable = true)



In [18]:
#Train the LDA model

seed=1
num_topics = 5
max_iterations = 4

# num_topics, maxIterations=max_iterations, seed=seed

ldaModel = LDA.train(result3, k=num_topics, maxIterations=max_iterations, seed=seed)

In [19]:
# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize())
      + " words):")


Learned topics (as distributions over vocab of 4 words):


In [20]:
vocab_size = ldaModel.vocabSize()
topics = ldaModel.describeTopics()
#topics = ldaModel.topicsMatrix()

In [21]:
topics[1]

([4, 5, 3, 6],
 [0.1863323292922061,
  0.17667451228557676,
  0.15902971201358637,
  0.1473319326677002])

In [22]:
vocab_size = ldaModel.vocabSize()
topics = ldaModel.describeTopics()
#topics = ldaModel.topicsMatrix()

for key,topic in enumerate(topics):
    print("Topic #" + str(key+1) + ":")
    
    for key2, word in enumerate(topic[0]):
        print(str(word),": ",round(topic[1][key2],3),sep="")
    print("")
#     for word in range(0, vocab_size):
#         print(" " + str(topics[word][topic]))

Topic #1:
5: 0.195
4: 0.192
3: 0.166
6: 0.153

Topic #2:
4: 0.186
5: 0.177
3: 0.159
6: 0.147

Topic #3:
4: 0.198
5: 0.188
3: 0.169
6: 0.138

Topic #4:
4: 0.203
5: 0.19
3: 0.154
6: 0.15

Topic #5:
5: 0.193
4: 0.189
6: 0.153
3: 0.149



In [23]:
docDF.show(10, False)

+-----+-------------------------------------------------+---------------------------------------------------------+
|id   |name                                             |words                                                    |
+-----+-------------------------------------------------+---------------------------------------------------------+
|2818 |Quiet Garden View Room & Super Fast WiFi         |[quiet, garden, view, room, &, super, fast, wifi]        |
|20168|100%Centre-Studio 1 Private Floor/Bathroom       |[100%centre-studio, 1, private, floor/bathroom]          |
|25428|Lovely apt in City Centre (Jordaan)              |[lovely, apt, in, city, centre, (jordaan)]               |
|27886|Romantic, stylish B&B houseboat in canal district|[romantic,, stylish, b&b, houseboat, in, canal, district]|
|28658|Cosy guest room near city centre -1              |[cosy, guest, room, near, city, centre, -1]              |
|28871|Comfortable double room                          |[comfortable, d

In [14]:
import time
from pyspark.ml.feature import IDF

from pyspark.mllib.linalg import Vector as oldVector, Vectors as oldVectors
from pyspark.ml.linalg import Vector as newVector, Vectors as newVectors

In [45]:
print(time.strftime('%m%d%Y %H:%M:%S'))

cv = CountVectorizer(inputCol="words", outputCol="raw_features", vocabSize=5000, minDF=2.0)
cvmodel = cv.fit(docDF)

print(time.strftime('%m%d%Y %H:%M:%S'))

07042019 20:50:09
07042019 20:50:09


In [46]:
print(time.strftime('%m%d%Y %H:%M:%S'))
result_cv = cvmodel.transform(docDF)
print(time.strftime('%m%d%Y %H:%M:%S'))

07042019 20:50:10
07042019 20:50:10


In [47]:
result_cv.show(1)

+----+--------------------+--------------------+--------------------+
|  id|                name|               words|        raw_features|
+----+--------------------+--------------------+--------------------+
|2818|Quiet Garden View...|[quiet, garden, v...|(2674,[8,9,17,31,...|
+----+--------------------+--------------------+--------------------+
only showing top 1 row



In [48]:
result_cv = result_cv.drop("name")

In [49]:
rs = result_cv.rdd.map(lambda x: (x[1], x[0], oldVectors.fromML(x[2])))

In [50]:
rs_df = rs.toDF(['tweet_words', 'index', 'raw_features'])

In [51]:
rs.take(1)

[(['quiet', 'garden', 'view', 'room', '&', 'super', 'fast', 'wifi'],
  '2818',
  SparseVector(2674, {8: 1.0, 9: 1.0, 17: 1.0, 31: 1.0, 51: 1.0, 141: 1.0, 237: 1.0, 1189: 1.0}))]

In [52]:
rs_df.show(10)

+--------------------+-----+--------------------+
|         tweet_words|index|        raw_features|
+--------------------+-----+--------------------+
|[quiet, garden, v...| 2818|(2674,[8,9,17,31,...|
|[100%centre-studi...|20168|(2674,[20,103,258...|
|[lovely, apt, in,...|25428|(2674,[1,6,11,25,...|
|[romantic,, styli...|27886|(2674,[1,22,52,67...|
|[cosy, guest, roo...|28658|(2674,[6,8,10,11,...|
|[comfortable, dou...|28871|(2674,[8,57,105],...|
|[comfortable, sin...|29051|(2674,[8,57,308],...|
|[2-story, apartme...|31080|(2674,[0,45,53,85...|
|[nice, and, quiet...|38266|(2674,[1,4,7,42,4...|
|[amsterdam, cente...|41125|(2674,[0,2,14,243...|
+--------------------+-----+--------------------+
only showing top 10 rows



In [53]:
w = Window().orderBy(column("index"))

rs_df = rs_df.withColumn("index", row_number().over(w))
#.select("tweet_words", "index", "raw_features")
#.rdd.map(lambda x: [x[0], int(x[1]), x[2]])


#rs_df = rs_df.toDF()

In [54]:
rs_df.show(10)

+--------------------+-----+--------------------+
|         tweet_words|index|        raw_features|
+--------------------+-----+--------------------+
|          [27987182]|    1|        (2674,[],[])|
|          [21686664]|    2| (2674,[2292],[1.0])|
|          [21686664]|    3| (2674,[2292],[1.0])|
|          [17607060]|    4|        (2674,[],[])|
|           [5630387]|    5|        (2674,[],[])|
|[yays, bickersgra...|    6|(2674,[265,693,77...|
|[yays, bickersgra...|    7|(2674,[265,693,95...|
|[roof, terrace, c...|    8|(2674,[16,45,62,8...|
|[amazing, apt, ne...|    9|(2674,[13,14,32,5...|
|[yays, oostenburg...|   10|(2674,[26,28,693,...|
+--------------------+-----+--------------------+
only showing top 10 rows



In [55]:
print(time.strftime('%m%d%Y %H:%M:%S'))

idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)
print(time.strftime('%m%d%Y %H:%M:%S'))

07042019 20:50:23
07042019 20:50:23


In [57]:
# Run the LDA Topic Modeler

# Note the time before and after is printed in order to find out how much time it takes to process x number of records

print(time.strftime('%m%d%Y %H:%M:%S'))
num_topics = 5
max_iterations = 20
lda_model = LDA.train(rs_df['index', 'raw_features'].rdd.map(list), k=3, maxIterations=max_iterations)
print(time.strftime('%m%d%Y %H:%M:%S'))

07042019 20:50:36
07042019 20:50:48


In [58]:
vocabArray = cvmodel.vocabulary

In [66]:
wordNumbers = 5
topicIndices = sc.parallelize(lda_model.describeTopics(maxTermsPerTopic = wordNumbers))

In [67]:
def topic_render(topic):
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
        result.append(term)
    return result

In [68]:
print(time.strftime('%m%d%Y %H:%M:%S'))
topics_final = topicIndices.map(lambda topic:
                               topic_render(topic)).collect()
print(time.strftime('%m%d%Y %H:%M:%S'))

07042019 20:52:28
07042019 20:52:28


In [69]:
topicIndices.toDF().show(10,False)

+---------------+------------------------------------------------------------------------------------------------------------+
|_1             |_2                                                                                                          |
+---------------+------------------------------------------------------------------------------------------------------------+
|[0, 1, 2, 3, 4]|[0.06335696079244235, 0.05744357220262513, 0.03490818674478035, 0.025647518722451645, 0.02059753463886322]  |
|[0, 1, 2, 3, 4]|[0.06456946840019714, 0.058078645857665374, 0.03584059111040546, 0.026100114725632065, 0.020141746742449437]|
|[0, 1, 2, 3, 4]|[0.06441698974795014, 0.057568139817236824, 0.036025481875182175, 0.02489031113403188, 0.021939967390061845]|
+---------------+------------------------------------------------------------------------------------------------------------+



In [70]:
topics

[([0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   18,
   17,
   19,
   21,
   20,
   23,
   22,
   24,
   25,
   26,
   28,
   29,
   30,
   27,
   34,
   32,
   31,
   33,
   37,
   36,
   39,
   35,
   40,
   38,
   41,
   42,
   44,
   45,
   43,
   47,
   46,
   48,
   49,
   52,
   53,
   50,
   51,
   56,
   55,
   60,
   54,
   58,
   63,
   57,
   62,
   59,
   66,
   61,
   70,
   64,
   69,
   71,
   68,
   67,
   65,
   73,
   72,
   74,
   76,
   75,
   79,
   77,
   80,
   82,
   83,
   78,
   85,
   81,
   84,
   86,
   89,
   90,
   87,
   88,
   95,
   93,
   101,
   92,
   91,
   107,
   97,
   102,
   96,
   106,
   98,
   99,
   94,
   104,
   113,
   100,
   103,
   105,
   109,
   110,
   108,
   111,
   118,
   117,
   115,
   119,
   116,
   112,
   124,
   114,
   121,
   120,
   122,
   123,
   131,
   126,
   125,
   138,
   143,
   140,
   129,
   136,
   134,
   128,
   133,
   130,
   135,
   127

In [71]:
vocab_size = lda_model.vocabSize()
topics = lda_model.describeTopics()
#topics = ldaModel.topicsMatrix()

for key,topic in enumerate(topics):
    print("Topic #" + str(key+1) + ":")
    
    for key2, word in enumerate(topic[0]):
        print(str(word),": ",round(topic[1][key2],3),sep="")
    print("")
#     for word in range(0, vocab_size):
#         print(" " + str(topics[word][topic]))

Topic #1:
0: 0.063
1: 0.057
2: 0.035
3: 0.026
4: 0.021
5: 0.019
6: 0.018
7: 0.016
8: 0.015
9: 0.015
10: 0.015
11: 0.014
12: 0.013
13: 0.012
14: 0.011
15: 0.011
16: 0.011
18: 0.01
17: 0.01
19: 0.01
21: 0.009
20: 0.009
23: 0.009
22: 0.009
24: 0.009
25: 0.009
26: 0.009
28: 0.007
29: 0.007
30: 0.007
27: 0.007
34: 0.007
32: 0.007
31: 0.007
33: 0.007
37: 0.007
36: 0.006
39: 0.006
35: 0.006
40: 0.006
38: 0.006
41: 0.006
42: 0.005
44: 0.005
45: 0.005
43: 0.005
47: 0.005
46: 0.005
48: 0.005
49: 0.005
52: 0.004
53: 0.004
50: 0.004
51: 0.004
56: 0.004
55: 0.004
60: 0.004
54: 0.004
58: 0.004
63: 0.004
57: 0.004
62: 0.004
59: 0.003
66: 0.003
61: 0.003
70: 0.003
64: 0.003
69: 0.003
71: 0.003
68: 0.003
67: 0.003
65: 0.003
73: 0.003
72: 0.003
74: 0.003
76: 0.003
75: 0.003
79: 0.003
77: 0.002
80: 0.002
82: 0.002
83: 0.002
78: 0.002
85: 0.002
81: 0.002
84: 0.002
86: 0.002
89: 0.002
90: 0.002
87: 0.002
88: 0.002
95: 0.002
93: 0.002
101: 0.002
92: 0.002
91: 0.002
107: 0.002
97: 0.002
102: 0.002
96: 0.002


1229: 0.0
1618: 0.0
1225: 0.0
1230: 0.0
1768: 0.0
1765: 0.0
1486: 0.0
1269: 0.0
1291: 0.0
1451: 0.0
1278: 0.0
1457: 0.0
1465: 0.0
1719: 0.0
1657: 0.0
1393: 0.0
1513: 0.0
1534: 0.0
1428: 0.0
1602: 0.0
1316: 0.0
1336: 0.0
1501: 0.0
1480: 0.0
1664: 0.0
1700: 0.0
1442: 0.0
1646: 0.0
1436: 0.0
1443: 0.0
1217: 0.0
1211: 0.0
1342: 0.0
1490: 0.0
1266: 0.0
1571: 0.0
1454: 0.0
1497: 0.0
1694: 0.0
1479: 0.0
1695: 0.0
1391: 0.0
1526: 0.0
1584: 0.0
1445: 0.0
1660: 0.0
1547: 0.0
1516: 0.0
1434: 0.0
1502: 0.0
1320: 0.0
1530: 0.0
1687: 0.0
1598: 0.0
1387: 0.0
1772: 0.0
1576: 0.0
1527: 0.0
1747: 0.0
1424: 0.0
1701: 0.0
1411: 0.0
1574: 0.0
1559: 0.0
1438: 0.0
1459: 0.0
1599: 0.0
1612: 0.0
1718: 0.0
1620: 0.0
1564: 0.0
1610: 0.0
1587: 0.0
1420: 0.0
1426: 0.0
1573: 0.0
1655: 0.0
1538: 0.0
1546: 0.0
1592: 0.0
1593: 0.0
1570: 0.0
1440: 0.0
1681: 0.0
1553: 0.0
1770: 0.0
1638: 0.0
1601: 0.0
1712: 0.0
1688: 0.0
1589: 0.0
1469: 0.0
1542: 0.0
1711: 0.0
1636: 0.0
1512: 0.0
1627: 0.0
1568: 0.0
1622: 0.0
1773: 0.0


2194: 0.0
1793: 0.0
2151: 0.0
2590: 0.0
2481: 0.0
2393: 0.0
2375: 0.0
2533: 0.0
2096: 0.0
2487: 0.0
2632: 0.0
2551: 0.0
2203: 0.0
2489: 0.0
2581: 0.0
2074: 0.0
2349: 0.0
2094: 0.0
2602: 0.0
1841: 0.0
2338: 0.0
2046: 0.0
2650: 0.0
2368: 0.0
2420: 0.0
2237: 0.0
2488: 0.0
2335: 0.0
2275: 0.0
1822: 0.0
2448: 0.0
1925: 0.0
2503: 0.0
2010: 0.0
1889: 0.0
2167: 0.0
1909: 0.0
2136: 0.0
2160: 0.0
1904: 0.0
2588: 0.0
2536: 0.0
2260: 0.0
1802: 0.0
1975: 0.0
2346: 0.0
2388: 0.0
2266: 0.0
1839: 0.0
2326: 0.0
2130: 0.0
2362: 0.0
2666: 0.0
2377: 0.0
2414: 0.0
2539: 0.0
1888: 0.0
2302: 0.0
2364: 0.0
2205: 0.0
2165: 0.0
1796: 0.0
2013: 0.0
1891: 0.0
1805: 0.0
1933: 0.0
2058: 0.0
2102: 0.0
2234: 0.0
2243: 0.0
2072: 0.0
2475: 0.0
1983: 0.0
1900: 0.0
2317: 0.0
1993: 0.0
1977: 0.0
2661: 0.0
2663: 0.0
2213: 0.0
2638: 0.0
1969: 0.0
2591: 0.0
1868: 0.0
1847: 0.0
2332: 0.0
1825: 0.0
2259: 0.0
1921: 0.0
2485: 0.0
1860: 0.0
2067: 0.0
2520: 0.0
2463: 0.0
2315: 0.0
1817: 0.0
2630: 0.0
2662: 0.0
1957: 0.0
2659: 0.0


378: 0.0
445: 0.0
395: 0.0
419: 0.0
375: 0.0
399: 0.0
400: 0.0
389: 0.0
368: 0.0
396: 0.0
353: 0.0
418: 0.0
442: 0.0
435: 0.0
415: 0.0
456: 0.0
460: 0.0
386: 0.0
439: 0.0
404: 0.0
401: 0.0
428: 0.0
406: 0.0
511: 0.0
420: 0.0
464: 0.0
496: 0.0
431: 0.0
453: 0.0
463: 0.0
416: 0.0
402: 0.0
370: 0.0
390: 0.0
430: 0.0
316: 0.0
436: 0.0
457: 0.0
440: 0.0
470: 0.0
381: 0.0
443: 0.0
449: 0.0
446: 0.0
469: 0.0
422: 0.0
433: 0.0
459: 0.0
403: 0.0
482: 0.0
429: 0.0
412: 0.0
493: 0.0
486: 0.0
425: 0.0
455: 0.0
466: 0.0
541: 0.0
408: 0.0
432: 0.0
458: 0.0
441: 0.0
411: 0.0
374: 0.0
462: 0.0
527: 0.0
450: 0.0
599: 0.0
485: 0.0
472: 0.0
394: 0.0
474: 0.0
465: 0.0
471: 0.0
447: 0.0
423: 0.0
551: 0.0
427: 0.0
489: 0.0
492: 0.0
520: 0.0
448: 0.0
556: 0.0
438: 0.0
479: 0.0
526: 0.0
499: 0.0
491: 0.0
475: 0.0
504: 0.0
461: 0.0
532: 0.0
548: 0.0
421: 0.0
424: 0.0
537: 0.0
579: 0.0
454: 0.0
477: 0.0
521: 0.0
478: 0.0
407: 0.0
550: 0.0
490: 0.0
554: 0.0
426: 0.0
508: 0.0
552: 0.0
603: 0.0
534: 0.0
495: 0.0
4

1504: 0.0
1408: 0.0
1509: 0.0
1560: 0.0
1741: 0.0
1524: 0.0
1532: 0.0
1737: 0.0
1565: 0.0
1474: 0.0
1527: 0.0
1558: 0.0
1687: 0.0
1745: 0.0
1622: 0.0
1682: 0.0
1743: 0.0
1441: 0.0
1570: 0.0
1642: 0.0
1587: 0.0
1574: 0.0
1588: 0.0
1489: 0.0
1592: 0.0
1450: 0.0
1477: 0.0
1681: 0.0
1567: 0.0
1711: 0.0
1449: 0.0
1659: 0.0
1688: 0.0
1421: 0.0
1573: 0.0
1428: 0.0
1664: 0.0
1683: 0.0
1708: 0.0
1701: 0.0
1442: 0.0
1528: 0.0
1775: 0.0
1598: 0.0
1694: 0.0
1761: 0.0
1457: 0.0
1705: 0.0
1716: 0.0
1758: 0.0
1452: 0.0
1424: 0.0
1599: 0.0
1733: 0.0
2239: 0.0
1470: 0.0
1557: 0.0
1584: 0.0
2532: 0.0
2419: 0.0
2141: 0.0
1869: 0.0
2336: 0.0
2353: 0.0
2226: 0.0
2254: 0.0
2077: 0.0
2216: 0.0
2508: 0.0
1961: 0.0
2314: 0.0
2060: 0.0
2238: 0.0
2618: 0.0
2494: 0.0
2643: 0.0
1936: 0.0
2386: 0.0
1955: 0.0
2108: 0.0
2669: 0.0
2008: 0.0
2063: 0.0
2210: 0.0
2151: 0.0
2448: 0.0
2040: 0.0
2573: 0.0
2256: 0.0
2623: 0.0
2027: 0.0
2672: 0.0
2310: 0.0
2569: 0.0
1855: 0.0
2376: 0.0
2104: 0.0
1964: 0.0
1888: 0.0
2445: 0.0


2223: 0.0
2527: 0.0
1992: 0.0
1999: 0.0
2200: 0.0
2269: 0.0
2599: 0.0
2215: 0.0
2301: 0.0
2337: 0.0
2524: 0.0
2280: 0.0
2557: 0.0
1794: 0.0
2154: 0.0
2600: 0.0
2578: 0.0
2437: 0.0
2391: 0.0
1885: 0.0
1954: 0.0
1901: 0.0
2111: 0.0
2485: 0.0
2520: 0.0
2420: 0.0
2161: 0.0
2522: 0.0
2582: 0.0
1803: 0.0
1960: 0.0
2348: 0.0
2252: 0.0
2307: 0.0
2616: 0.0
2000: 0.0
1884: 0.0
1923: 0.0
2257: 0.0
1937: 0.0
1874: 0.0
2430: 0.0
2435: 0.0
1873: 0.0
2411: 0.0
2607: 0.0
2206: 0.0
2418: 0.0
2135: 0.0
2054: 0.0
2510: 0.0
2572: 0.0
2080: 0.0
2020: 0.0
2517: 0.0
2365: 0.0
2467: 0.0
2574: 0.0
2486: 0.0
2125: 0.0
2114: 0.0
2436: 0.0
2530: 0.0
1897: 0.0
2014: 0.0
1952: 0.0
2441: 0.0
2333: 0.0
2271: 0.0
2589: 0.0
1791: 0.0
2227: 0.0
2563: 0.0
2149: 0.0
2397: 0.0
2370: 0.0
2399: 0.0
2144: 0.0
1808: 0.0
2356: 0.0
2038: 0.0
2593: 0.0
2264: 0.0
2595: 0.0
1963: 0.0
2068: 0.0
1988: 0.0
2539: 0.0
1967: 0.0
2274: 0.0
2661: 0.0
2065: 0.0
1811: 0.0
2584: 0.0
1892: 0.0
2059: 0.0
2620: 0.0
1865: 0.0
2207: 0.0
2051: 0.0


1269: 0.0
1758: 0.0
1252: 0.0
1728: 0.0
1560: 0.0
1204: 0.0
1509: 0.0
1200: 0.0
1215: 0.0
1228: 0.0
1240: 0.0
1422: 0.0
1472: 0.0
1267: 0.0
1659: 0.0
1254: 0.0
1372: 0.0
1474: 0.0
1368: 0.0
1452: 0.0
1737: 0.0
1489: 0.0
1450: 0.0
1360: 0.0
1549: 0.0
1421: 0.0
1584: 0.0
1569: 0.0
1707: 0.0
1529: 0.0
1540: 0.0
1567: 0.0
1599: 0.0
1511: 0.0
1741: 0.0
1752: 0.0
1427: 0.0
1742: 0.0
1365: 0.0
1676: 0.0
1404: 0.0
1699: 0.0
1736: 0.0
1308: 0.0
1441: 0.0
1554: 0.0
1471: 0.0
1697: 0.0
1609: 0.0
1528: 0.0
1702: 0.0
1648: 0.0
1449: 0.0
1544: 0.0
1724: 0.0
1651: 0.0
1424: 0.0
1212: 0.0
1491: 0.0
1745: 0.0
1632: 0.0
1430: 0.0
1488: 0.0
1588: 0.0
1631: 0.0
1684: 0.0
1721: 0.0
1535: 0.0
1467: 0.0
1621: 0.0
1415: 0.0
1282: 0.0
1400: 0.0
1477: 0.0
1634: 0.0
1688: 0.0
1635: 0.0
1641: 0.0
1330: 0.0
1711: 0.0
1399: 0.0
1577: 0.0
1743: 0.0
1464: 0.0
1616: 0.0
1680: 0.0
1753: 0.0
1573: 0.0
1448: 0.0
1504: 0.0
1681: 0.0
1722: 0.0
1622: 0.0
1592: 0.0
1603: 0.0
1783: 0.0
1701: 0.0
1738: 0.0
1644: 0.0
1598: 0.0


1839: 0.0
2317: 0.0
2257: 0.0
2396: 0.0
1808: 0.0
2104: 0.0
2030: 0.0
2436: 0.0
2552: 0.0
1887: 0.0
2360: 0.0
2577: 0.0
2156: 0.0
1924: 0.0
2041: 0.0
2324: 0.0
2438: 0.0
1949: 0.0
2455: 0.0
2227: 0.0
2568: 0.0
2344: 0.0
2393: 0.0
2072: 0.0
2462: 0.0
2645: 0.0
2390: 0.0
2063: 0.0
2255: 0.0
2367: 0.0
1956: 0.0
1981: 0.0
2196: 0.0
2105: 0.0
2288: 0.0
2511: 0.0
1986: 0.0
2446: 0.0
2637: 0.0
1898: 0.0
1995: 0.0
2202: 0.0
2293: 0.0
2064: 0.0
2605: 0.0
2154: 0.0
2456: 0.0
2108: 0.0
2553: 0.0
1862: 0.0
2285: 0.0
2287: 0.0
1863: 0.0
2316: 0.0
2332: 0.0
2118: 0.0
1817: 0.0
2157: 0.0
2112: 0.0
2402: 0.0
2355: 0.0
2328: 0.0
2356: 0.0
2666: 0.0
1846: 0.0
1879: 0.0
2663: 0.0
2441: 0.0
2492: 0.0
2519: 0.0
2421: 0.0
2086: 0.0
2100: 0.0
2653: 0.0
2121: 0.0
2358: 0.0
2656: 0.0
2235: 0.0
2434: 0.0
2140: 0.0
1975: 0.0
2052: 0.0
2004: 0.0
2294: 0.0
2623: 0.0
2335: 0.0
1920: 0.0
2584: 0.0
1834: 0.0
1919: 0.0
1833: 0.0
2581: 0.0
1867: 0.0
1852: 0.0
2061: 0.0
2053: 0.0
2209: 0.0
2197: 0.0
2405: 0.0
2263: 0.0


In [72]:
# Display topics

for topic in range(len(topics_final)):
    print("Topic" + str(topic) + ":")
    for term in topics_final[topic]:
        print(term)
    print('\n')

Topic0:
apartment
in
amsterdam
with
the


Topic1:
apartment
in
amsterdam
with
the


Topic2:
apartment
in
amsterdam
with
the




### below working with floats approach
https://stackoverflow.com/questions/42051184/latent-dirichlet-allocation-lda-in-spark

In [19]:
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

# Load and parse the data
data = sc.textFile("./sample_lda_data.txt")
parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# Index documents with unique IDs
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()



In [25]:
corpus.toDF().show(10,False)

+---+---------------------------------------------+
|_1 |_2                                           |
+---+---------------------------------------------+
|0  |[1.0,2.0,6.0,0.0,2.0,3.0,1.0,1.0,0.0,0.0,3.0]|
|1  |[1.0,3.0,0.0,1.0,3.0,0.0,0.0,2.0,0.0,0.0,1.0]|
|2  |[1.0,4.0,1.0,0.0,0.0,4.0,9.0,0.0,1.0,2.0,0.0]|
|3  |[2.0,1.0,0.0,3.0,0.0,0.0,5.0,0.0,2.0,3.0,9.0]|
|4  |[3.0,1.0,1.0,9.0,3.0,0.0,2.0,0.0,0.0,1.0,3.0]|
|5  |[4.0,2.0,0.0,3.0,4.0,5.0,1.0,1.0,1.0,4.0,0.0]|
|6  |[2.0,1.0,0.0,3.0,0.0,0.0,5.0,0.0,2.0,2.0,9.0]|
|7  |[1.0,1.0,1.0,9.0,2.0,1.0,2.0,0.0,0.0,1.0,3.0]|
|8  |[4.0,4.0,0.0,3.0,4.0,2.0,1.0,3.0,0.0,0.0,0.0]|
|9  |[2.0,8.0,2.0,0.0,3.0,0.0,2.0,0.0,2.0,7.0,2.0]|
+---+---------------------------------------------+
only showing top 10 rows



In [21]:
corpus.toDF().printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: vector (nullable = true)



In [22]:
corpus.toDF().show(10)
# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize())
      + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

+---+--------------------+
| _1|                  _2|
+---+--------------------+
|  0|[1.0,2.0,6.0,0.0,...|
|  1|[1.0,3.0,0.0,1.0,...|
|  2|[1.0,4.0,1.0,0.0,...|
|  3|[2.0,1.0,0.0,3.0,...|
|  4|[3.0,1.0,1.0,9.0,...|
|  5|[4.0,2.0,0.0,3.0,...|
|  6|[2.0,1.0,0.0,3.0,...|
|  7|[1.0,1.0,1.0,9.0,...|
|  8|[4.0,4.0,0.0,3.0,...|
|  9|[2.0,8.0,2.0,0.0,...|
+---+--------------------+
only showing top 10 rows

Learned topics (as distributions over vocab of 11 words):
Topic 0:
 3.4375543252729233
 5.627205295421971
 5.1071077404596945
 28.5811901504307
 3.021725594081267
 2.8325826294464127
 13.268104957996346
 0.7738939744618656
 2.2705047662424396
 4.447564961062405
 18.512210539204546
Topic 1:
 11.516720318003484
 11.765497162662928
 4.493294264073986
 3.1875705472492806
 5.7212631071986335
 5.806064540980248
 13.024024782014733
 2.0097314808376345
 4.36399447641905
 12.462556616039091
 11.476154999526333
Topic 2:
 11.045725356723594
 11.6072975419151
 2.399597995466321
 8.23123930232002
 16.2

In [169]:
tokenizer = Tokenizer(inputCol="name", outputCol="raw_words")

In [170]:
wordsData = tokenizer.transform(result) #corpus

In [171]:
locale = spark._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))

StopWordsRemover.loadDefaultStopWords("english")

remover = StopWordsRemover(inputCol="raw_words", outputCol="words2")
wordsData = remover.transform(wordsData)

In [172]:
hashingTF = HashingTF(inputCol="words2", outputCol="rawFeatures", numFeatures=10000)
featurizedData = hashingTF.transform(wordsData)

In [173]:
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)
idfModel = idf.fit(featurizedData)

tfidf = idfModel.transform(featurizedData)

In [183]:
tfidf.select("features").show(10, False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                                            |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|(10000,[494,1692,1789,2659,7293,8048,8562,9263],[2.445567235227968,2.8094747144167127,3.6888794541139363,5.2799682278798405,3.247046701834897,5.983267779903803,2.46837734783679,8.083328608786376])|
|(10000,[4235,4744,8704,9743],[7.102499355774649,4.745189363091357,2.909063890908317,8.776475789346321])                                                                                             |
|(100

In [36]:
# topics = ldaModel.topicsMatrix()
# vocabArray = model.vocabulary

In [None]:
# wordNumbers = 10  # number of words per topic
# topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers))

# def topic_render(topic):  # specify vector id of words to actual words
#     terms = topic[0]
#     result = []
#     for i in range(wordNumbers):
#         term = vocabArray[terms[i]]
#         result.append(term)
#     return result

# topics_final = topicIndices.map(lambda topic: topic_render(topic)).collect()

# for topic in range(len(topics_final)):
#     print ("Topic" + str(topic) + ":")
#     for term in topics_final[topic]:
#         print (term)
#     print ('\n')