In [7]:
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql import SQLContext, SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import *

In [8]:
# Attach to the already-running SparkContext if one exists, otherwise start one.
sc = SparkContext.getOrCreate()
# SQL entry point built on that context; the loading cell reads the input through it.
sqlContext = SQLContext(sparkContext=sc)

In [9]:
# Input corpus, read with the CSV reader; the first line of the file is treated
# as the header and supplies the column name.
documentsPath = "../data/documents.txt"
documents = sqlContext.read.load(documentsPath, format="csv", header="true")

In [10]:
# Preview up to 10 rows without truncating long cell values.
documents.show(n=10, truncate=False)

+-------------------------------+
|text                           |
+-------------------------------+
|The sun is a star              |
|The earth is a planet          |
|The moon is a satellite’s earth|
|The sun is yellow              |
+-------------------------------+



In [11]:
# Lower-case the text and tokenise it on runs of whitespace.
# Trimming first and splitting on the regex \s+ (rather than a single literal
# space) prevents empty-string tokens from leading, trailing or repeated spaces,
# and also splits on tabs. Such empty tokens would otherwise be learned as
# vocabulary entries by the Word2Vec step, which is configured with minCount=0.
documents = documents.withColumn(
    "text_splitted",
    split(trim(lower(col("text"))), r"\s+"),
)

In [None]:
# Word2Vec
# Fit an embedding model on the tokenised text:
#   vectorSize=100 -> dimensionality of the learned vectors
#   minCount=0     -> no minimum frequency, so every token stays in the vocabulary
#   maxIter=100    -> number of training iterations
#   seed=1         -> pins the random initialisation so a fresh
#                     "Restart & Run All" starts from the same state; the value
#                     matches the seed used for KMeans later in the notebook
# The fitted model writes its output to the "features" column.
word2Vec = Word2Vec(
    vectorSize=100,
    minCount=0,
    maxIter=100,
    seed=1,
    inputCol="text_splitted",
    outputCol="features",
)
model = word2Vec.fit(documents)

In [None]:
# Apply the fitted Word2Vec model; adds the "features" column to each row.
result = model.transform(dataset=documents)

In [None]:
# Sanity check on the embedding: the 5 vocabulary words closest to "sun".
synonyms = model.findSynonyms(word="sun", num=5)
synonyms.show()

In [None]:
# KMeans Clustering
# Tunables for the clustering step.
numIterations = 200   # upper bound on KMeans iterations
numberClusters = 2    # number of clusters (k)

# Seeded estimator; it is fitted on the Word2Vec output in `result`.
kmeans = KMeans(k=numberClusters, maxIter=numIterations, seed=1)
kmeans_model = kmeans.fit(result)

In [None]:
# Make predictions
# Run the fitted KMeans model over the same rows it was trained on.
predictions = kmeans_model.transform(dataset=result)

In [None]:
# Preview the clustered rows (show()'s defaults written out: 20 rows, truncated cells).
predictions.show(n=20, truncate=True)

In [180]:
# Save Output
# Persist only the original text and its assigned cluster as CSV with a header
# row, replacing any previous output at the same path.
predictionsPath = "../data/predictions.txt"
(
    predictions
    .select("text", "prediction")
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv(predictionsPath)
)