In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, RegexTokenizer, Word2Vec
from pyspark.sql.functions import format_number, col

In [0]:
spark = SparkSession.builder.appName('nlp_part2').getOrCreate()
spark

In [0]:
sent_df = spark.createDataFrame([
    (0.0,'Hi I just heard about Spark'),
    (0.0,'I wish Java could use case classes'),
    (1.0,'Logistic regression models are neat')
],['label','sentence'])
sent_df.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I just heard a...|
|  0.0|I wish Java could...|
|  1.0|Logistic regressi...|
+-----+--------------------+



In [0]:
tokenizer = RegexTokenizer(inputCol='sentence',outputCol='tokens',pattern='\\W')
tokenized_sent_df = tokenizer.transform(sent_df)
tokenized_sent_df.show(truncate=False)

+-----+-----------------------------------+------------------------------------------+
|label|sentence                           |tokens                                    |
+-----+-----------------------------------+------------------------------------------+
|0.0  |Hi I just heard about Spark        |[hi, i, just, heard, about, spark]        |
|0.0  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|
|1.0  |Logistic regression models are neat|[logistic, regression, models, are, neat] |
+-----+-----------------------------------+------------------------------------------+



In [0]:
tf = HashingTF(inputCol='tokens',outputCol='rawFeatures')
tokenized_sent_df = tf.transform(tokenized_sent_df)
tokenized_sent_df.show(truncate=False)

+-----+-----------------------------------+------------------------------------------+------------------------------------------------------------------------------------+
|label|sentence                           |tokens                                    |rawFeatures                                                                         |
+-----+-----------------------------------+------------------------------------------+------------------------------------------------------------------------------------+
|0.0  |Hi I just heard about Spark        |[hi, i, just, heard, about, spark]        |(262144,[18700,19036,33808,66273,97171,173558],[1.0,1.0,1.0,1.0,1.0,1.0])           |
|0.0  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|(262144,[19036,20719,55551,58672,98717,109547,192310],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|1.0  |Logistic regression models are neat|[logistic, regression, models, are, neat] |(262144,[46243,58267,91006,160975,190884],[1.0,1.0,1.0

In [0]:
idf = IDF(inputCol='rawFeatures',outputCol='features',minDocFreq=2)
idf_model = idf.fit(tokenized_sent_df)
tokenized_sent_df = idf_model.transform(tokenized_sent_df)
tokenized_sent_df.show(truncate=False)

+-----+-----------------------------------+------------------------------------------+------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|label|sentence                           |tokens                                    |rawFeatures                                                                         |features                                                                                            |
+-----+-----------------------------------+------------------------------------------+------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|0.0  |Hi I just heard about Spark        |[hi, i, just, heard, about, spark]        |(262144,[18700,19036,33808,66273,97171,173558],[1.0,1.0,1.0,1.0,1.0,1.0])           |(262144,[1

In [0]:
tokenized_sent_df.select('label','features').show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------+
|label|features                                                                                            |
+-----+----------------------------------------------------------------------------------------------------+
|0.0  |(262144,[18700,19036,33808,66273,97171,173558],[0.0,0.28768207245178085,0.0,0.0,0.0,0.0])           |
|0.0  |(262144,[19036,20719,55551,58672,98717,109547,192310],[0.28768207245178085,0.0,0.0,0.0,0.0,0.0,0.0])|
|1.0  |(262144,[46243,58267,91006,160975,190884],[0.0,0.0,0.0,0.0,0.0])                                    |
+-----+----------------------------------------------------------------------------------------------------+



In [0]:
words_df = spark.createDataFrame([
    (0,"a b c".split()),
    (1,"a b c a".split()),
    (2,"a c b a b".split())
],['id','words'])
words_df.show()

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|   [a, b, c, a]|
|  2|[a, c, b, a, b]|
+---+---------------+



In [0]:
vectorizer = CountVectorizer(inputCol='words',outputCol='features',vocabSize=3,minDF=2)
cv_model = vectorizer.fit(words_df)
cv_model

CountVectorizerModel: uid=CountVectorizer_54378f8705a6, vocabularySize=3

In [0]:
vectorized_words_df = cv_model.transform(words_df)
vectorized_words_df.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, c, a]   |(3,[0,1,2],[2.0,1.0,1.0])|
|2  |[a, c, b, a, b]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+



In [0]:
sent = ("b a " * 100 + "a c b " * 10).split(" ")

words_df = spark.createDataFrame([
    (sent,), 
    (sent,)
], ["sentence"])
words_df.show()

+--------------------+
|            sentence|
+--------------------+
|[b, a, b, a, b, a...|
|[b, a, b, a, b, a...|
+--------------------+



In [0]:
wv = Word2Vec(inputCol='sentence',vectorSize=5,seed=48,windowSize=10,maxSentenceLength=500,minCount=1,outputCol='embeddings')
wv_model = wv.fit(words_df)
wv_model

Word2VecModel: uid=Word2Vec_0f743b1716f7, numWords=4, vectorSize=5

In [0]:
wv_model.getVectors().show(truncate=False)

+----+----------------------------------------------------------------------------------------------------------+
|word|vector                                                                                                    |
+----+----------------------------------------------------------------------------------------------------------+
|a   |[0.43426597118377686,-0.12470865249633789,-0.5958183407783508,-0.19327032566070557,0.7723482847213745]    |
|b   |[0.6657069325447083,-0.1493173986673355,-0.27567753195762634,0.29769062995910645,0.42605358362197876]     |
|c   |[0.35067981481552124,-0.030499424785375595,-0.35829445719718933,-0.025705864652991295,0.40814152359962463]|
|    |[0.07161520421504974,0.06222974881529808,-0.14337986707687378,-0.04092687740921974,0.1599714756011963]    |
+----+----------------------------------------------------------------------------------------------------------+



In [0]:
wv_model.findSynonyms("b",2).select(['word',format_number(col('similarity'),5).alias('similarity')]).show()

+----+----------+
|word|similarity|
+----+----------+
|   c|   0.86205|
|   a|   0.75559|
+----+----------+



In [0]:
wv_model.findSynonyms("c",2).select(['word',format_number(col('similarity'),5).alias('similarity')]).show()

+----+----------+
|word|similarity|
+----+----------+
|   a|   0.97499|
|    |   0.91277|
+----+----------+



In [0]:
wv_model.findSynonyms("a",2).select(['word',format_number(col('similarity'),5).alias('similarity')]).show()

+----+----------+
|word|similarity|
+----+----------+
|   c|   0.97499|
|    |   0.92265|
+----+----------+



In [0]:
words_df = wv_model.transform(words_df)
words_df.show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------+
|sentence                                                                                                                                                                                                 