In [1]:
# Notebook bootstrap: locate Spark, import the PySpark pieces used below,
# and switch to the data directory.
#
# The Spark install and data locations are machine-specific, so both can be
# overridden through environment variables. The defaults are the paths this
# notebook was originally written against.
import os

import findspark

SPARK_HOME = os.environ.get('SPARK_HOME', '/home/ubuntu/spark-2.4.5-bin-hadoop2.7')
DATA_DIR = os.environ.get('NLP_DATA_DIR', '/home/ubuntu/data')

# findspark.init must run before the pyspark imports below so they resolve.
findspark.init(SPARK_HOME)
from pyspark.sql import SparkSession

from pyspark.ml.linalg import Vector
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, NGram
from pyspark.ml.feature import HashingTF, IDF, CountVectorizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Work relative to the data directory (raises if the directory does not exist).
os.chdir(DATA_DIR)

In [2]:
# Obtain the Spark session for this notebook under the application name 'NLP'.
spark = (
    SparkSession.builder
    .appName('NLP')
    .getOrCreate()
)

In [3]:
# Toy corpus: two space-separated sentences and one comma-separated sentence,
# each tagged with an integer id.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat"),
    ],
    schema=["id", "sentence"],
)
sent_df = sentenceDataFrame  # short alias used by the cells that follow
sent_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- sentence: string (nullable = true)



In [4]:
# Preview the corpus (default row limit, long cells truncated).
sent_df.show(n=20, truncate=True)

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish Java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [5]:
# Two tokenizers reading 'sentence' and writing 'words':
#   - tokken:       the plain Tokenizer
#   - regex_tokken: RegexTokenizer using the pattern \W
tokken = Tokenizer(outputCol='words', inputCol='sentence')
regex_tokken = RegexTokenizer(
    pattern='\\W',
    inputCol='sentence',
    outputCol='words',
)

# UDF returning the length of a token array as an integer column.
count_tokkens = udf(f=lambda tokens: len(tokens), returnType=IntegerType())

In [6]:
# Apply the plain Tokenizer to the corpus and preview the new 'words' column.
tokkenized_tokken = tokken.transform(dataset=sent_df)
tokkenized_tokken.show(n=20, truncate=True)

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish Java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [7]:
# Append a 'tokens' column holding the number of words per row and display it.
(
    tokkenized_tokken
    .withColumn('tokens', count_tokkens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish Java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [8]:
# Re-tokenise the same corpus with the regex tokenizer.
# NOTE: this rebinds `tokkenized_tokken`, replacing the plain-Tokenizer result.
tokkenized_tokken = regex_tokken.transform(dataset=sent_df)
tokkenized_tokken.show(n=20, truncate=True)

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish Java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [9]:
# Count the words per row for the current `tokkenized_tokken` frame and display.
(
    tokkenized_tokken
    .withColumn('tokens', count_tokkens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish Java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [10]:
# Pre-tokenised sample rows used to demonstrate stop-word removal.
sentenceData = spark.createDataFrame(
    data=[
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    schema=["id", "tokens"],
)
sentenceData.show(n=20, truncate=True)

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, red...|
|  1|[Mary, had, a, li...|
+---+--------------------+



In [11]:
# Filter the 'tokens' arrays through StopWordsRemover into a new 'filtered'
# column, and print the result without truncating cell contents.
remover = StopWordsRemover(outputCol="filtered", inputCol="tokens")
remover.transform(dataset=sentenceData).show(truncate=False)

+---+----------------------------+--------------------+
|id |tokens                      |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



In [12]:
# Pre-tokenised sentences used as input for the n-gram demonstration.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
    schema=["id", "words"],
)

# n=2: build two-word n-grams from 'words' into the 'ngrams' column.
ngram = NGram(inputCol="words", outputCol="ngrams", n=2)

ngramDataFrame = ngram.transform(dataset=wordDataFrame)
(
    ngramDataFrame
    .select("ngrams")
    .show(truncate=False)
)

+------------------------------------------------------------------+
|ngrams                                                            |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



In [13]:
# Rebuild the toy corpus for the TF-IDF section; here every sentence is
# space-separated. NOTE: this rebinds both `sentenceDataFrame` and `sent_df`.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic regression models are neat"),
    ],
    schema=["id", "sentence"],
)
sent_df = sentenceDataFrame
sent_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- sentence: string (nullable = true)



In [14]:
# Tokenise the rebuilt corpus: 'sentence' -> 'words'.
tokken = Tokenizer(outputCol='words', inputCol='sentence')
tokkenized_tokken = tokken.transform(dataset=sent_df)
tokkenized_tokken.show(n=20, truncate=True)

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish Java could...|[i, wish, java, c...|
|  2|Logistic regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [15]:
# Hash each row's 'words' into a term-frequency vector stored in 'rawFeatures'.
hashing_tf = HashingTF(outputCol='rawFeatures', inputCol='words')
featureise_df = hashing_tf.transform(dataset=tokkenized_tokken)
featureise_df.show(n=20, truncate=True)

+---+--------------------+--------------------+--------------------+
| id|            sentence|               words|         rawFeatures|
+---+--------------------+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[24417,49...|
|  1|I wish Java could...|[i, wish, java, c...|(262144,[20719,24...|
|  2|Logistic regressi...|[logistic, regres...|(262144,[13671,91...|
+---+--------------------+--------------------+--------------------+



In [16]:
# Fit IDF on the raw term-frequency vectors, then use the fitted model to
# produce the rescaled 'features' column.
idf = IDF(outputCol='features', inputCol='rawFeatures')
idf_model = idf.fit(dataset=featureise_df)
rescaled_data = idf_model.transform(dataset=featureise_df)
rescaled_data.show(n=20, truncate=True)

+---+--------------------+--------------------+--------------------+--------------------+
| id|            sentence|               words|         rawFeatures|            features|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[24417,49...|(262144,[24417,49...|
|  1|I wish Java could...|[i, wish, java, c...|(262144,[20719,24...|(262144,[20719,24...|
|  2|Logistic regressi...|[logistic, regres...|(262144,[13671,91...|(262144,[13671,91...|
+---+--------------------+--------------------+--------------------+--------------------+



In [17]:
# Print the full (untruncated) feature vector for every row.
(
    rescaled_data
    .select('id', 'features')
    .show(truncate=False)
)

+---+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                                                                                        |
+---+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0  |(262144,[24417,49304,73197,91137,234657],[0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                     |
|1  |(262144,[20719,24417,55551,116873,147765,162369,192310],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])|
|2  |

In [19]:
# Two tiny documents sharing the vocabulary {a, b, c}.
df = spark.createDataFrame(
    data=[
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" ")),
    ],
    schema=["id", "words"],
)

# Count-based vectoriser over 'words', configured with vocabSize=3 and minDF=2.0.
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=3,
    minDF=2.0,
)
cv_model = cv.fit(dataset=df)
cv_model.transform(dataset=df).show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

