In [153]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [154]:
# Reuse the active SparkSession if there is one, otherwise start a new one.
# `SparkSession.builder` is the documented entry point for obtaining a Builder,
# so the nested `Builder` class does not need to be instantiated by hand.
spark = SparkSession.builder.getOrCreate()

# Text Pre-processing

In [155]:
# Toy corpus: five short English sentences, each paired with an integer id.
inputdf = spark.createDataFrame(
    [
        (0, 'I am learning Spark.'),
        (1, 'I love machine learning and deep learning.'),
        (2, 'I have completed master in big data analytics.'),
        (3, 'What about you my friend.'),
        (4, 'I need a job.'),
    ],
    schema=['id', 'Document'],
)

In [156]:
# Print the column names, types and nullability of the input DataFrame.
inputdf.printSchema()

root
 |-- id: long (nullable = true)
 |-- Document: string (nullable = true)



In [157]:
# Preview the first five rows as a pandas DataFrame. The row cap is applied in
# Spark before converting, so only the previewed rows are brought to the driver
# (calling toPandas() on the full frame would collect every row first).
inputdf.limit(5).toPandas()

Unnamed: 0,id,Document
0,0,I am learning Spark.
1,1,I love machine learning and deep learning.
2,2,I have completed master in big data analytics.
3,3,What about you my friend.
4,4,I need a job.


In [158]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import NGram
from pyspark.ml.feature import CountVectorizer

In [159]:
# Basic tokenizer: as the printed table shows, tokens come out lower-cased and
# punctuation stays attached to the neighbouring word (e.g. "spark.").
tokenizer = Tokenizer(inputCol='Document', outputCol='words')
tokenizerdf = tokenizer.transform(inputdf)

tokenizerdf.select('id', 'Document', 'words').show(truncate=False)

+---+----------------------------------------------+-------------------------------------------------------+
|id |Document                                      |words                                                  |
+---+----------------------------------------------+-------------------------------------------------------+
|0  |I am learning Spark.                          |[i, am, learning, spark.]                              |
|1  |I love machine learning and deep learning.    |[i, love, machine, learning, and, deep, learning.]     |
|2  |I have completed master in big data analytics.|[i, have, completed, master, in, big, data, analytics.]|
|3  |What about you my friend.                     |[what, about, you, my, friend.]                        |
|4  |I need a job.                                 |[i, need, a, job.]                                     |
+---+----------------------------------------------+-------------------------------------------------------+



In [160]:
# Regex-based tokenizer: the pattern matches the separators (a run of
# whitespace, a comma, or a period), so punctuation no longer ends up in tokens.
regexTokenizer = RegexTokenizer(
    inputCol='Document',
    outputCol='words',
    pattern="\\s+|,|\\.",
)
tokenizedDF = regexTokenizer.transform(inputdf)

tokenizedDF.select('id', 'Document', 'words').show(truncate=False)

+---+----------------------------------------------+------------------------------------------------------+
|id |Document                                      |words                                                 |
+---+----------------------------------------------+------------------------------------------------------+
|0  |I am learning Spark.                          |[i, am, learning, spark]                              |
|1  |I love machine learning and deep learning.    |[i, love, machine, learning, and, deep, learning]     |
|2  |I have completed master in big data analytics.|[i, have, completed, master, in, big, data, analytics]|
|3  |What about you my friend.                     |[what, about, you, my, friend]                        |
|4  |I need a job.                                 |[i, need, a, job]                                     |
+---+----------------------------------------------+------------------------------------------------------+



In [161]:
# Transformer that reads the `words` column and writes `words_filtered`.
# It is built without an explicit stop-word list.
stopwordsRemover = StopWordsRemover(inputCol='words', outputCol='words_filtered')

# Built-in English stop-word list, kept in a variable for inspection;
# it is not passed to the remover in this cell.
stopwords = StopWordsRemover.loadDefaultStopWords('english')

In [162]:
# Apply the stop-word remover to the regex-tokenized frame; the filtered
# tokens are added as the `words_filtered` column next to the original `words`.
cleandf = stopwordsRemover.transform(tokenizedDF)
# truncate = True shortens long cell values in the printed table.
cleandf.show(truncate = True)

+---+--------------------+--------------------+--------------------+
| id|            Document|               words|      words_filtered|
+---+--------------------+--------------------+--------------------+
|  0|I am learning Spark.|[i, am, learning,...|   [learning, spark]|
|  1|I love machine le...|[i, love, machine...|[love, machine, l...|
|  2|I have completed ...|[i, have, complet...|[completed, maste...|
|  3|What about you my...|[what, about, you...|            [friend]|
|  4|       I need a job.|   [i, need, a, job]|         [need, job]|
+---+--------------------+--------------------+--------------------+



In [163]:
# Bigrams (n = 2). The n-grams are built from the unfiltered `words` column,
# so stop words such as "i" and "am" are part of the resulting pairs.
ngram = NGram(n=2, inputCol='words', outputCol='bigrams')
ngramDF = ngram.transform(cleandf)

ngramDF.select('Document', 'bigrams').show(truncate=True)

+--------------------+--------------------+
|            Document|             bigrams|
+--------------------+--------------------+
|I am learning Spark.|[i am, am learnin...|
|I love machine le...|[i love, love mac...|
|I have completed ...|[i have, have com...|
|What about you my...|[what about, abou...|
|       I need a job.|[i need, need a, ...|
+--------------------+--------------------+



In [164]:
# Trigrams (n = 3), again built from the unfiltered `words` column and appended
# to the frame that already carries the `bigrams` column.
ngram = NGram(n=3, inputCol='words', outputCol='trigrams')

# Drop any `trigrams` column left by a previous run of this cell before
# transforming: the transformer refuses to write to an output column that
# already exists, so without this the cell fails when executed a second time.
# drop() is a no-op when the column is absent, so the first run is unaffected.
ngramDF = ngram.transform(ngramDF.drop('trigrams'))

ngramDF.select('Document', 'trigrams').show(truncate=True)

+--------------------+--------------------+
|            Document|            trigrams|
+--------------------+--------------------+
|I am learning Spark.|[i am learning, a...|
|I love machine le...|[i love machine, ...|
|I have completed ...|[i have completed...|
|What about you my...|[what about you, ...|
|       I need a job.|[i need a, need a...|
+--------------------+--------------------+



In [165]:
# Show the first row in full. first() fetches a single Row instead of
# collecting every row to the driver just to index [0]; it returns None
# (rather than raising IndexError) if the DataFrame is empty.
ngramDF.select('Document', 'bigrams', 'trigrams').first()

Row(Document='I am learning Spark.', bigrams=['i am', 'am learning', 'learning spark'], trigrams=['i am learning', 'am learning spark'])

# TF-IDF

In [166]:
# Term-frequency step: learn a vocabulary from the stop-word-filtered tokens.
# `model` is the fitted vectorizer that writes count vectors to `features_tf`.
countVectorizer = CountVectorizer(
    inputCol='words_filtered',
    outputCol='features_tf',
)
model = countVectorizer.fit(cleandf)

In [167]:
# Show the vocabulary learned by the fitted CountVectorizer model.
print(model.vocabulary)

['learning', 'spark', 'job', 'completed', 'machine', 'deep', 'big', 'need', 'master', 'love', 'data', 'analytics', 'friend']


In [168]:
# Add the term-frequency vectors as the `features_tf` column.
# `cleandf` is reassigned from itself, so first drop any `features_tf` column
# left by an earlier run of this cell: the model refuses to write to an output
# column that already exists, which made the cell fail on a second execution.
# drop() is a no-op when the column is absent, so the first run is unaffected.
cleandf = model.transform(cleandf.drop('features_tf'))
cleandf.show(truncate=True)

+---+--------------------+--------------------+--------------------+--------------------+
| id|            Document|               words|      words_filtered|         features_tf|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|I am learning Spark.|[i, am, learning,...|   [learning, spark]|(13,[0,1],[1.0,1.0])|
|  1|I love machine le...|[i, love, machine...|[love, machine, l...|(13,[0,4,5,9],[2....|
|  2|I have completed ...|[i, have, complet...|[completed, maste...|(13,[3,6,8,10,11]...|
|  3|What about you my...|[what, about, you...|            [friend]|     (13,[12],[1.0])|
|  4|       I need a job.|   [i, need, a, job]|         [need, job]|(13,[2,7],[1.0,1.0])|
+---+--------------------+--------------------+--------------------+--------------------+



In [169]:
# Print the first document's term-frequency vector in dense form.
# Only the needed column and a single row are fetched, instead of collecting
# the whole DataFrame to the driver and indexing into the result.
print(cleandf.select('features_tf').first()[0].toArray())

[1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


# Inverse Document Frequency (IDF)

In [170]:
from pyspark.ml.feature import HashingTF, IDF

In [171]:
# IDF estimator: reads the term-frequency vectors in `features_tf` and will
# write the re-weighted vectors to `features_tf_idf`.
idf = IDF(
    inputCol='features_tf',
    outputCol='features_tf_idf',
)
# Fitting on the corpus yields the model that performs the re-weighting.
idfModel = idf.fit(cleandf)

In [172]:
# Apply the fitted IDF model to obtain the TF-IDF vectors.
# The result is stored under its own name: rebinding `idfModel` to the
# transformed DataFrame discarded the fitted model and made this cell fail
# when executed a second time. `idfModel` now stays the fitted model.
tfidfDF = idfModel.transform(cleandf)
tfidfDF.show(truncate=True)

+---+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|            Document|               words|      words_filtered|         features_tf|     features_tf_idf|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|I am learning Spark.|[i, am, learning,...|   [learning, spark]|(13,[0,1],[1.0,1.0])|(13,[0,1],[0.6931...|
|  1|I love machine le...|[i, love, machine...|[love, machine, l...|(13,[0,4,5,9],[2....|(13,[0,4,5,9],[1....|
|  2|I have completed ...|[i, have, complet...|[completed, maste...|(13,[3,6,8,10,11]...|(13,[3,6,8,10,11]...|
|  3|What about you my...|[what, about, you...|            [friend]|     (13,[12],[1.0])|(13,[12],[1.09861...|
|  4|       I need a job.|   [i, need, a, job]|         [need, job]|(13,[2,7],[1.0,1.0])|(13,[2,7],[1.0986...|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+

