In [1]:
from __future__ import print_function
import findspark
import os
import sys
os.environ['SPARK_HOME'] = r'/Users/subham/Downloads/spark-3.0.0-bin-hadoop2.7'
findspark.init()
findspark.find()
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import IDF

In [2]:
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("Tf-Idf")\
        .getOrCreate()

## TFIDF

In [3]:
documentDF = spark.createDataFrame([
        (0,"Let's see an example of countVectorizer"),
        (1,"We will use pyspark library"),
        (2,"countVectorizer is important for NLP")
    ], ["index","sentence"])

In [4]:
documentDF.show(truncate=False)

+-----+---------------------------------------+
|index|sentence                               |
+-----+---------------------------------------+
|0    |Let's see an example of countVectorizer|
|1    |We will use pyspark library            |
|2    |countVectorizer is important for NLP   |
+-----+---------------------------------------+



In [5]:
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")


In [6]:
wordsDf = tokenizer.transform(documentDF)

In [7]:
wordsDf.show(truncate=False)

+-----+---------------------------------------+----------------------------------------------+
|index|sentence                               |words                                         |
+-----+---------------------------------------+----------------------------------------------+
|0    |Let's see an example of countVectorizer|[let's, see, an, example, of, countvectorizer]|
|1    |We will use pyspark library            |[we, will, use, pyspark, library]             |
|2    |countVectorizer is important for NLP   |[countvectorizer, is, important, for, nlp]    |
+-----+---------------------------------------+----------------------------------------------+



In [8]:
TF = HashingTF(inputCol="words", outputCol="term_frequency")

In [9]:

tf_data = TF.transform(wordsDf)

In [10]:
tf_data.show(truncate=False)

+-----+---------------------------------------+----------------------------------------------+---------------------------------------------------------------------------+
|index|sentence                               |words                                         |term_frequency                                                             |
+-----+---------------------------------------+----------------------------------------------+---------------------------------------------------------------------------+
|0    |Let's see an example of countVectorizer|[let's, see, an, example, of, countvectorizer]|(262144,[8538,75750,143202,152540,165615,219087],[1.0,1.0,1.0,1.0,1.0,1.0])|
|1    |We will use pyspark library            |[we, will, use, pyspark, library]             |(262144,[89356,98717,133073,156084,248899],[1.0,1.0,1.0,1.0,1.0])          |
|2    |countVectorizer is important for NLP   |[countvectorizer, is, important, for, nlp]    |(262144,[15391,76307,106776,106841,152540],[1.0,1.0

In [11]:
idf = IDF(inputCol="term_frequency", outputCol="idf")

In [12]:
idf_Model = idf.fit(tf_data)

In [13]:
new_Data = idf_Model.transform(tf_data)

In [14]:
new_Data.select("index", "idf").show(truncate=False)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|index|idf                                                                                                                                                                   |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |(262144,[8538,75750,143202,152540,165615,219087],[0.6931471805599453,0.6931471805599453,0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453])|
|1    |(262144,[89356,98717,133073,156084,248899],[0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])                          |
|2    |(262144,[15391,76307,106776,106841,152540],[0.6931471805599453,0.6931471805599453,0.6931471805599453,0.693147180559945

In [None]:
spark.stop()

## CountVectorizer

In [20]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import CountVectorizer

In [15]:
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("countVectorizer")\
        .getOrCreate()

In [16]:
documentDF = spark.createDataFrame([
        ("Let's see an example of countVectorizer".split(" "),),
        ("We will use pyspark library my name is my name is  ".split(" "),),
        ("countVectorizer is important for NLP".split(" "),)
    ], ["sentence"])

In [17]:
documentDF.show()

+--------------------+
|            sentence|
+--------------------+
|[Let's, see, an, ...|
|[We, will, use, p...|
|[countVectorizer,...|
+--------------------+



In [18]:
documentDF.show(truncate=False)

+-----------------------------------------------------------------+
|sentence                                                         |
+-----------------------------------------------------------------+
|[Let's, see, an, example, of, countVectorizer]                   |
|[We, will, use, pyspark, library, my, name, is, my, name, is, , ]|
|[countVectorizer, is, important, for, NLP]                       |
+-----------------------------------------------------------------+



In [21]:
count_vector = CountVectorizer(inputCol="sentence", outputCol="count_vector")
count_vector

CountVectorizer_3fc26f58e1ce

In [22]:
model = count_vector.fit(documentDF)

In [23]:
result = model.transform(documentDF)

In [24]:
result.show(truncate=False)

+-----------------------------------------------------------------+----------------------------------------------------------------+
|sentence                                                         |count_vector                                                    |
+-----------------------------------------------------------------+----------------------------------------------------------------+
|[Let's, see, an, example, of, countVectorizer]                   |(18,[1,7,8,11,13,14],[1.0,1.0,1.0,1.0,1.0,1.0])                 |
|[We, will, use, pyspark, library, my, name, is, my, name, is, , ]|(18,[0,2,3,4,5,6,9,10,12],[2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0])|
|[countVectorizer, is, important, for, NLP]                       |(18,[0,1,15,16,17],[1.0,1.0,1.0,1.0,1.0])                       |
+-----------------------------------------------------------------+----------------------------------------------------------------+

