In [1]:
import findspark

# findspark.init() has to run BEFORE anything from pyspark is imported: its job
# is to locate the Spark installation and make pyspark importable. Calling it
# after the pyspark import (as before) only works when pyspark already happens
# to be on sys.path, and fails with ImportError otherwise.
findspark.init()

from pyspark.sql import SparkSession  # noqa: E402  (must follow findspark.init())

# getOrCreate() returns the already-running session when one exists, so
# re-running this cell does not try to start a second session.
spark = SparkSession.builder.appName('Feature Extraction and Transformation').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/12 14:59:15 WARN Utils: Your hostname, omar, resolves to a loopback address: 127.0.1.1; using 192.168.1.4 instead (on interface wlo1)
25/12/12 14:59:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/12 14:59:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.ml.feature import Tokenizer, CountVectorizer

In [5]:
# Build a small demo DataFrame: one integer id and one raw sentence per row.
sentenceDataFrame = spark.createDataFrame(
    [
        (1, "Spark is a distributed computing system."),
        (2, "It provides interfaces for multiple languages"),
        (3, "Spark is built on top of Hadoop"),
    ],
    ["id", "sentence"],
)

# truncate=False prints the full sentence instead of cutting long cells short.
sentenceDataFrame.show(truncate=False)

+---+---------------------------------------------+
|id |sentence                                     |
+---+---------------------------------------------+
|1  |Spark is a distributed computing system.     |
|2  |It provides interfaces for multiple languages|
|3  |Spark is built on top of Hadoop              |
+---+---------------------------------------------+



In [9]:
# Tokenize the 'sentence' column into a new 'words' array column.
# As the output below shows, tokens are lower-cased and punctuation stays
# attached to its word (e.g. "system.").
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
token_df = tokenizer.transform(sentenceDataFrame)

token_df.show(truncate=False)

+---+---------------------------------------------+----------------------------------------------------+
|id |sentence                                     |words                                               |
+---+---------------------------------------------+----------------------------------------------------+
|1  |Spark is a distributed computing system.     |[spark, is, a, distributed, computing, system.]     |
|2  |It provides interfaces for multiple languages|[it, provides, interfaces, for, multiple, languages]|
|3  |Spark is built on top of Hadoop              |[spark, is, built, on, top, of, hadoop]             |
+---+---------------------------------------------+----------------------------------------------------+



In [11]:
# Pre-tokenized corpus: each row is (document id, list of word tokens).
# The token lists are spelled out explicitly; they are exactly what splitting
# the original sentences on whitespace produces.
textdata = [
    (1, ["I", "love", "Spark", "Spark", "provides", "Python", "API"]),
    (2, ["I", "love", "Python", "Spark", "supports", "Python"]),
    (3, ["Spark", "solves", "the", "big", "problem", "of", "big", "data"]),
]

# Note: `textdata` is the plain Python list, `textData` is the Spark DataFrame.
textData = spark.createDataFrame(textdata, ["id", "words"])

textData.show(truncate=False)

+---+-------------------------------------------------+
|id |words                                            |
+---+-------------------------------------------------+
|1  |[I, love, Spark, Spark, provides, Python, API]   |
|2  |[I, love, Python, Spark, supports, Python]       |
|3  |[Spark, solves, the, big, problem, of, big, data]|
+---+-------------------------------------------------+



In [12]:
# Configure the estimator: read token arrays from 'words', write count
# vectors to 'features'.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
)

# Fitting learns the vocabulary from the corpus and returns the fitted model.
model = cv.fit(textData)

### Explain  the output of CountVectorizer
`CountVectorizer` learns a vocabulary from the whole corpus (all rows) and then counts how many times each vocabulary word occurs in each document (row).
**Explain the output**
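Each `features` value is a sparse vector, printed as an integer followed by two lists: `(size, [indices], [values])`.
- `size`: the vocabulary size, i.e. the total number of distinct words across all documents.
- `indices`: every distinct word in the corpus is assigned an index; this list holds the indices of the words that appear in this document.
- `values`: the number of occurrences of each of those words in this document, in the same order as `indices`.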

In [13]:
# Apply the fitted vocabulary to the corpus; this appends the 'features'
# column holding the per-document word counts.
result = model.transform(textData)

result.show(truncate=False)

+---+-------------------------------------------------+----------------------------------------------------+
|id |words                                            |features                                            |
+---+-------------------------------------------------+----------------------------------------------------+
|1  |[I, love, Spark, Spark, provides, Python, API]   |(13,[0,1,2,3,5,7],[2.0,1.0,1.0,1.0,1.0,1.0])        |
|2  |[I, love, Python, Spark, supports, Python]       |(13,[0,1,2,3,12],[1.0,2.0,1.0,1.0,1.0])             |
|3  |[Spark, solves, the, big, problem, of, big, data]|(13,[0,4,6,8,9,10,11],[1.0,2.0,1.0,1.0,1.0,1.0,1.0])|
+---+-------------------------------------------------+----------------------------------------------------+



In [None]:
# Minimal one-row, one-column example to show the Tokenizer on its own.
data = [['omar youssef mohamed']]
df = spark.createDataFrame(data, ['text'])

# Note: this rebinds `tokenizer` to a new instance that reads the 'text' column.
tokenizer = Tokenizer(inputCol='text', outputCol='words')

tokenized = tokenizer.transform(df)
tokenized.show(truncate=False)

+--------------------+------------------------+
|text                |words                   |
+--------------------+------------------------+
|omar youssef mohamed|[omar, youssef, mohamed]|
+--------------------+------------------------+



In [11]:
# Tiny two-document corpus to make the CountVectorizer output easy to check by hand.
example_df = spark.createDataFrame(
    [
        (1, ['a', 'b', 'e', 'a', 'g']),
        (2, ['b', 'c', 'd']),
    ],
    ['id', 'words'],
)

# Note: this rebinds `cv` and `model`, replacing the ones fitted on `textData`.
cv = CountVectorizer(inputCol='words', outputCol='features')
model = cv.fit(example_df)

model.transform(example_df).show(truncate=False)

+---+---------------+-------------------------------+
|id |words          |features                       |
+---+---------------+-------------------------------+
|1  |[a, b, e, a, g]|(6,[0,1,2,4],[1.0,2.0,1.0,1.0])|
|2  |[b, c, d]      |(6,[0,3,5],[1.0,1.0,1.0])      |
+---+---------------+-------------------------------+

