# NLP Tools - 1

In [2]:
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook is not tied to a
# single machine. The original local install path is kept as the fallback (also
# used when SPARK_HOME is set but empty), so existing setups behave as before.
findspark.init(
    os.environ.get('SPARK_HOME')
    or '/home/venkat/Downloads/spark-3.2.0-bin-hadoop3.2'
)

from pyspark.sql import SparkSession

In [4]:
# Get the active Spark session, or create one, with the application name 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

## Tokenizer

In [5]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [7]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [9]:
# Sample sentences: two separated by spaces and one separated by commas.
sen_df = spark.createDataFrame(
    data=[
        (0, 'Hi I am learning Spark'),
        (1, 'I like spark'),
        (2, 'Logistic,regression,models,are,good'),
    ],
    schema=['id', 'sentence'],
)

In [10]:
# Print the sample sentences (default row limit and column truncation, spelled out).
sen_df.show(n=20, truncate=True)

                                                                                

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I am learning ...|
|  1|        I like spark|
|  2|Logistic,regressi...|
+---+--------------------+



In [11]:
# Plain tokenizer: reads 'sentence' and writes the token list to 'words'.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

# Regex tokenizer: same columns, but splits wherever the pattern matches.
regex_tokenizer = RegexTokenizer(
    pattern='\\W',  # split on any non-word character
    inputCol='sentence',
    outputCol='words',
)

In [20]:
# User-defined function: returns the number of elements in a token array column.
count_tokens = udf(
    f=lambda token_list: len(token_list),
    returnType=IntegerType(),
)

In [15]:
# Apply the plain tokenizer; the result gains a 'words' column.
tokenized = tokenizer.transform(dataset=sen_df)

In [17]:
# Show each sentence with its tokens and the number of tokens produced.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

                                                                                

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I am learning ...|[hi, i, am, learn...|     5|
|  1|        I like spark|    [i, like, spark]|     3|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



## The third sentence was not split: `Tokenizer` only splits on whitespace, so the comma-separated string stays a single token

In [18]:
# Apply the regex tokenizer, which splits on non-word characters.
rg_tokenized = regex_tokenizer.transform(dataset=sen_df)

In [19]:
# With the regex tokenizer the comma-separated sentence is split as well.
(
    rg_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I am learning ...|[hi, i, am, learn...|     5|
|  1|        I like spark|    [i, like, spark]|     3|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



## Stop word removal - dropping words such as "a" and "the" that occur very frequently

In [21]:
from pyspark.ml.feature import StopWordsRemover

In [22]:
# Pre-tokenized sentences used as input for the stop-word remover.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, ['I', 'saw', 'the', 'green', 'horse']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)

In [24]:
# Filter stop words out of 'tokens' and write the remaining words to 'filtered'.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)
(
    remover
    .transform(sentenceDataFrame)
    .show()
)

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



## N-gram - transform a sequence of tokens (the output of a tokenizer) into n-grams

In [25]:
from pyspark.ml.feature import NGram

In [27]:
# Pre-tokenized sentences used as input for the n-gram transformer.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, ['I', 'saw', 'the', 'green', 'horse']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
        (2, ['Buzz', 'of', 'the', 'honey', 'bees']),
    ],
    schema=['id', 'words'],
)

In [28]:
# Build bigrams (n=2) from 'words' into 'grams' and print them untruncated.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)
(
    ngram
    .transform(wordDataFrame)
    .show(truncate=False)
)

+---+----------------------------+----------------------------------------+
|id |words                       |grams                                   |
+---+----------------------------+----------------------------------------+
|0  |[I, saw, the, green, horse] |[I saw, saw the, the green, green horse]|
|1  |[Mary, had, a, little, lamb]|[Mary had, had a, a little, little lamb]|
|2  |[Buzz, of, the, honey, bees]|[Buzz of, of the, the honey, honey bees]|
+---+----------------------------+----------------------------------------+

