In [0]:
from pyspark.sql import SparkSession
# Obtain the SparkSession for this notebook, registered under the app name 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [0]:
#Tokenization
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

In [0]:
# Demo sentences keyed by an integer id: rows 0 and 1 are space-separated,
# row 2 is comma-separated with no spaces at all.
sen_df = spark.createDataFrame(
    [
        (0, 'Hi I heard about Spark'),
        (1, 'I wish java could use case classes'),
        (2, 'Logistic,regression,models,are,used'),
    ],
    schema=['id', 'sentence'],
)

sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [0]:
# Plain tokenizer: reads the 'sentence' column and writes its tokens to 'words'.
tokenizer = Tokenizer(
    outputCol='words',
    inputCol='sentence',
)

# Regex-driven tokenizer over the same columns; the pattern '\\W' targets
# any non-word character (e.g. the commas in the third sentence).
regex_tokenizer = RegexTokenizer(
    pattern='\\W',
    outputCol='words',
    inputCol='sentence',
)

In [0]:
# UDF that returns the number of tokens in an array column as an integer.
# A null array reaches the Python function as None, so return None (SQL NULL)
# for it instead of letting len(None) raise TypeError and fail the whole job.
count_tokens = udf(
    lambda words: None if words is None else len(words),
    IntegerType(),
)

In [0]:
# Run the plain tokenizer over the sentences; the result gains a 'words' column.
tokenized = tokenizer.transform(dataset=sen_df)
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [0]:
# Show a per-row token count. The third sentence (id 2) reports only 1 token,
# i.e. the plain tokenizer did not split it on its commas.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [0]:
# Tokenize the same sentences with the regex tokenizer instead, then show the
# per-row token count for comparison with the plain tokenizer's result.
rg_tokenized = regex_tokenizer.transform(dataset=sen_df)
(
    rg_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [0]:
#Removing common words using Stop Words Remover
from pyspark.ml.feature import StopWordsRemover

In [0]:
# Two pre-tokenized sentences (id, list of tokens) used as stop-word removal input.
sentence_DataFrame = spark.createDataFrame(
    [
        (0, ['I', 'saw', 'the', 'green', 'light']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)

sentence_DataFrame.show()

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Mary, had, a, li...|
+---+--------------------+



In [0]:
# Stop-words remover: reads the 'tokens' column and writes the surviving
# tokens to a new 'filtered' column.
remover = StopWordsRemover(
    outputCol='filtered',
    inputCol='tokens',
)
remover.transform(dataset=sentence_DataFrame).show()

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, light]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



In [0]:
#n-gram
from pyspark.ml.feature import NGram

In [0]:
# Three pre-tokenized sentences (id, list of words) used as n-gram input.
wordDataFrame = spark.createDataFrame(
    [
        (0, ['Hi', 'I', 'heard', 'about', 'Spark']),
        (1, ['I', 'wish', 'java', 'could', 'use', 'case', 'classes']),
        (2, ['Logistic', 'regression', 'models', 'are', 'neat']),
    ],
    schema=['id', 'words'],
)

wordDataFrame.show()

+---+--------------------+
| id|               words|
+---+--------------------+
|  0|[Hi, I, heard, ab...|
|  1|[I, wish, java, c...|
|  2|[Logistic, regres...|
+---+--------------------+



In [0]:
# Build 2-grams (n=2) from the 'words' column into a new 'grams' column,
# then print only that column without truncating the cell contents.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)
(
    ngram
    .transform(dataset=wordDataFrame)
    .select('grams')
    .show(truncate=False)
)

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish java, java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+

