In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, NGram
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [0]:
# Obtain the Spark session for this notebook (getOrCreate reuses an active one if present).
spark = (
    SparkSession.builder
    .appName('nlp_part1')
    .getOrCreate()
)
spark

In [0]:
# Toy corpus: two space-separated sentences and one comma-separated sentence.
# enumerate() supplies the 0-based 'id' paired with each sentence.
sent_df = spark.createDataFrame(
    data=list(enumerate([
        'Hi I just heard about Spark',
        'I wish that Java could use case classes',
        'Logistic,regression,models,are,neat',
    ])),
    schema=['id', 'sentence'],
)
sent_df.show(truncate=False)

+---+---------------------------------------+
|id |sentence                               |
+---+---------------------------------------+
|0  |Hi I just heard about Spark            |
|1  |I wish that Java could use case classes|
|2  |Logistic,regression,models,are,neat    |
+---+---------------------------------------+



In [0]:
# Plain Tokenizer: reads 'sentence' and writes the resulting word list to 'tokens'.
# As the output shows, the comma-separated sentence comes back as a single token.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='tokens',
)
tokenized_sent_df = tokenizer.transform(sent_df)
tokenized_sent_df.show(truncate=False)

+---+---------------------------------------+------------------------------------------------+
|id |sentence                               |tokens                                          |
+---+---------------------------------------+------------------------------------------------+
|0  |Hi I just heard about Spark            |[hi, i, just, heard, about, spark]              |
|1  |I wish that Java could use case classes|[i, wish, that, java, could, use, case, classes]|
|2  |Logistic,regression,models,are,neat    |[logistic,regression,models,are,neat]           |
+---+---------------------------------------+------------------------------------------------+



In [0]:
# UDF that returns the number of elements in an array column as an integer.
# A null array yields null instead of raising TypeError from len(None) inside
# the Python worker, which would otherwise fail the whole job.
# NOTE(review): the built-in pyspark.sql.functions.size may be a faster
# alternative to a Python UDF — confirm its null semantics before switching.
count_tokens = udf(
    lambda words: len(words) if words is not None else None,
    IntegerType(),
)
count_tokens

<function __main__.<lambda>(words)>

In [0]:
# Attach the per-row token count produced by the plain Tokenizer.
# The comma-separated sentence counts as 1 because it was never split.
tokenized_sent_df = tokenized_sent_df.withColumn(
    'num_tokens',
    count_tokens(col('tokens')),
)
tokenized_sent_df.show(truncate=False)

+---+---------------------------------------+------------------------------------------------+----------+
|id |sentence                               |tokens                                          |num_tokens|
+---+---------------------------------------+------------------------------------------------+----------+
|0  |Hi I just heard about Spark            |[hi, i, just, heard, about, spark]              |6         |
|1  |I wish that Java could use case classes|[i, wish, that, java, could, use, case, classes]|8         |
|2  |Logistic,regression,models,are,neat    |[logistic,regression,models,are,neat]           |1         |
+---+---------------------------------------+------------------------------------------------+----------+



In [0]:
# Regex-based tokenizer: the pattern \W (any non-word character) is the delimiter,
# so both the spaces and the commas in the sample sentences separate tokens.
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='tokens',
    pattern='\\W',
)
regex_tokenized_sent_df = regex_tokenizer.transform(sent_df)
regex_tokenized_sent_df.show(truncate=False)

+---+---------------------------------------+------------------------------------------------+
|id |sentence                               |tokens                                          |
+---+---------------------------------------+------------------------------------------------+
|0  |Hi I just heard about Spark            |[hi, i, just, heard, about, spark]              |
|1  |I wish that Java could use case classes|[i, wish, that, java, could, use, case, classes]|
|2  |Logistic,regression,models,are,neat    |[logistic, regression, models, are, neat]       |
+---+---------------------------------------+------------------------------------------------+



In [0]:
# Attach the per-row token count for the regex-tokenized frame.
# The comma-separated sentence now counts 5 tokens.
regex_tokenized_sent_df = regex_tokenized_sent_df.withColumn(
    'num_tokens',
    count_tokens(col('tokens')),
)
regex_tokenized_sent_df.show(truncate=False)

+---+---------------------------------------+------------------------------------------------+----------+
|id |sentence                               |tokens                                          |num_tokens|
+---+---------------------------------------+------------------------------------------------+----------+
|0  |Hi I just heard about Spark            |[hi, i, just, heard, about, spark]              |6         |
|1  |I wish that Java could use case classes|[i, wish, that, java, could, use, case, classes]|8         |
|2  |Logistic,regression,models,are,neat    |[logistic, regression, models, are, neat]       |5         |
+---+---------------------------------------+------------------------------------------------+----------+



In [0]:
# Filter stop words out of 'tokens' into a new 'filtered_tokens' column.
# No stop-word list is passed, so StopWordsRemover uses its defaults.
remover = StopWordsRemover(inputCol='tokens',outputCol='filtered_tokens')
# This cell rebinds regex_tokenized_sent_df to its own output, and the transformer
# rejects a frame that already contains its output column. Dropping a stale
# 'filtered_tokens' first makes the cell safe to re-run; drop() is a no-op when the
# column is absent, so the first run behaves exactly as before.
regex_tokenized_sent_df = remover.transform(
    regex_tokenized_sent_df.drop(remover.getOutputCol())
)
regex_tokenized_sent_df.show(truncate=False)

+---+---------------------------------------+------------------------------------------------+----------+------------------------------------+
|id |sentence                               |tokens                                          |num_tokens|filtered_tokens                     |
+---+---------------------------------------+------------------------------------------------+----------+------------------------------------+
|0  |Hi I just heard about Spark            |[hi, i, just, heard, about, spark]              |6         |[hi, heard, spark]                  |
|1  |I wish that Java could use case classes|[i, wish, that, java, could, use, case, classes]|8         |[wish, java, use, case, classes]    |
|2  |Logistic,regression,models,are,neat    |[logistic, regression, models, are, neat]       |5         |[logistic, regression, models, neat]|
+---+---------------------------------------+------------------------------------------------+----------+------------------------------------+

In [0]:
# Build bigrams (n=2) from the unfiltered 'tokens' column into 'bigram_tokens'.
ngram = NGram(n=2,inputCol='tokens',outputCol='bigram_tokens')
# This cell rebinds regex_tokenized_sent_df to its own output, and NGram rejects a
# frame that already contains its output column. Dropping a stale 'bigram_tokens'
# first makes the cell safe to re-run; drop() is a no-op when the column is absent.
regex_tokenized_sent_df = ngram.transform(
    regex_tokenized_sent_df.drop(ngram.getOutputCol())
)
regex_tokenized_sent_df.show(truncate=False)

+---+---------------------------------------+------------------------------------------------+----------+------------------------------------+-----------------------------------------------------------------------------+
|id |sentence                               |tokens                                          |num_tokens|filtered_tokens                     |bigram_tokens                                                                |
+---+---------------------------------------+------------------------------------------------+----------+------------------------------------+-----------------------------------------------------------------------------+
|0  |Hi I just heard about Spark            |[hi, i, just, heard, about, spark]              |6         |[hi, heard, spark]                  |[hi i, i just, just heard, heard about, about spark]                         |
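|1  |I wish that Java could use case classes|[i, wish, that, java, could, use, case, classes]|8         |[wish, java, use, case, classes]    |[i wish, wish that, that java, java could, could use, use case, case classes]|
|2  |Logistic,regression,models,are,neat    |[logistic, regression, models, are, neat]       |5         |[logistic, regression, models, neat]|[logistic regression, regression models, models are, are neat]               |
+---+---------------------------------------+------------------------------------------------+----------+------------------------------------+-----------------------------------------------------------------------------+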

In [0]:
# Build trigrams (n=3) from the unfiltered 'tokens' column into 'trigram_tokens'.
# Note: this rebinds the name `ngram`, replacing the bigram transformer.
ngram = NGram(n=3,inputCol='tokens',outputCol='trigram_tokens')
# This cell rebinds regex_tokenized_sent_df to its own output, and NGram rejects a
# frame that already contains its output column. Dropping a stale 'trigram_tokens'
# first makes the cell safe to re-run; drop() is a no-op when the column is absent.
regex_tokenized_sent_df = ngram.transform(
    regex_tokenized_sent_df.drop(ngram.getOutputCol())
)
regex_tokenized_sent_df.show(truncate=False)

+---+---------------------------------------+------------------------------------------------+----------+------------------------------------+-----------------------------------------------------------------------------+------------------------------------------------------------------------------------------------+
|id |sentence                               |tokens                                          |num_tokens|filtered_tokens                     |bigram_tokens                                                                |trigram_tokens                                                                                  |
+---+---------------------------------------+------------------------------------------------+----------+------------------------------------+-----------------------------------------------------------------------------+------------------------------------------------------------------------------------------------+
|0  |Hi I just heard about Spark            |[