In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import lower , regexp_replace,split,concat_ws,col,udf
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.types import ArrayType,StringType
import nltk

In [2]:
# Obtain the notebook's Spark entry point; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Load the raw corpus as a DataFrame; the text reader exposes the file
# contents in a single column named `value`.
shakespeare_path = '/home/bigdata/Desktop/shakespeare.txt'
df = spark.read.text(shakespeare_path)

In [4]:
# Peek at the first 10 rows of the raw text (long values are truncated).
df.show(n=10)

+--------------------+
|               value|
+--------------------+
|This is the 100th...|
|is presented in c...|
|Library of the Fu...|
|often releases Et...|
|                    |
|         Shakespeare|
|                    |
|*This Etext has c...|
|                    |
|<<THIS ELECTRONIC...|
+--------------------+
only showing top 10 rows



In [5]:
# Add `line`: a lower-cased copy of the raw text; `value` stays untouched.
df = df.withColumn(
    'line',
    lower(col('value')),
)

In [6]:
# Compare the original and lower-cased text for the first 2 rows.
df.show(n=2)

+--------------------+--------------------+
|               value|                line|
+--------------------+--------------------+
|This is the 100th...|this is the 100th...|
|is presented in c...|is presented in c...|
+--------------------+--------------------+
only showing top 2 rows



In [7]:
# Delete every character that is neither a lowercase ASCII letter nor
# whitespace. Digits and punctuation are removed outright (not replaced by a
# space), so e.g. "100th" becomes "th".
df = df.withColumn(
    'line',
    regexp_replace(col('line'), r'[^a-z\s]', ''),
)

In [8]:
# Tokenise each cleaned line into an array of words.
# The split pattern is a regex: use `\s+` so that any run of whitespace
# (several spaces, tabs) counts as ONE separator. Splitting on a single
# literal space produced empty-string tokens between consecutive spaces and
# left tab-separated words glued together as one token.
# A line with leading or trailing whitespace still yields one empty token at
# that edge.
df = df.withColumn('words', split(col('line'), r'\s+'))

In [9]:
# Inspect the first 4 rows now that the `words` array column exists.
df.show(n=4)

+--------------------+--------------------+--------------------+
|               value|                line|               words|
+--------------------+--------------------+--------------------+
|This is the 100th...|this is the th et...|[this, is, the, t...|
|is presented in c...|is presented in c...|[is, presented, i...|
|Library of the Fu...|library of the fu...|[library, of, the...|
|often releases Et...|often releases et...|[often, releases,...|
+--------------------+--------------------+--------------------+
only showing top 4 rows



In [10]:
# Configure the stop-word filter: reads the `words` array and writes the
# surviving tokens to `filtered_words`. No custom stop-word list is passed,
# so the transformer's built-in defaults apply.
remover = StopWordsRemover(
    inputCol='words',
    outputCol='filtered_words',
)

In [11]:
# Apply the stop-word filter, which appends the `filtered_words` column.
# NOTE(review): re-running this cell on the already-transformed `df` is
# expected to fail because `filtered_words` would already exist - confirm.
df = remover.transform(dataset=df)

In [12]:
# Check the first 5 rows, including the new `filtered_words` column.
df.show(n=5)

+--------------------+--------------------+--------------------+--------------------+
|               value|                line|               words|      filtered_words|
+--------------------+--------------------+--------------------+--------------------+
|This is the 100th...|this is the th et...|[this, is, the, t...|[th, etext, file,...|
|is presented in c...|is presented in c...|[is, presented, i...|[presented, coope...|
|Library of the Fu...|library of the fu...|[library, of, the...|[library, future,...|
|often releases Et...|often releases et...|[often, releases,...|[often, releases,...|
|                    |                    |                  []|                  []|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [13]:
# Fetch the NLTK `words` corpus used below to build the English vocabulary.
# Left as the cell's last expression so the call's return value is displayed.
nltk.download('words')

[nltk_data] Downloading package words to /home/bigdata/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [14]:
# Vocabulary of known English words, lower-cased so membership tests line up
# with the lower-cased tokens; a set keeps each lookup cheap.
english_words = {entry.lower() for entry in nltk.corpus.words.words()}

In [15]:
def filter_english_words(word_list):
    """Keep only the tokens that appear in the ``english_words`` vocabulary.

    Parameters
    ----------
    word_list : list of str or None
        Tokens for one row. Spark hands a Python UDF ``None`` when the
        input column is null for that row.

    Returns
    -------
    list of str
        The tokens found in ``english_words``, in their original order and
        with duplicates kept. Empty when ``word_list`` is ``None`` or empty.
    """
    if word_list is None:
        # Iterating over None would raise TypeError inside the executor and
        # fail the whole job; treat a null row as "no words".
        return []
    return [word for word in word_list if word in english_words]

# Expose the Python filter to Spark as a UDF returning array<string>, then
# store the dictionary-checked tokens of `filtered_words` in `filter_words`.
filter_english_udf = udf(
    filter_english_words,
    returnType=ArrayType(StringType()),
)
df = df.withColumn(
    'filter_words',
    filter_english_udf(col('filtered_words')),
)

In [16]:
# Look at the first 6 rows to see which tokens survived the dictionary check.
df.show(n=6)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|               value|                line|               words|      filtered_words|        filter_words|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|This is the 100th...|this is the th et...|[this, is, the, t...|[th, etext, file,...| [th, file, project]|
|is presented in c...|is presented in c...|[is, presented, i...|[presented, coope...|    [world, library]|
|Library of the Fu...|library of the fu...|[library, of, the...|[library, future,...|[library, future,...|
|often releases Et...|often releases et...|[often, releases,...|[often, releases,...|[often, public, d...|
|                    |                    |                  []|                  []|                  []|
|         Shakespeare|         shakespeare|       [shakespeare]|       [shakespeare]|                  []|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 6 rows

In [17]:
# Re-assemble the surviving tokens into one space-separated string per row.
df = df.withColumn(
    'cleaned_line',
    concat_ws(' ', col('filter_words')),
)

In [18]:
# Preview the first 5 re-assembled lines.
df.select('cleaned_line').show(n=5)

+--------------------+
|        cleaned_line|
+--------------------+
|     th file project|
|       world library|
|library future pr...|
| often public domain|
|                    |
+--------------------+
only showing top 5 rows



In [30]:
# Discard rows whose cleaned text ended up empty (no token survived).
df = df.where(col("cleaned_line") != "")

In [31]:
# Preview the first 5 lines again, now that empty rows have been removed.
df.select('cleaned_line').show(n=5)

+--------------------+
|        cleaned_line|
+--------------------+
|     th file project|
|       world library|
|library future pr...|
| often public domain|
|certain copyright...|
+--------------------+
only showing top 5 rows



In [32]:
# Persist the cleaned corpus as plain text, one cleaned line per row.
# coalesce(1) collapses the data to a single partition before writing.
# The save mode is set explicitly: the writer's default refuses to write when
# the target path already exists, which made this cell fail on every re-run
# of the notebook. NOTE: "overwrite" replaces whatever is currently stored at
# the target path.
(
    df.select("cleaned_line")
    .coalesce(1)
    .write
    .mode("overwrite")
    .text("/home/bigdata/Desktop/ShakesModFinal")
)
