In [1]:
import re
import string

import findspark

# findspark.init() adds the local Spark installation's Python libraries to
# sys.path, so it has to run before anything from pyspark is imported.
findspark.init()

from pyspark.sql import SparkSession

In [2]:
def wordCount(input, output, num_common_words, stop_words):
    """Save the most frequent non-stop words of a text corpus to ``output``.

    Every file found under ``input`` is lower-cased and split on whitespace.
    Leading and trailing punctuation is stripped from each token, then empty
    tokens and stop words are dropped and the remaining words are counted.
    The ``num_common_words`` most frequent ``(word, count)`` pairs are written
    with ``saveAsTextFile``.

    Args:
        input: Path passed to ``SparkContext.wholeTextFiles``.
        output: Path passed to ``RDD.saveAsTextFile``. By default Spark
            refuses to write into a location that already exists, so this
            path should not exist yet.
        num_common_words: Number of top-ranked words to keep.
        stop_words: Iterable of lower-case words to exclude from the count.

    Returns:
        None. The result is only written to ``output``.
    """
    # Reuse an existing session if there is one, otherwise start a local one
    spark = SparkSession.builder.master("local").appName("ExtendedWordCount").getOrCreate()
    sc = spark.sparkContext

    # A frozenset gives constant-time membership tests for every token and is
    # shipped to the workers as part of the filter closure.
    excluded = frozenset(stop_words)

    # Read all input files from the directory and split them into lower-case tokens
    tokens = sc.wholeTextFiles(input).values().flatMap(lambda doc: doc.lower().split())

    # Remove punctuation around each token; punctuation inside a word
    # (e.g. "don't", "well-known") is kept.
    words = tokens.map(lambda token: token.strip(string.punctuation))

    # Drop stop words, and drop tokens that consisted only of punctuation:
    # those are empty after stripping and must not be counted as a word.
    kept = words.filter(lambda word: word != "" and word not in excluded)

    # Count the occurrences of each remaining word
    counts = kept.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

    # Take the num_common_words most frequent words; ties are broken
    # alphabetically so repeated runs give the same ranking.
    common = counts.takeOrdered(num_common_words, key=lambda pair: (-pair[1], pair[0]))

    # Save the ranked (word, count) pairs to a text file
    sc.parallelize(common).saveAsTextFile(output)

In [3]:
# Corpus directory to read and directory Spark writes the result to
input = "./input"
output = "./extended-output"

# How many of the most frequent words to keep
num_common_words = 25

# Words that are left out of the count
stop_words = [
    "the", "and", "is", "in", "it", "or", "of", "to", "a", "that",
    "was", "you", "his", "had", "with", "him", "for", "as", "at", "not",
    "be", "on", "my", "her", "are", "he", "have", "me", "by", "from",
    "but", "would", "were", "what", "an", "so", "this", "been", "she", "will",
    "about", "there", "am", "your", "who", "here", "they", "do", "which", "if",
    "them", "when", "into", "has", "can",
]

# Run the word count with the settings above
wordCount(
    input=input,
    output=output,
    num_common_words=num_common_words,
    stop_words=stop_words,
)