In [2]:
import pyspark
from pyspark import SparkContext
from operator import add

In [3]:
sc = SparkContext(appName="task7")

In [3]:
INPUT_FILE_PATH = "Rec1_F19_Files_to_zip/file01_Hd_Sp_Freq"

In [37]:
lines = sc.textFile(INPUT_FILE_PATH)

r"""
    Answer Format:
        Section_1 - Functions that will be passed into higher order collection
        functions like map or reduce. Each function will define its
        purpose within itself.

        Section_2 - Chain of spark operations to process the data which will take
        in the functions defined and apply them to data.
"""


#Section_1:
def mapToFirstLetter(wordTuple):
    r"""
        Will extract the first letter of the word and create a tuple that is:
        ( firstLetter , (word,occurrence) )

    @param:
        wordTuple is composed as follows:
            - wordTuple[0] = actual word
            - wordTuple[1] = number of times word has occurred in doc
        
    @return:
        tuple : ( firstLetter, (word,occurrence) )
    """

    word = wordTuple[0]
    occurrence = wordTuple[1]
    firstLetter = word[0]
    return firstLetter,(word,occurrence)

def reduceByFirstLetter(acc,current):
    r"""
    @param:
        acc : the accumulated val
        cuurent: the current tuple being proccessed

    @return:
        acc the accumalated value which is continuously updated
    
    @procedure:
            Creates a list of all (word,occurrence) tuples 
        for each letter. The broad process is creating a 
        list for accumulator (acc) and appending each 
        current element received.
            The accumulator requires an initial value
        which is obtained from the very first entry the
        reducer. This inital value is a tuple but we
        need our accumulator needs to be a list.
        Because of this requirement we have an if
        statement to test if the "acc" parameter is
        a tuple and if it is then change it to a list.
        This way we can have a list of tuples.
            The current parameter can either be a
        tuple (word,occurrence) or a list of tuples.
        This is due to the parallel nature of processing.
        Another reducerByKey task could have already started
        processing the elements for the same key. To deal with
        this we have a check if "current" is a list, in which
        case each of that list's elements will be added to 
        the overall accumulator.
    """
    if isinstance(acc,tuple):
        temp = [acc]
        acc = temp
    
    if isinstance(current,list):
        for elem in current:
            acc.append(elem)
        return acc
    
    acc.append(current)
    return acc


def finalReducerHelper(letterWordsTuple):
    r"""
        Helper method that obtains the 40 most
        freuquent words for each letter. Then
        creates a tuple that is (firstLetter,
        listOf(40 most frequent words)). If
        less than 40 words overall then all
        of them are included.
    
    @param:
        letterWordTuple :
            - [0] : letter
            - [1] : list of (word,occurrence) tuples
                        - [0] = word
                        - [1] = occurrence
    """
    firstLetter = letterWordsTuple[0]
    if isinstance(letterWordsTuple[1],tuple):
        return firstLetter,letterWordsTuple[1][0]
    wordFreqPairs = list(map(lambda wfp: wfp,letterWordsTuple[1]))
    wordFreqPairs.sort(key=lambda wfPair: wfPair[1],reverse=True)
    topFrequentWords = []
    numOfWords = 0
    for word in wordFreqPairs:
        topFrequentWords.append(word[0])
        numOfWords += 1
        if numOfWords == 40:
            break
    return firstLetter,topFrequentWords


def finalReducer(acc,current):
    r"""
        Get top 40 (or as many as possible if less) most
        frequent words for each letter found in the doc.

    @param:
        acc : most frequent words for each letter
        current: tuple (letter, [(word,occurrent),...])
            - current[0] = first letter
            - current[1] = wordOccurrence Pairs
                - wordOccurrence pair;
                    - wordOccurrence[0] = word
                    - wordOccurrence[1] = occurrence

    @return:
        acc : top 40 words for each list

    @procedure:
            For each letter, sort the (word,occurrence) tuple
        list by value in decesending order. Then create
        outputlist and fill it with the most frequent 
        words, up to 40 of them.
    """

    if isinstance(acc,tuple): #if this is the very first reduction and initial val is being used
        temp = finalReducerHelper(acc)
        acc = [temp]
    
    currentAns = finalReducerHelper(current)
    acc.append(currentAns)
    return acc
    

#Section_2:
r"""
    
The Spark function chain:

    The steps here correspond to each line of the mostFrequentWordsPerLetter. Some functions
    will be defined before map reduce chain is started (because they are too long for a
    lambda function). Those functions will have their notes within the funciton 
    definition. The steps are as follows :
        - flatMapt -> get all the words in 1 list
        - map -> map every word to value 1
        - reduceByKey -> add up all occurrences for each word
        - map -> switch kev value pair to make occurrences keys
        - sortByKey -> sorts by occurrences of words so that most frequent words are first
        - map -> switch key value pair again to make word the key again
        - map -> Uses: mapToFirstLetter, gets first letter of word and uses that as key with word,occurrence as value
        - reduceByKey -> Uses: reduceByFirstLetter, collects all of the word,occurrence pairs that belong to a particular first letter
        - reduce -> Uses: finalReducer, finds the 40 most frequent words per letter
"""

# collection function chain for output: 
mostFrequentWordsPerLetter = lines.flatMap(lambda line: line.split(" "))\
                                .filter(lambda word: word!='')\
                                .map(lambda word: (word.lower(),1))\
                                .reduceByKey(add)\
                                .map(lambda wordOccurrencePair: (wordOccurrencePair[1],wordOccurrencePair[0]))\
                                .sortByKey(ascending=False)\
                                .map(lambda kvPair: (kvPair[1],kvPair[0]))\
                                .map(mapToFirstLetter)\
                                .reduceByKey(reduceByFirstLetter)\
                                .reduce(finalReducer)
                                

print(mostFrequentWordsPerLetter)

[('i', ['i', 'is', 'it', 'interested', 'in', 'it.', "i'll", "i'd"]), ('h', ['hadoop', 'how', 'have', 'hours', 'hello']), ('s', ['spark', 'studying', 'study', 'say', 'started', 'spend', 'some', 'so', 'shall', 'sql', 'supposed']), ('b', ['but', 'bye', 'be', 'better']), ('d', ['do', 'debug', 'difficult', "don't"]), ('c', ['compile', 'cloud', 'choose', 'can', 'compromise', 'computing']), ('p', ['perhaps', 'prefer', 'part', 'pighive']), ('l', "let's"), ('y', ['you', 'your']), ('g', 'get'), (('t', ['to', 'this', 'then']), 'a')]


In [4]:
sc.stop()