# <center>IMPORTING NECESSARY PACKAGES</center>

In [1]:
import re
import string
import numpy as np
import pandas as pd

# <br><br><center>DATA CLEANING</center>

In [2]:
def Data_Cleaning(text_list):
    """Normalise raw text lines into lowercase strings of letters and spaces.

    Every character that is not an ASCII letter or a space (digits,
    punctuation, the trailing newline, ...) is replaced by one space, so that
    words separated only by punctuation (e.g. "myself--for") are not glued
    together. The result is then lower-cased.

    Parameters
    ----------
    text_list : iterable of str
        Raw lines, e.g. as returned by ``file.readlines()``.

    Returns
    -------
    list of str
        One cleaned string for every input line that still contains at least
        one letter after cleaning. At most one trailing space is removed from
        each string; interior and leading spaces are kept as they are.
    """
    non_letter = re.compile(r'[^a-zA-Z ]')
    output = []

    for line in text_list:
        cleaned_line = non_letter.sub(" ", line).lower()

        # Drop lines that carry no letters at all. This covers blank lines
        # ("\n" becomes " "), digit/punctuation-only lines, and also a truly
        # empty string, for which str.isspace() is False and which therefore
        # needs its own check.
        if not cleaned_line or cleaned_line.isspace():
            continue

        # The newline (or any other final non-letter such as the digit in
        # "Chapter 1") has been turned into a space; remove one such space.
        if cleaned_line.endswith(" "):
            cleaned_line = cleaned_line[:-1]

        output.append(cleaned_line)

    return output

# <br><br><center>DATA SPLIT</center>

In [3]:
def Data_Split(cleansed_data, split_index=5000):
    """Split the cleaned lines into two consecutive chunks, one per mapper.

    Parameters
    ----------
    cleansed_data : list of str
        Cleaned text lines.
    split_index : int, optional
        Number of lines that go into the first chunk (default 5000).

    Returns
    -------
    tuple of (list of str, list of str)
        ``cleansed_data[:split_index]`` and ``cleansed_data[split_index:]``.
        If the input has no more than ``split_index`` lines, the second chunk
        is empty.
    """
    output1 = cleansed_data[:split_index]
    output2 = cleansed_data[split_index:]
    return output1, output2

# <br><br><center>MAPPER</center>

In [4]:
def Mapper1(text_lines):
    """Emit one ``{word: 1}`` pair for every word in the given lines.

    Each line is split on single spaces; the empty strings produced by
    consecutive, leading or trailing spaces are discarded.

    Parameters
    ----------
    text_lines : iterable of str
        Cleaned text lines.

    Returns
    -------
    list of dict
        Single-entry dictionaries ``{word: 1}`` in reading order.
    """
    return [
        {token: 1}
        for text_line in text_lines
        for token in text_line.split(" ")
        if token != ""
    ]



# Same logic as Mapper1 function above
def Mapper2(text_lines):
    """Second map worker: turn every word of the lines into a ``{word: 1}`` pair.

    Lines are split on single spaces and empty fragments are dropped.

    Parameters
    ----------
    text_lines : iterable of str
        Cleaned text lines.

    Returns
    -------
    list of dict
        Single-entry dictionaries ``{word: 1}`` in reading order.
    """
    pairs = []

    for text_line in text_lines:
        # filter(None, ...) discards only the empty strings left by
        # consecutive, leading or trailing spaces.
        tokens = filter(None, text_line.split(" "))
        pairs.extend({token: 1} for token in tokens)

    return pairs

# <br><br><center>SORT MAPPER OUTPUTS</center>

In [5]:
def Sort_Mapper(list_1, list_2):
    """Merge both mapper outputs and order the pairs alphabetically by word.

    Parameters
    ----------
    list_1, list_2 : list of dict
        Mapper outputs; every element is expected to be a single-entry
        dictionary ``{word: value}``.

    Returns
    -------
    list of dict
        A new list holding the same dictionaries, sorted by their key.
        Pairs that share a word keep their relative order (the sort is
        stable). The input lists are not modified.
    """
    combined_list = list_1 + list_2

    # Sort the pairs directly on their (only) key. This keeps each word
    # attached to its dictionary, so there is no separate index array to
    # keep aligned with the combined list.
    return sorted(combined_list, key=lambda pair: next(iter(pair)))

# <br><br><center>PARTITION</center>

In [6]:
def Partition(sorted_list):
    """Split the sorted pairs into an "a"-"m" part and an "n"-"z" part.

    Parameters
    ----------
    sorted_list : list of dict
        Single-entry ``{word: value}`` dictionaries, already sorted
        alphabetically by word.

    Returns
    -------
    tuple of (list of dict, list of dict)
        The pairs whose word sorts before "n", followed by the remaining
        pairs. Either part may be empty.
    """
    # Default: everything belongs to the first part. This also covers an
    # empty input and texts that contain no word from "n" onwards.
    split_index = len(sorted_list)

    for position, pair in enumerate(sorted_list):
        # Compare the word itself rather than looking for a literal "n", so
        # the boundary is still found when no word starts with "n" but later
        # letters are present.
        if next(iter(pair)) >= "n":
            split_index = position
            break

    output1, output2 = sorted_list[:split_index], sorted_list[split_index:]

    return output1, output2

# <br><br><center>REDUCER</center>

In [7]:
def Reducer1(partition_output):
    """Count how often each word occurs in one partition.

    Parameters
    ----------
    partition_output : list of dict
        ``{word: value}`` pairs belonging to this reducer's partition.

    Returns
    -------
    tuple of (list of str, list of int)
        The distinct words in alphabetical order and, at the same positions,
        the number of pairs seen for each word.
    """
    # Group the values by key, e.g. {"a": [1, 1, 1], "b": [1, 1], ...}.
    # Grouping through a dictionary does not rely on the input being sorted
    # and visits every pair exactly once.
    reducer_dict = {}
    for dictionaries in partition_output:
        for word, value in dictionaries.items():
            reducer_dict.setdefault(word, []).append(value)

    individual_words = sorted(reducer_dict)

    # The frequency of a word is the number of values collected for it.
    ct = [len(reducer_dict[word]) for word in individual_words]

    return individual_words, ct


# Implemented using the same logic as the Reducer1 function
def Reducer2(partition_output):
    """Count how often each word occurs in one partition.

    Parameters
    ----------
    partition_output : list of dict
        ``{word: value}`` pairs belonging to this reducer's partition.

    Returns
    -------
    tuple of (list of str, list of int)
        The distinct words in alphabetical order and, at the same positions,
        the number of pairs seen for each word.
    """
    # Collect all values emitted for a word under that word. A dictionary
    # makes this independent of the order of the incoming pairs and needs a
    # single pass over them.
    reducer_dict = {}
    for dictionaries in partition_output:
        for word, value in dictionaries.items():
            reducer_dict.setdefault(word, []).append(value)

    individual_words = sorted(reducer_dict)

    # One collected value per occurrence, so the list length is the count.
    ct = [len(reducer_dict[word]) for word in individual_words]

    return individual_words, ct

# <br><br><center>MAIN FUNCTION (<font color ="blue">MAPREDUCE</font>)</center>

In [8]:
def Map_Reduce(file_name, output_csv="Pride and Prejudice.csv"):
    """Run the whole word-count pipeline on a text file.

    Stages: read lines -> clean -> split for two mappers -> map -> sort ->
    partition for two reducers -> reduce -> assemble and export the result.

    Parameters
    ----------
    file_name : str
        Path of the text file to analyse.
    output_csv : str, optional
        Path of the CSV file the result is written to
        (default "Pride and Prejudice.csv").

    Returns
    -------
    pandas.DataFrame
        Columns "Word" and "Frequency": the first reducer's words followed
        by the second reducer's words.
    """
    # Decode explicitly so the result does not depend on the platform's
    # default encoding. Undecodable bytes are replaced instead of raising;
    # Data_Cleaning turns every non-letter character into a space anyway.
    with open(file_name, "r", encoding="utf-8", errors="replace") as file:
        book = file.readlines()

    # Clean the raw lines
    cleansed_data = Data_Cleaning(book)

    # Split the cleaned lines into one chunk per mapper
    part1, part2 = Data_Split(cleansed_data)

    # Each mapper turns its chunk into {word: 1} pairs
    mapper1_output, mapper2_output = Mapper1(part1), Mapper2(part2)

    # Sort the combined mapper output by word
    sort_output = Sort_Mapper(mapper1_output, mapper2_output)

    # Split the sorted pairs into one partition per reducer
    # ("a" to "m" and "n" to "z")
    partition_output1, partition_output2 = Partition(sort_output)

    # Each reducer counts the frequency of the words in its partition
    reducer1_words, reducer1_frequency = Reducer1(partition_output1)
    reducer2_words, reducer2_frequency = Reducer2(partition_output2)

    # Concatenate both reducer outputs into a single dataframe
    final_output = pd.DataFrame(
        {
            "Word": reducer1_words + reducer2_words,
            "Frequency": reducer1_frequency + reducer2_frequency,
        }
    )

    # Store the result in a csv file
    final_output.to_csv(output_csv, index=False)

    return final_output

# <br><br><center>CALLING MAIN (<font color ="blue">MAPREDUCE</font>) FUNCTION</center>

In [9]:
# Name of the input text file. Only the bare file name is given, so it is
# resolved relative to the current working directory.
file_name = "Pride_and_Prejudice.txt"

# Run the main function (Map_Reduce). It returns the word/frequency dataframe
# and also writes it to a csv file.
Map_Reduce_Output = Map_Reduce(file_name)

In [10]:
# Preview the first 11 rows of the word/frequency dataframe
Map_Reduce_Output.head(11)

Unnamed: 0,Word,Frequency
0,a,1954
1,abatement,1
2,abhorrence,6
3,abhorrent,1
4,abide,1
5,abiding,1
6,abilities,6
7,able,54
8,ablution,1
9,abode,8


In [11]:
# 10 least frequent words: sort by Frequency in ascending order and keep the first 10 rows
Map_Reduce_Output.sort_values(["Frequency"], ascending=True, axis=0).head(10)

Unnamed: 0,Word,Frequency
3129,involves,1
3204,kindled,1
3207,kindred,1
3208,kinds,1
3211,kiss,1
3213,kitchen,1
3215,knees,1
3223,kympton,1
3225,labour,1
3226,lace,1


In [12]:
# 10 most frequent words: sort by Frequency in descending order and keep the first 10 rows
Map_Reduce_Output.sort_values(["Frequency"], ascending=False, axis=0).head(10)

Unnamed: 0,Word,Frequency
5590,the,4331
5656,to,4163
3819,of,3611
268,and,3585
2709,her,2225
2806,i,2070
0,a,1954
2890,in,1880
6064,was,1846
5045,she,1710


<font size=3>Kindly find the book <b>Pride and Prejudice</b> along with the dataframe of words and their corresponding frequency exported to a csv file named <b>Pride and Prejudice.csv</b> in my GitHub repository <a href="https://github.com/Prasanna-Vengatesh-Venkataraman/WORD-FREQUENCY-USING-MAPREDUCE">here</a>.</font>