In [10]:
from collections import Counter
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords

In [11]:
# Sample input text: a short multi-sentence description of NLP, stored as a
# triple-quoted string (the leading/trailing newlines are part of the value)
corpus = """
Natural language processing (NLP) is a subfield of linguistics, 
computer science, information engineering, and artificial 
intelligence concerned with the interactions between computers and human 
(natural) languages, in particular how to program computers to process and analyze 
large amounts of natural language data. Challenges in natural language processing 
frequently involve speech recognition, natural language understanding, and natural 
language generation.
"""

In [16]:
# Mapper function
def mapper(text):
    """Tokenize ``text`` and emit a (word, tag) pair for every kept token.

    Tokens are kept only if they are purely alphabetic and are not English
    stopwords; kept tokens are lower-cased before tagging.

    Args:
        text: The string to process (map_reduce passes one sentence at a time).

    Returns:
        A list of (word, tag) tuples, one per kept token, in token order.
    """
    # Build the stopword set once per call so each membership test is a hash
    # lookup instead of calling stopwords.words() again for every token.
    english_stopwords = set(stopwords.words('english'))

    # Tokenize the text into words
    tokens = word_tokenize(text)

    # Remove stopwords and non-alphabetic words, normalising case
    words = [
        token.lower()
        for token in tokens
        if token.isalpha() and token.lower() not in english_stopwords
    ]

    # NOTE(review): tagging happens after filtering and lower-casing, so the
    # tagger sees a reduced word sequence rather than the original sentence —
    # this may influence the tags it assigns; confirm this is intended.
    return [(word, tag) for word, tag in pos_tag(words)]

In [17]:
def reducer(mapped_values):
    """Count how many times each (word, tag) pair occurs.

    Args:
        mapped_values: Iterable of hashable (word, tag) pairs, such as the
            flattened output of the map phase.

    Returns:
        A list of ((word, tag), count) tuples, ordered by the first
        appearance of each pair in ``mapped_values``. An empty input yields
        an empty list.
    """
    # Count the occurrences of each (word, tag) pair
    pair_counts = Counter(mapped_values)
    # Each item is already ((word, tag), count), and every key counted from
    # the input has a count of at least 1, so no filtering is required.
    return list(pair_counts.items())

In [18]:
# MapReduce function
def map_reduce(corpus):
    """Run the map phase over each sentence of ``corpus``, then reduce.

    Args:
        corpus: The full input text as a single string.

    Returns:
        The value produced by ``reducer`` for the combined mapper output of
        all sentences.
    """
    # Map phase: split into sentences, run the mapper on each one, and
    # collect every emitted item into a single flat list.
    flat_pairs = [
        pair
        for sentence in nltk.sent_tokenize(corpus)
        for pair in mapper(sentence)
    ]
    # Reduce phase: aggregate the flattened mapper output.
    return reducer(flat_pairs)

In [19]:
# Run the MapReduce program on the sample corpus and print the reduced output
result = map_reduce(corpus)
print(result)

[(('natural', 'JJ'), 6), (('language', 'NN'), 5), (('processing', 'NN'), 2), (('nlp', 'JJ'), 1), (('subfield', 'NN'), 1), (('linguistics', 'NNS'), 1), (('computer', 'NN'), 1), (('science', 'NN'), 1), (('information', 'NN'), 1), (('engineering', 'NN'), 1), (('artificial', 'JJ'), 1), (('intelligence', 'NN'), 1), (('concerned', 'VBN'), 1), (('interactions', 'NNS'), 1), (('computers', 'NNS'), 2), (('human', 'JJ'), 1), (('languages', 'NNS'), 1), (('particular', 'JJ'), 1), (('program', 'NN'), 1), (('process', 'VBP'), 1), (('analyze', 'JJ'), 1), (('large', 'JJ'), 1), (('amounts', 'NNS'), 1), (('data', 'NNS'), 1), (('challenges', 'NNS'), 1), (('frequently', 'RB'), 1), (('involve', 'VBP'), 1), (('speech', 'NN'), 1), (('recognition', 'NN'), 1), (('understanding', 'JJ'), 1), (('generation', 'NN'), 1)]
