In [9]:
%%writefile mapper.py

import sys
import re
import collections

# Tokens are separated by runs of whitespace; any non-word characters
# touching that whitespace are swallowed by the separator. A raw string is
# used so that \W and \s are regex escapes rather than (invalid) string escapes.
TOKEN_SEPARATOR = re.compile(r"\W*\s+\W*")


def load_stop_words(path='stop_words_en.txt'):
    """Read whitespace-separated stop words from `path` into a frozenset.

    A set gives constant-time membership checks; the mapper performs one
    lookup per token, so a list would make every lookup a linear scan.
    """
    with open(path) as f:
        return frozenset(f.read().split())


def term_frequencies(text, stop_words):
    """Return `[(word, tf), ...]` for one article, sorted by word.

    Tokens are lowercased and stop words are dropped before counting.
    `tf` is the token count divided by the number of tokens left after
    stop-word removal. Tokens that are not purely alphabetic still count
    towards that total but are not emitted.
    """
    tokens = (token.lower() for token in TOKEN_SEPARATOR.split(text))
    counts = collections.Counter(t for t in tokens if t not in stop_words)
    total = sum(counts.values())
    return [
        (word, float(count) / float(total))
        for word, count in sorted(counts.items())
        if word.isalpha()
    ]


def map_records(lines, stop_words):
    """Yield `(word, article_id, tf)` for every `article_id<TAB>text` line.

    Lines that contain no tab after stripping surrounding whitespace are
    skipped, matching the original best-effort behaviour.
    """
    for line in lines:
        article_id, tab, text = line.strip().partition('\t')
        if not tab:
            continue
        for word, tf in term_frequencies(text, stop_words):
            yield word, article_id, tf


def main():
    """Stream stdin to stdout as tab-separated `word, article_id, tf` rows."""
    stop_words = load_stop_words()
    for word, article_id, tf in map_records(sys.stdin, stop_words):
        print(word, article_id, str(tf), sep="\t")


if __name__ == "__main__":
    main()

Overwriting mapper.py


In [10]:
%%writefile reducer.py

import sys
import math

def inverse_document_frequency(doc_count):
    """Return the IDF weight `1 / ln(1 + doc_count)` used by this job."""
    return 1.0 / math.log(1 + doc_count)


def score_word(word, articles):
    """Return `[(word, article_id, tf * idf), ...]` for one finished word.

    `articles` maps article id -> term frequency; its size is the number of
    articles that contain `word`.
    """
    idf = inverse_document_frequency(len(articles))
    return [
        (word, article_id, article_tf * idf)
        for article_id, article_tf in articles.items()
    ]


def reduce_records(lines):
    """Yield `(word, article_id, tfidf)` from `word<TAB>article_id<TAB>tf` lines.

    Input must be grouped by word (all rows of a word adjacent). Rows that do
    not have three tab-separated fields, or whose tf is not a number, are
    skipped before they can affect the current group.
    """
    current_word = None
    articles = {}

    for line in lines:
        try:
            word, article_id, tf_text = line.strip().split('\t', 2)
            tf_value = float(tf_text)
        except ValueError:
            continue

        if word != current_word:
            if current_word is not None:
                yield from score_word(current_word, articles)
            current_word = word
            articles = {}

        # Uses this row's own tf. The previous version reused the name `tf`
        # as the flush-loop variable, so the first row of every new word was
        # stored with the previous word's last tf.
        articles[article_id] = tf_value

    if current_word is not None:
        yield from score_word(current_word, articles)


def main():
    """Stream stdin to stdout as tab-separated `word, article_id, tfidf` rows."""
    for word, article_id, score in reduce_records(sys.stdin):
        print(word, article_id, str(score), sep="\t")


if __name__ == "__main__":
    main()

Overwriting reducer.py


In [13]:
%%bash
# Submit the TF-IDF streaming job and print the score of "labor" in article 12.
HADOOP_STREAMING_JAR="/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar"
# Timestamped output directory so repeated runs do not collide.
OUT_DIR="TDIDF_$(date +"%s%6N")"
NUM_REDUCERS=4

# -f: stay quiet when the directory does not exist (the usual case, since the
# name is freshly generated).
hdfs dfs -rm -r -f -skipTrash "${OUT_DIR}" > /dev/null

# Generic -D options must precede the streaming-specific options.
# Rows are partitioned on the first key field (the word), so every row of a
# word reaches the same reducer.
yarn jar "${HADOOP_STREAMING_JAR}" \
    -D mapreduce.job.name="TDIDF" \
    -D mapreduce.job.reduces="${NUM_REDUCERS}" \
    -D mapreduce.partition.keypartitioner.options=-k1,1 \
    -files mapper.py,reducer.py,/datasets/stop_words_en.txt \
    -mapper "python3 mapper.py" \
    -reducer "python3 reducer.py" \
    -numReduceTasks "${NUM_REDUCERS}" \
    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
    -input /data/wiki/en_articles_part \
    -output "${OUT_DIR}" > /dev/null

# Output columns: word, article_id, tfidf.
hdfs dfs -cat "${OUT_DIR}"/part* | grep -w "labor" | grep -w "12" | cut -f 3

Couldn't find program: 'bash'
