# Hadoop Streaming Name Count

Write a WordCount program that counts all the names in the dataset. A name is a word with the following properties:

1. The first character is not a digit (other characters can be digits).
2. The first character is uppercase, all the other characters that are letters are lowercase.
3. Fewer than 0.5% of all occurrences of this word in the dataset (matched regardless of case) fail to meet condition (2).

    
Docker container: https://hub.docker.com/r/bigdatateam/yarn-notebook/  

# Mapper 1

In [2]:
%%writefile mapper1.py

import sys
import re

def iter_word_records(text):
    """Yield one ``(lowercased_word, 1, is_name_form)`` tuple per word of ``text``.

    ``text`` is first trimmed of non-word characters at both ends, then split
    on whitespace together with any punctuation adjacent to that whitespace.
    Words whose first character is not a letter are skipped.

    ``is_name_form`` is 1 when the first character is uppercase and the rest
    of the word is lowercase, otherwise 0.
    """
    # Without this trim the very first word keeps its leading punctuation
    # (and is then rejected by the isalpha() check) and the very last word
    # keeps its trailing punctuation inside the emitted key.
    text = re.sub(r'^\W+|\W+$', '', text)
    for word in re.split(r'\W*\s+\W*', text):
        # An empty token appears when the text is empty after trimming.
        if not word or not word[0].isalpha():
            continue
        # NOTE(review): str.islower() is False when the tail contains no cased
        # characters, so one-letter words ("I") and words like "X1" are never
        # flagged as name-form -- confirm this is the intended reading of rule (2).
        yield word.lower(), 1, int(word[0].isupper() and word[1:].islower())


def main():
    """Read ``article_id<TAB>text`` lines from stdin and print word records."""
    for line in sys.stdin:
        try:
            _article_id, text = line.strip().split('\t', 1)
        except ValueError:
            # Line has no tab separator: nothing to count, skip it.
            continue
        for record in iter_word_records(text):
            print(*record, sep="\t")


if __name__ == "__main__":
    main()

Writing mapper1.py


# Reducer 1

In [22]:
%%writefile reducer1.py

import sys

# Streaming group-by over lines of the form "word<TAB>count<TAB>name_flag".
# Lines arrive grouped by word; for every word one line is printed:
# "name_total<TAB>word_total<TAB>word".
active_word = None
occurrences = 0
name_form_hits = 0

for record in sys.stdin:
    try:
        word, seen, as_name = record.strip().split('\t', 2)
        seen = int(seen)
        as_name = int(as_name)

        if word == active_word:
            # Same group as the previous line: keep accumulating.
            occurrences += seen
            name_form_hits += as_name
            continue

        # A new word starts: flush the finished group (if there was one).
        if active_word:
            print(name_form_hits, occurrences, active_word, sep="\t")
        active_word, occurrences, name_form_hits = word, seen, as_name
    except Exception:
        # Malformed line (wrong field count / non-integer counters): skip it.
        continue

# Flush the last group, which has no following line to trigger the flush.
if active_word:
    print(name_form_hits, occurrences, active_word, sep="\t")

Overwriting reducer1.py


# Mapper 2

In [24]:
%%writefile mapper2.py

import sys

# Identity mapper: echo every input line with surrounding whitespace removed.
for raw_line in sys.stdin:
    record = raw_line.strip()
    try:
        print(record)
    except ValueError:
        continue

Overwriting mapper2.py


# Reducer 2

In [25]:
%%writefile reducer2.py

import sys

def is_name(name_count, total_count):
    """Return True when strictly fewer than 0.5% of occurrences are not name-form.

    ``name_count`` is the number of occurrences written in name form and
    ``total_count`` the number of all occurrences of the word.  The check is
    done in integer arithmetic, so the 0.5% boundary is exact (199 of 200 is
    exactly 0.5% non-name and therefore rejected) and a zero total cannot
    raise ZeroDivisionError.
    """
    if total_count <= 0:
        return False
    # (total - name) / total < 5 / 1000, cross-multiplied to stay in integers.
    return (total_count - name_count) * 1000 < total_count * 5


def select_names(lines):
    """Yield ``(word, name_count)`` for every word in ``lines`` that is a name.

    Each line has the form ``name_count<TAB>total_count<TAB>word``.
    Consecutive lines with the same word are summed before the name test is
    applied.  Lines that do not parse are skipped.
    """
    current_word = None
    total_count = 0
    total_caps = 0

    for line in lines:
        try:
            caps_count, count, key = line.strip().split('\t', 2)
            count = int(count)
            caps_count = int(caps_count)
        except ValueError:
            # Wrong number of fields or non-integer counters.
            continue

        if key != current_word:
            # A new word starts: decide on the group that just ended.
            if current_word and is_name(total_caps, total_count):
                yield current_word, total_caps
            current_word = key
            total_count = 0
            total_caps = 0

        total_caps += caps_count
        total_count += count

    # The last group has no following line to trigger its emission.
    if current_word and is_name(total_caps, total_count):
        yield current_word, total_caps


def main():
    """Print ``word<TAB>name_count`` for every name found on stdin."""
    for word, name_count in select_names(sys.stdin):
        print(word, name_count, sep="\t")


if __name__ == "__main__":
    main()

Overwriting reducer2.py


# Bash

In [1]:
%%bash

# Stop at the first failing command or unset variable, so that a failed first
# job does not silently feed a missing directory into the second job.
set -eu

HADOOP_STREAMING_JAR=/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar

# Timestamped HDFS output directories, one per job.
OUT_DIR_1="part_1_$(date +"%s%6N")"
OUT_DIR_2="part_2_$(date +"%s%6N")"

# -f keeps the cleanup quiet and successful when nothing matches; the glob is
# quoted so it is expanded by HDFS rather than against local files.
hdfs dfs -rm -r -f -skipTrash "${OUT_DIR_1}*" > /dev/null
hdfs dfs -rm -r -f -skipTrash "${OUT_DIR_2}*" > /dev/null


# Job 1: per-word totals (name-form count, total count).
# Only stdout is discarded; anything written to stderr still reaches the cell output.
yarn jar "${HADOOP_STREAMING_JAR}" \
    -files mapper1.py,reducer1.py \
    -mapper 'python3 mapper1.py' \
    -reducer 'python3 reducer1.py' \
    -numReduceTasks 6 \
    -input /data/wiki/en_articles_part \
    -output "${OUT_DIR_1}" > /dev/null


# Job 2: filter names, ordered by the numeric-reverse key comparator.
# NOTE(review): the key spec spans fields 1-3, but no
# stream.num.map.output.key.fields is set here, so presumably only the first
# tab-separated field (the name-form count) forms the key -- confirm.
yarn jar "${HADOOP_STREAMING_JAR}" \
    -D mapreduce.job.output.key.comparator.class=org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedComparator \
    -D mapreduce.partition.keycomparator.options="-k1,3nr" \
    -files mapper2.py,reducer2.py \
    -mapper 'python3 mapper2.py' \
    -reducer 'python3 reducer2.py' \
    -numReduceTasks 1 \
    -input "${OUT_DIR_1}" \
    -output "${OUT_DIR_2}" > /dev/null


# Print only line 5 of the result; sed stops reading at line 8.
hdfs dfs -cat "${OUT_DIR_2}/part-00000" | sed -n "5p;8q"

Couldn't find program: 'bash'
