In [20]:
# Word-count fixture: three newline-separated lines ("documents").
input_ = (
    'aut Caesar aut nihil\n'
    'aut aut\n'
    'de mortuis aut bene aut nihil'
)

In [30]:
from collections import defaultdict
import sys

In [178]:
# map + reduce
class Mapper():
    """Word-count mapper that emits a count of one for every token."""

    def map(self, input_):
        """Lazily yield a ``token<TAB>1`` string per whitespace-separated token.

        Args:
            input_: Text to tokenize; surrounding whitespace is ignored.
        """
        words = input_.strip().split()
        yield from (word + '\t' + '1' for word in words)
            
class Reducer():
    """Word-count reducer for ``key<TAB>count`` records grouped by key."""

    def reduce(self, input_):
        """Print one ``key<TAB>total`` line per run of identical keys.

        Args:
            input_: Newline-separated ``key<TAB>count`` records. Only
                adjacent records with the same key are merged, so the caller
                must sort (or otherwise group) the records by key first.

        Blank lines are skipped, so empty input prints nothing instead of
        failing while unpacking a record.

        Raises:
            ValueError: If a non-blank line is not ``key<TAB>integer``.
        """
        last_key, sum_ = None, 0

        for line in input_.strip().split('\n'):
            line = line.strip()
            if not line:
                # Empty input or a stray blank line: nothing to parse.
                continue
            key, value = line.split('\t')
            # Compare against None explicitly: only "no key seen yet" should
            # suppress the flush of the previous group.
            if last_key is not None and key != last_key:
                print('\t'.join([last_key, str(sum_)]))
                sum_ = 0
            last_key = key
            sum_ += int(value)

        # Flush the final group, if any record was seen at all.
        if last_key is not None:
            print('\t'.join([last_key, str(sum_)]))

In [187]:
mapper = Mapper()
# Feed the mapper one line ("document") at a time, the way a streaming job
# would read sys.stdin. Splitting on '\n' (not on all whitespace) keeps the
# unit of work a line, consistent with the other driver cells.
pairs = []
for doc in input_.strip().split('\n'):  # sys.stdin
    pairs.extend(mapper.map(doc))
# sort before reduce: the reducer only merges adjacent records with equal keys
mapped = '\n'.join(sorted(pairs))
reducer = Reducer()
reducer.reduce(mapped)

Caesar	1
aut	6
bene	1
de	1
mortuis	1
nihil	2


In [200]:
# in-mapper combining v.1
class MapperWithCombinerV1():
    """Word-count mapper that combines counts within a single map() call."""

    def map(self, input_):
        """Lazily yield one ``token<TAB>count`` string per distinct token.

        Counts cover only the tokens of this ``input_``; nothing is kept
        between calls. Records follow the dict's iteration order.
        """
        tally = {}
        for word in input_.strip().split():
            tally[word] = tally.get(word, 0) + 1
        for word in tally:
            yield word + '\t' + str(tally[word])

In [204]:
mapper = MapperWithCombinerV1()
# Each line of the fixture stands in for one line of sys.stdin.
chunks = []
for doc in input_.strip().split('\n'):
    for pair in mapper.map(doc):
        # Every record is prefixed with '\n', so `mapped` starts with an empty line.
        chunks.append('\n' + pair)
mapped = ''.join(chunks)
# Sort before reducing: the reducer only merges adjacent records with equal keys.
sorted_lines = mapped.split('\n')
sorted_lines.sort()
mapped_sorted = '\n'.join(sorted_lines)
reducer = Reducer()
reducer.reduce(mapped_sorted)

Caesar	1
aut	6
bene	1
de	1
mortuis	1
nihil	2


In [205]:
# Inspect the raw, unsorted mapper output: counts are combined per input line
# only, so a token can repeat across lines. The blank first line comes from the
# leading '\n' that prefixes every record in `mapped`.
print(mapped)


aut	2
Caesar	1
nihil	1
aut	2
de	1
mortuis	1
aut	2
bene	1
nihil	1


In [215]:
# in-mapper combining v.2
class MapperWithCombinerV2():
    """Word-count mapper that combines counts across all map() calls."""

    def __init__(self):
        # Running token -> count tally shared by every map() call.
        self.counter = defaultdict(int)

    def map(self, input_):
        """Add the tokens of ``input_`` to the running tally; emits nothing."""
        tally = self.counter
        for word in input_.strip().split():
            tally[word] = tally[word] + 1

    def close(self):
        """Lazily yield ``token<TAB>count`` for every token tallied so far."""
        for word, total in self.counter.items():
            yield '{}\t{}'.format(word, total)

In [216]:
mapper = MapperWithCombinerV2()
# Each line of the fixture stands in for one line of sys.stdin; map() only
# accumulates, so nothing is collected until close().
for doc in input_.strip().split('\n'):
    mapper.map(doc)
chunks = []
for pair in mapper.close():
    # Every record is prefixed with '\n', so `mapped` starts with an empty line.
    chunks.append('\n' + pair)
mapped = ''.join(chunks)
# Sort before reducing: the reducer only merges adjacent records with equal keys.
sorted_lines = mapped.split('\n')
sorted_lines.sort()
mapped_sorted = '\n'.join(sorted_lines)
reducer = Reducer()
reducer.reduce(mapped_sorted)

Caesar	1
aut	6
bene	1
de	1
mortuis	1
nihil	2


In [217]:
# Inspect the raw output of close(): one record per distinct token, because the
# tally spans every map() call. The blank first line comes from the leading
# '\n' that prefixes every record in `mapped`.
print(mapped)


aut	6
Caesar	1
nihil	2
de	1
mortuis	1
bene	1


In [218]:
# Fixture for the mean reducer: `key<TAB>integer` records, equal keys adjacent.
input_ = (
    'www.facebook.com\t100\n'
    'www.google.com\t10\n'
    'www.google.com\t5\n'
    'www.google.com\t15\n'
    'www.stepic.org\t60\n'
    'www.stepic.org\t100'
)

In [219]:
# Render the fixture with real tabs and newlines to check its layout.
print(input_)

www.facebook.com	100
www.google.com	10
www.google.com	5
www.google.com	15
www.stepic.org	60
www.stepic.org	100


In [256]:
# reducer for mean values
# class MapperMean():
#     def map(self, t, r):
#         yield '\t'.join([t, r])

class ReducerMean():
    """Reducer that prints the integer mean of the values of each key."""

    def reduce(self, input_):
        """Print ``key<TAB>mean`` for each run of identical keys.

        Args:
            input_: Newline-separated ``key<TAB>integer`` records. Only
                adjacent records with the same key are averaged together, so
                the caller must group the records by key first.

        The mean is computed with floor division (``//``), so any fractional
        part is discarded. Blank lines are skipped, so empty input prints
        nothing instead of failing while unpacking a record.

        Raises:
            ValueError: If a non-blank line is not ``key<TAB>integer``.
        """
        last_key, sum_, count_ = None, 0, 0
        for line in input_.strip().split('\n'):
            line = line.strip()
            if not line:
                # Empty input or a stray blank line: nothing to parse.
                continue
            k, v = line.split('\t')

            # Compare against None explicitly: only "no key seen yet" should
            # suppress the flush of the previous group.
            if last_key is not None and k != last_key:
                print('\t'.join([last_key, str(sum_ // count_)]))
                sum_, count_ = 0, 0
            last_key = k
            sum_ += int(v)
            count_ += 1

        # Flush the final group; count_ is at least 1 whenever a key was seen.
        if last_key is not None:
            print('\t'.join([last_key, str(sum_ // count_)]))

In [257]:
# Run the mean reducer over the current `input_` fixture; the per-key means are
# printed by reduce(), not returned.
reducer = ReducerMean()
reducer.reduce(input_)

www.facebook.com	100
www.google.com	10
www.stepic.org	80


In [258]:
# Fixture for the mean combiner: `key<TAB>sum;count` records, equal keys adjacent.
input_ = (
    'www.facebook.com\t100;1\n'
    'www.google.com\t10;1\n'
    'www.google.com\t5;1\n'
    'www.google.com\t15;1\n'
    'stepic.org\t60;1\n'
    'stepic.org\t100;1'
)

In [259]:
# Render the `key<TAB>sum;count` fixture with real tabs and newlines.
print(input_)

www.facebook.com	100;1
www.google.com	10;1
www.google.com	5;1
www.google.com	15;1
stepic.org	60;1
stepic.org	100;1


In [264]:
# combiner mean
class CombinerMean():
    """Combiner that merges partial ``sum;count`` pairs per key."""

    def combine(self, input_):
        """Print ``key<TAB>sum;count`` for each run of identical keys.

        Args:
            input_: Newline-separated ``key<TAB>sum;count`` records. Only
                adjacent records with the same key are merged, so the caller
                must group the records by key first.

        Both fields are accumulated: the count comes from the record itself
        rather than being assumed to be 1, so already-combined records
        (e.g. ``30;3``) can be fed through again without losing weight.
        Blank lines are skipped, so empty input prints nothing.

        Raises:
            ValueError: If a non-blank line is not ``key<TAB>int;int``.
        """
        last_key, sum_, count_ = None, 0, 0
        for line in input_.strip().split('\n'):
            line = line.strip()
            if not line:
                # Empty input or a stray blank line: nothing to parse.
                continue
            k, v = line.split('\t')
            s, n = v.split(';')

            # Compare against None explicitly: only "no key seen yet" should
            # suppress the flush of the previous group.
            if last_key is not None and k != last_key:
                print('\t'.join([last_key, str(sum_) + ';' + str(count_)]))
                sum_, count_ = 0, 0
            last_key = k
            sum_ += int(s)
            # Add the record's own count so partial aggregates stay correct.
            count_ += int(n)

        # Flush the final group, if any record was seen at all.
        if last_key is not None:
            print('\t'.join([last_key, str(sum_) + ';' + str(count_)]))

In [265]:
# Run the combiner over the current `input_` fixture; the merged `sum;count`
# records are printed by combine(), not returned.
combiner = CombinerMean()
combiner.combine(input_)

www.facebook.com	100;1
www.google.com	30;3
stepic.org	160;2


In [266]:
# Fixture for MapperDistinctV1: `key<TAB>comma-separated items` records.
input_ = (
    '1\ta,b\n'
    '2\ta,d,e\n'
    '1\tb\n'
    '3\ta,b'
)

In [267]:
# Render the fixture with real tabs and newlines to check its layout.
print(input_)

1	a,b
2	a,d,e
1	b
3	a,b


In [284]:
# Distinct Values v1
class MapperDistinctV1():
    """Mapper that explodes a ``key<TAB>a,b,...`` record into per-item pairs."""

    def map(self, input_):
        """Print ``key,item<TAB>1`` for every comma-separated item of the record.

        Args:
            input_: A single ``key<TAB>item[,item...]`` record.

        Raises:
            ValueError: If the record does not hold exactly one tab.
        """
        key, items = input_.strip().split('\t')
        for item in items.split(','):
            print('{},{}\t{}'.format(key, item, 1))

In [285]:
# Feed the mapper one record (line) at a time; map() prints its output itself.
mapper = MapperDistinctV1()
for doc in input_.strip().split('\n'):
    mapper.map(doc)

1,a	1
1,b	1
2,a	1
2,d	1
2,e	1
1,b	1
3,a	1
3,b	1


In [286]:
# Fixture for ReducerDistinctV1: `key,item<TAB>1` records with equal keys
# adjacent (note the repeated `1,b`).
input_ = (
    '1,a\t1\n'
    '1,b\t1\n'
    '1,b\t1\n'
    '2,a\t1\n'
    '2,d\t1\n'
    '2,e\t1\n'
    '3,a\t1\n'
    '3,b\t1'
)

In [292]:
# Render the fixture with real tabs and newlines to check its layout.
print(input_)

1,a	1
1,b	1
1,b	1
2,a	1
2,d	1
2,e	1
3,a	1
3,b	1


In [290]:
class ReducerDistinctV1():
    """Reducer that prints each key once per run of identical keys."""

    def reduce(self, input_):
        """Print every key of ``input_`` the first time it starts a new run.

        Args:
            input_: Newline-separated ``key<TAB>value`` records. Duplicates
                are only collapsed when they are adjacent, so the caller must
                group the records by key first. The value is ignored.

        Blank lines are skipped, so empty input prints nothing instead of
        failing while unpacking a record.

        Raises:
            ValueError: If a non-blank line does not hold exactly one tab.
        """
        last_k = None

        for line in input_.strip().split('\n'):
            line = line.strip()
            if not line:
                # Empty input or a stray blank line: nothing to parse.
                continue
            k, _ = line.split('\t')
            if k != last_k:
                print(k)
                last_k = k

In [291]:
# Run the reducer over the current `input_` fixture; the distinct keys are
# printed by reduce(), not returned.
reducer = ReducerDistinctV1()
reducer.reduce(input_)

1,a
1,b
2,a
2,d
2,e
3,a
3,b


In [293]:
# Fixture for MapperDistinctV2: one `first,second` pair per line.
input_ = (
    '1,a\n'
    '2,a\n'
    '3,a\n'
    '1,b\n'
    '3,b\n'
    '2,d\n'
    '2,e'
)

In [294]:
# Render the fixture one pair per line to check its layout.
print(input_)

1,a
2,a
3,a
1,b
3,b
2,d
2,e


In [299]:
# Distinct Values v1
class MapperDistinctV2():
    """Mapper that re-keys a ``first,second`` pair by its second field."""

    def map(self, input_):
        """Print ``second<TAB>1`` for a ``first,second`` record.

        The first field is parsed but not used.

        Raises:
            ValueError: If the record does not hold exactly one comma.
        """
        _first, second = input_.strip().split(',')
        print(second + '\t' + '1')

In [300]:
# Feed the mapper one pair (line) at a time; map() prints its output itself.
mapper = MapperDistinctV2()
for doc in input_.strip().split('\n'):
    mapper.map(doc)

a	1
a	1
a	1
b	1
b	1
d	1
e	1


In [2]:
# Fixture for ReducerDistinctV2: `key<TAB>value` records with equal pairs
# adjacent (note the repeated `1<TAB>b`).
input_ = (
    '1\ta\n'
    '1\tb\n'
    '1\tb\n'
    '2\ta\n'
    '2\td\n'
    '2\te\n'
    '3\ta\n'
    '3\tb'
)

In [4]:
# Render the fixture with real tabs and newlines to check its layout.
print(input_)

1	a
1	b
1	b
2	a
2	d
2	e
3	a
3	b


In [28]:
# Distinct Values v2
class ReducerDistinctV2():
    """Reducer that counts, per value, how many distinct (key, value) runs it saw."""

    def __init__(self):
        # value -> number of distinct adjacent (key, value) pairs seen so far.
        self.counter = defaultdict(int)

    def reduce(self, input_):
        """Tally the values of ``input_``, counting each run of equal pairs once.

        Args:
            input_: Newline-separated ``key<TAB>value`` records. Duplicate
                pairs are only collapsed when they are adjacent, so the
                caller must sort (or group) the records first.

        Nothing is printed here; call close() to emit the totals. Blank lines
        are skipped, so empty input leaves the counter untouched instead of
        failing while unpacking a record.

        Raises:
            ValueError: If a non-blank line does not hold exactly one tab.
        """
        last_pair = None

        for line in input_.strip().split('\n'):
            line = line.strip()
            if not line:
                # Empty input or a stray blank line: nothing to parse.
                continue
            k, v = line.split('\t')
            pair = (k, v)
            if pair != last_pair:
                self.counter[v] += 1
                last_pair = pair

    def close(self):
        """Print ``value<TAB>count`` for every value tallied so far."""
        for k, v in self.counter.items():
            print(k, v, sep='\t')

In [29]:
# reduce() only updates the reducer's counter; close() is what prints the
# per-value totals.
reducer = ReducerDistinctV2()
reducer.reduce(input_)
reducer.close()

a	3
b	2
d	1
e	1


In [31]:
# Fixture for the cross-correlation "pairs" mapper: space-separated items,
# one tuple per line.
input_ = (
    'a b\n'
    'a b a c'
)

In [32]:
# Render the fixture one tuple per line to check its layout.
print(input_)

a b
a b a c


In [36]:
# Cross-Correlation: Pairs
class MapperCrossCorrelationPairs():
    """Cross-correlation mapper using the "pairs" layout."""

    def map(self, input_):
        """Print ``left,right<TAB>1`` for every ordered pair of unequal items.

        Args:
            input_: One line of items separated by single spaces.

        Items are compared by value, so an item is never paired with another
        occurrence of itself; repeated items do produce repeated pairs.
        """
        items = input_.strip().split(' ')
        for left in items:
            partners = (other for other in items if other != left)
            for right in partners:
                print(left + ',' + right + '\t' + '1')

In [37]:
# Feed the mapper one tuple (line) at a time; map() prints its pairs itself.
mapper = MapperCrossCorrelationPairs()
for doc in input_.strip().split('\n'):
    mapper.map(doc)

a,b	1
b,a	1
a,b	1
a,c	1
b,a	1
b,a	1
b,c	1
a,b	1
a,c	1
c,a	1
c,b	1
c,a	1


In [38]:
# Fixture for the cross-correlation "stripes" mapper: space-separated items,
# one tuple per line.
input_ = (
    'a b\n'
    'a b a c'
)

In [39]:
# Render the fixture one tuple per line to check its layout.
print(input_)

a b
a b a c


In [51]:
# Cross-Correlation: Stripes
class MapperCrossCorrelationStripes():
    """Cross-correlation mapper using the "stripes" layout."""

    def map(self, input_):
        """Print ``item<TAB>other:count,...`` once per item occurrence.

        Args:
            input_: One line of items separated by single spaces.

        Each stripe lists how often every *other* item value occurs on the
        line; an item with no differing neighbours gets an empty stripe.
        """
        items = input_.strip().split(' ')
        # Tally the whole line once; a stripe is this tally minus the item itself.
        totals = {}
        for item in items:
            totals[item] = totals.get(item, 0) + 1
        for item in items:
            stripe = ','.join(
                other + ':' + str(times)
                for other, times in totals.items()
                if other != item
            )
            print(item, stripe, sep='\t')

In [52]:
# Feed the mapper one tuple (line) at a time; map() prints its stripes itself.
mapper = MapperCrossCorrelationStripes()
for doc in input_.strip().split('\n'):
    mapper.map(doc)

a	b:1
b	a:1
a	b:1,c:1
b	a:2,c:1
a	b:1,c:1
c	a:2,b:1
