In [137]:
from pyspark.sql import SparkSession

# Cluster size: (8 cores, 16 GB per machine) x 5 machines = 40 cores

# New API: build (or reuse) the session for this application. Dynamic
# allocation is enabled together with the external shuffle service, and
# executors that sit idle for 30s are given back to the cluster.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.1.153:7077")
    .appName("A_LU")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Old API (RDD): the SparkContext behind the session.
sc = spark_session.sparkContext


In [138]:
# Question A.1
# Read the English and French files as RDDs of lines and cache them, since
# both RDDs are used again after being counted here.
rdd_en = sc.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.fr-en.en").cache()
count_en = rdd_en.count()
rdd_fr = sc.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.fr-en.fr").cache()
count_fr = rdd_fr.count()

# One value per output line: both line counts, whether they agree, and the
# number of partitions of each RDD.
print(
    count_en,
    count_fr,
    count_en == count_fr,
    rdd_en.getNumPartitions(),
    rdd_fr.getNumPartitions(),
    sep="\n",
)

2007723
2007723
True
3
3


In [139]:
# Question A.2
def tokenise(rdd, sep=' '):
    """Lower-case every line of ``rdd`` and split it into a list of tokens.

    Args:
        rdd: RDD whose elements are text lines (strings).
        sep: Delimiter passed to ``str.split``. The default, a single space,
            keeps the original behaviour: repeated, leading or trailing
            spaces produce empty-string tokens and an empty line becomes
            ``['']``. Pass ``None`` to split on any run of whitespace, which
            never produces empty tokens (an empty line becomes ``[]``).

    Returns:
        A new RDD holding one list of lower-cased tokens per input line.
    """
    return rdd.map(lambda line: line.lower().split(sep))

# Tokenise both corpora; the results are cached because they are evaluated
# several times (take/count below and in later cells).
lines_en = tokenise(rdd_en).cache()
lines_fr = tokenise(rdd_fr).cache()

# Show the first ten tokenised sentences of each language.
for tokenised_lines in (lines_en, lines_fr):
    print(tokenised_lines.take(10))

# Check that both tokenised corpora have the same number of lines.
same_line_count = lines_en.count() == lines_fr.count()
print(same_line_count)

[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'behalf', 'of', '

In [140]:
# Question A.3
def word_count(rdd):
    """Count how often each token occurs, most frequent token first.

    Args:
        rdd: RDD whose elements are iterables of tokens (one per line).

    Returns:
        RDD of ``(token, count)`` pairs sorted by descending count.
    """
    def add(left, right):
        return left + right

    # Flatten the per-line token lists into a single stream of tokens.
    tokens = rdd.flatMap(lambda tokens_in_line: tokens_in_line)
    # Classic word count: emit (token, 1) and sum the ones per token.
    totals = tokens.map(lambda token: (token, 1)).reduceByKey(add)
    return totals.sortBy(lambda pair: pair[1], ascending=False)
            
# Build the frequency tables for both languages, then print the ten most
# frequent tokens of each (English first, French second).
wordCount_en = word_count(lines_en)
wordCount_fr = word_count(lines_fr)
for frequency_table in (wordCount_en, wordCount_fr):
    print(frequency_table.take(10))

[('the', 3846086), ('of', 1823194), ('to', 1692082), ('and', 1413924), ('in', 1191334), ('that', 877203), ('a', 850612), ('is', 832094), ('for', 584680), ('we', 578720)]
[('de', 2908562), ('la', 1890061), ('et', 1313600), ('le', 1260986), ('les', 1147249), ('à', 1100987), ('des', 1058652), ('que', 856810), ('en', 772241), ('nous', 607262)]


In [141]:
# Question A.4
# Number every sentence and move the line number into key position so the
# two corpora can be joined line by line.
en_1 = lines_en.zipWithIndex()
fr_1 = lines_fr.zipWithIndex()
en_2 = en_1.map(lambda x: (x[1], x[0]))
fr_2 = fr_1.map(lambda x: (x[1], x[0]))
ef_3 = en_2.join(fr_2)
# Drop pairs in which either sentence has no non-empty token. join() always
# yields (english, french) 2-tuples, so a len(pair) == 2 test keeps every
# pair and filters nothing; any() is False for both [] and [''].
ef_4 = ef_3.values().filter(lambda line: any(line[0]) and any(line[1]))
# Keep only short sentences (fewer than 10 tokens on the English side) ...
ef_5 = ef_4.filter(lambda line: len(line[0]) < 10)
# ... whose French counterpart has exactly the same number of tokens.
ef_6 = ef_5.filter(lambda line: len(line[0]) == len(line[1]))
# Pair the tokens position by position: (english_token, french_token).
ef_7 = ef_6.flatMap(lambda line: zip(line[0], line[1]))
# Count each token pair and order the pairs by descending frequency.
ef_8 = ef_7.map(lambda pair: (pair, 1))\
            .reduceByKey(lambda a, b : a + b)\
            .sortBy(lambda a: a[1], False)
print(ef_8.take(10))

[(('the', 'le'), 8074), (('is', 'est'), 7426), (('the', 'la'), 4814), (('we', 'nous'), 4014), (('debate', 'débat'), 3963), (('closed.', 'clos.'), 3889), (('i', 'je'), 3588), (('of', 'de'), 3092), (('(applause)', '(applaudissements)'), 2936), (('the', 'les'), 1903)]


In [143]:
# Stop the SparkContext now that the notebook is finished, so the cores it
# holds are released for other applications.
sc.stop()