In [2]:
from pyspark.sql import SparkSession


spark_session = SparkSession\
        .builder\
        .master("local[1]") \
        .appName("novellarausell_lecture1_simple_example")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",4)\
        .getOrCreate()
        
spark_context = spark_session.sparkContext


# Part A - Working with the RDD API

## Question A.1

### A.1.1 Read the English transcripts with Spark, and count the number of lines.

In [3]:
# First we need to access the given files, which are in HDFS. The Namenode and the host name of the master containing
# such files is 192.168.1.153
en_lines = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.en")
nr_en = en_lines.count()
print("The number of lines in the English transcript: {}".format(nr_en))

The number of lines in the English transcript: 1862234


### A.1.2 Do the same with the other language (so that you have a separate lineage of RDDs for each).

In [4]:
sv_lines = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.sv")
nr_sv = sv_lines.count()
print("The number of lines in the Swedish transcript: {}".format(nr_sv))

The number of lines in the Swedish transcript: 1862234


### A.1.3 Verify that the line counts are the same for the two languages.

In [22]:
# Asserting that both lengths are the same!
assert (nr_en == nr_sv), "Not the same length!"

### A.1.4 Count the number of partitions.

In [44]:
# The number of partitions is the number of blocks used by HDFS to store the file 
print("Number of partitions in the English transcript: {} \n".format(en_lines.getNumPartitions()))
print("Number of partitions in the Swedish transcript: {} \n".format(sv_lines.getNumPartitions()))

Number of partitions in the English transcript: 2 

Number of partitions in the Swedish transcript: 3 



## Question A.2

### A.2.1 Pre-process the text from both RDDs by doing the following:
 - Lowercase the text
 - Tokenize the text (split on space)

In [10]:
import string

def rddtolower(x):
    return x.lower().strip().translate(str.maketrans("", "", string.punctuation))

def rddtokenizer(x):
    words = x.split(' ')
    for word in words:
        return tuple((word,1))

In [46]:
sv_tokens = sv_lines.map(lambda x: rddtolower(x)).map(lambda y: rddtokenizer(y))
en_tokens = en_lines.map(lambda x: rddtolower(x)).map(lambda y: rddtokenizer(y))

### A.2.2 Inspect 10 entries from each of your RDDs to verify your pre-processing

In [47]:
print("English transcript inspection: \n {}".format(en_tokens.take(10)))
print("Swedish transcript inspection: \n {}".format(sv_tokens.take(10)))

English transcript inspection: 
 [('resumption', 1), ('i', 1), ('although', 1), ('you', 1), ('in', 1), ('please', 1), ('the', 1), ('madam', 1), ('you', 1), ('one', 1)]
Swedish transcript inspection: 
 [('återupptagande', 1), ('jag', 1), ('som', 1), ('ni', 1), ('till', 1), ('jag', 1), ('parlamentet', 1), ('fru', 1), ('ni', 1), ('en', 1)]


### A.2.3 Verify that the line counts still match after the pre-processing.

In [None]:
# Asserting that both lengths are the same!
en_tokens.count()

assert (en_tokens.count() == sv_tokens.count()), "Not the same length!"

## Question A.3

### A.3.1 Use Spark to compute the 10 most frequently according words in the English language corpus. Repeat for the other language.

In [106]:
from operator import add
en_mostcommon = en_tokens.reduceByKey(add).takeOrdered(10, key = lambda x: -x[1])
sv_mostcommon = sv_tokens.reduceByKey(add).takeOrdered(10, key = lambda x: -x[1])

In [119]:
print("The 10 most common words on the English corpus are: \n" + ",".join([pair[0] for pair in en_mostcommon]))
print("The 10 most common words on the Swedish corpus are: \n" + ",".join([pair[0] for pair in sv_mostcommon]))

The 10 most common words on the English corpus are: 
the,i,we,it,in,this,mr,that,as,however
The 10 most common words on the Swedish corpus are: 
jag,det,vi,i,detta,för,herr,den,de,men


### A.3.2 Verify that your results are reasonable

## Question A.4

### A.4.1 Use this parallel corpus to mine some translations in the form of word pairs, for the two languages. Do this by pairing words found on short lines with the same number of words respectively. We (incorrectly) assume the words stay in the same order when translated. 

#### 1. Key the lines by their line number (hint: ZipWithIndex())

In [5]:
en_1 = en_lines.zipWithIndex()
sv_1 = sv_lines.zipWithIndex()

#### 2. Swap the key and value - so that the line number is the key

In [6]:
en_2 = en_1.map(lambda x: (x[1], x[0]), en_1)
sv_2 = sv_1.map(lambda x: (x[1], x[0]), sv_1)

#### 3. Join the two RDDs together according to the line number key, so you have pairs of matching lines

In [7]:
ensv_3 = en_2.join(sv_2)

#### 4. Filter to exclude line pairs that have an empty/missing “corresponding” sentence.
#### 5. Filter to leave only pairs of sentences with a small number of words per sentence, this should give a more reliable translation (you can experiment)
#### 6. Filter to leave only pairs of sentences with the same number of words in each sentence.

In [13]:
ensv_456 = ensv_3.filter(lambda x: x if x[1][0] is not None or x[1][1] is not None else None)\
.filter(lambda x: x if len(x[1][0].split(' ')) < 5 and len(x[1][1].split(' ')) < 5 else None)\
.filter(lambda x: x if len(x[1][0].split(' ')) == len(x[1][1].split(' ')) else None)




In [16]:
ensv_456.take(6)

[(50, ('Agenda', 'Arbetsplan')),
 (255, ('The debate is closed.', 'Jag förklarar debatten avslutad.')),
 (1295, ('Why?', 'Varför?')),
 (1670, ('.', '')),
 (3100, ('1997 discharge', 'Ansvarsfrihet 1997')),
 (3445, ('Is this possible?', 'Är det acceptabelt?'))]