In [5]:
from pyspark.sql import SparkSession


# Extra Spark settings, applied to the builder in the order listed.
# NOTE(review): dynamic allocation / executor settings are combined with a
# "local[1]" master here - confirm they are all intended for this setup.
spark_options = [
    ("spark.dynamicAllocation.enabled", True),
    ("spark.shuffle.service.enabled", True),
    ("spark.dynamicAllocation.executorIdleTimeout", "30s"),
    ("spark.executor.cores", 4),
]

session_builder = (
    SparkSession.builder
    .master("local[1]")
    .appName("novellarausell_lecture1_simple_example")
)
for option_name, option_value in spark_options:
    session_builder = session_builder.config(option_name, option_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark_session = session_builder.getOrCreate()

# The RDD API used in Part A is reached through the SparkContext.
spark_context = spark_session.sparkContext


# Part A - Working with the RDD API

## Question A.1

### A.1.1 Read the English transcripts with Spark, and count the number of lines.

In [20]:
# The transcripts are stored in HDFS; the URL below addresses the namenode at
# 192.168.1.153 (port 9000).
en_path = "hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.en"
en_lines = spark_context.textFile(en_path)

# count() is an action: it triggers the actual read of the file.
nr_en = en_lines.count()
en_count_message = "The number of lines in the English transcript: {}"
print(en_count_message.format(nr_en))

The number of lines in the English transcript: 1862234


### A.1.2 Do the same with the other language (so that you have a separate lineage of RDDs for each).

In [21]:
# Swedish side of the corpus, loaded as its own RDD (separate lineage from the
# English one).
sv_path = "hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.sv"
sv_lines = spark_context.textFile(sv_path)

nr_sv = sv_lines.count()
sv_count_message = "The number of lines in the Swedish transcript: {}"
print(sv_count_message.format(nr_sv))

The number of lines in the Swedish transcript: 1862234


### A.1.3 Verify that the line counts are the same for the two languages.

In [22]:
# Both transcripts must contain the same number of lines; fail loudly otherwise.
raw_counts_match = nr_en == nr_sv
assert raw_counts_match, "Not the same length!"

### A.1.4 Count the number of partitions.

In [27]:
# NOTE(review): the partition count presumably follows the number of HDFS
# blocks / input splits each file occupies - confirm against the cluster setup.
en_partition_count = en_lines.getNumPartitions()
print("Number of partitions in the English transcript: {} \n".format(en_partition_count))

sv_partition_count = sv_lines.getNumPartitions()
print("Number of partitions in the Swedish transcript: {} \n".format(sv_partition_count))

Number of partitions in the English transcript: 2 

Number of partitions in the Swedish transcript: 3 



## Question A.2

### A.2.1 Pre-process the text from both RDDs by doing the following:
 - Lowercase the text
 - Tokenize the text (split on space)

In [76]:
import string

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call, since rddtolower runs once per
# line of the corpus.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def rddtolower(x):
    """Lowercase a line of text and remove ASCII punctuation.

    Despite the name, this does more than lowercasing: every character listed
    in ``string.punctuation`` is dropped from the result. Non-ASCII letters
    (e.g. Swedish characters) are lowercased and kept.

    Args:
        x: A single line of text (``str``).

    Returns:
        The lowercased line with punctuation characters removed.
    """
    return x.lower().translate(_PUNCTUATION_TABLE)

def rddtokenizer(x):
    """Build a ``(token, 1)`` pair from the first space-separated token of a line.

    NOTE(review): only the FIRST token of the line is returned; every other
    word on the line is discarded. If the intent is to tokenize the whole
    line, this would need to return all tokens (and callers would need to
    adapt, e.g. flatMap instead of map) - confirm the intended behavior.

    Args:
        x: A single line of text (``str``).

    Returns:
        A tuple ``(first_token, 1)`` where ``first_token`` is the text before
        the first space, stripped of surrounding whitespace. For an empty
        line this is ``('', 1)``.
    """
    # str.split(' ') always yields at least one element (an empty string for
    # empty input), so indexing the first piece is always valid.
    leading_token = x.split(' ')[0]
    return (leading_token.strip(), 1)

In [77]:
# Same two-step pipeline for both languages: normalise each line, then turn it
# into a (token, 1) pair. map() keeps one output element per input line.
sv_normalised = sv_lines.map(rddtolower)
sv_tokens = sv_normalised.map(rddtokenizer)

en_normalised = en_lines.map(rddtolower)
en_tokens = en_normalised.map(rddtokenizer)

### A.2.2 Inspect 10 entries from each of your RDDs to verify your pre-processing

In [78]:
# Pull a small sample from each RDD to eyeball the pre-processing result.
en_sample = en_tokens.take(10)
print("English transcript inspection: \n {}".format(en_sample))

sv_sample = sv_tokens.take(10)
print("Swedish transcript inspection: \n {}".format(sv_sample))

English transcript inspection: 
 [('resumption', 1), ('i', 1), ('although', 1), ('you', 1), ('in', 1), ('please', 1), ('the', 1), ('madam', 1), ('you', 1), ('one', 1)]
Swedish transcript inspection: 
 [('återupptagande', 1), ('jag', 1), ('som', 1), ('ni', 1), ('till', 1), ('jag', 1), ('parlamentet', 1), ('fru', 1), ('ni', 1), ('en', 1)]


### A.2.3 Verify that the line counts still match after the pre-processing.

In [79]:
# The pre-processing uses map(), so each RDD should still hold one element per
# original line; check that the two languages still agree.
en_token_count = en_tokens.count()
sv_token_count = sv_tokens.count()
assert en_token_count == sv_token_count, "Not the same length!"

## Question A.3

### A.3.1 Use Spark to compute the 10 most frequently occurring words in the English language corpus. Repeat for the other language.