In [29]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, lower

spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.250:7077") \
        .appName("Part_A_Simon_Pislar_A3")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",2)\
        .config("spark.driver.port",9999)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

def to_lower_split(rdd, split_by_char):
    """
    Lowercase the text and tokenize by splitting on space.
    Returns a new RDD with preprocessed text.
    """
    return rdd.map(lambda line: line.lower().split(split_by_char))

# RDD  API
spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

In [30]:
# A.1.1 Read the English transcripts with Spark, and count the number of lines
# A.2.1 Pre-process the text from both RDDs
path_to_english_transcripts = "hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.en"
english_transcripts_rdd = spark_context.textFile(path_to_english_transcripts)
pre_processed_english_text = to_lower_split(english_transcripts_rdd, ' ')
print(f"Pre-process check: {pre_processed_english_text.take(10)}")
num_lines_sample = pre_processed_english_text.count()
print(f"Number of lines in the sample: {num_lines_sample}")

                                                                                

Pre-process check: [['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on'



Number of lines in the sample: 1862234


                                                                                

In [31]:
# A.1.2 Do the same with the other language (so that you have a separate lineage of RDDs for
# each).
# A.2.1 Pre-process the text from both RDDs
path_to_swedish_transcripts = "hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.sv"
swedish_transcripts_rdd = spark_context.textFile(path_to_swedish_transcripts)
pre_processed_swedish_text = to_lower_split(swedish_transcripts_rdd, ' ')
print(f"Pre-process check: {pre_processed_swedish_text.take(10)}")
num_lines_sample = pre_processed_swedish_text.count()
print(f"Number of lines in the sample: {num_lines_sample}")

Pre-process check: [['återupptagande', 'av', 'sessionen'], ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'], ['som', 'ni', 'kunnat', 'konstatera', 'ägde', '"den', 'stora', 'år', '2000-buggen"', 'aldrig', 'rum.', 'däremot', 'har', 'invånarna', 'i', 'ett', 'antal', 'av', 'våra', 'medlemsländer', 'drabbats', 'av', 'naturkatastrofer', 'som', 'verkligen', 'varit', 'förskräckliga.'], ['ni', 'har', 'begärt', 'en', 'debatt', 'i', 'ämnet', 'under', 'sammanträdesperiodens', 'kommande', 'dagar.'], ['till', 'dess', 'vill', 'jag', 'att', 'vi,', 'som', 'ett', 'antal', 'kolleger', 'begärt,', 'håller', 'en', 'tyst', 'minut', 'för', 'offren', 'för', 'bl.a.', 'stormarna', 'i', 'de', 'länder', 'i', 'europeiska', 'unionen', 'som', 'drabbats.'], ['jag', 'ber', 'er', 'resa', 'er', 'för', 'en', 



Number of lines in the sample: 1862234


                                                                                

In [32]:
# A.1.4 Count the number of partitions.
path_to_english_transcripts = "hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.en"
english_transcripts_rdd = spark_context.textFile(path_to_english_transcripts)
num_partitions = english_transcripts_rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

Number of partitions: 2


In [None]:
# A.3.1 Use Spark to compute the 10 most frequently according words in the English language
# corpus. Repeat for the other language.
flattened_pre_processed_english_text = pre_processed_english_text.flatMap(lambda x: x)
english_word_tuples = flattened_pre_processed_english_text.map(lambda word: (word, 1))
english_word_occurence = english_word_tuples.reduceByKey(lambda a, b: a + b)
english_word_occurence_sorted = english_word_occurence.sortBy(lambda word_count: word_count[1], ascending=False)
print(f"Most used english words: {english_word_occurence_sorted.take(10)}")

In [None]:
# A.3.1 Use Spark to compute the 10 most frequently according words in the English language
# corpus. Repeat for the other language.
flattened_pre_processed_swedish_text = pre_processed_swedish_text.flatMap(lambda x: x)
swedish_word_tuples = flattened_pre_processed_swedish_text.map(lambda word: (word, 1))
swedish_word_occurence = swedish_word_tuples.reduceByKey(lambda a, b: a + b)
swedish_word_occurence_sorted = swedish_word_occurence.sortBy(lambda word_count: word_count[1], ascending=False)
print(f"Most used swedish words: {swedish_word_occurence_sorted.take(10)}")

In [33]:
zipped_english_text = pre_processed_english_text.zipWithIndex()
zipped_swedish_text = pre_processed_swedish_text.zipWithIndex()
swapped_key_value_english = zipped_english_text.map(lambda x: (x[1], x[0]))
swapped_key_value_swedish = zipped_swedish_text.map(lambda x: (x[1], x[0]))
joined_swedish_english = swapped_key_value_swedish.join(swapped_key_value_english)

# TODO: Fix filtering function- Filters too much, nothing gets through!
filter_structure_swedish_english = joined_swedish_english.filter(lambda x: all(x[1]))

print(f"Sample: {joined_swedish_english.take(10)}")

[Stage 19:>                                                         (0 + 1) / 1]

Sample: [(865085, (['när', 'det', 'gäller', 'gilles', 'savarys', 'betänkande', 'gläder', 'det', 'mig', 'att', 'det', 'omfattar', 'arbetsmarknadens', 'parters', 'rättigheter', 'och', 'specifikationer', 'för', 'driftskompatibilitet.'], ['regarding', 'the', 'report', 'by', 'mr', 'savary,', 'i', 'am', 'pleased', 'that', 'it', 'includes', 'the', 'rights', 'of', 'the', 'social', 'partners', 'and', 'specifications', 'for', 'interoperability.'])), (865625, (['jag', 'menar', 'att', 'det', 'för', 'det', 'första', 'behöver', 'skapas', 'förutsättningar', 'för', 'en', 'stadig', 'utveckling', 'för', 'de', 'konkurrenskraftigaste', 'sockerproducenterna', 'i', 'gemenskapen', 'för', 'att', 'eu:s', 'produktion', 'ska', 'kunna', 'konkurrera', 'på', 'en', 'allt', 'öppnare', 'världsmarknad.'], ['what', 'i', 'think', 'is', 'needed', 'is,', 'firstly,', 'to', 'create', 'conditions', 'for', 'stable', 'development', 'for', 'the', 'most', 'competitive', 'sugar', 'producers', 'in', 'the', 'community,', 'to', 'enab

                                                                                

In [34]:
# Step 5: Filter to leave only pairs of sentences with a small number of words per sentence.
filter_small_words_swedish_english = filter_structure_swedish_english.filter(
    lambda x: len(x[1][0]) <= 5 and len(x[1][1]) <= 5
)

print(f"Sample: {filter_small_words_swedish_english.take(10)}")
# Step 6: Since the data is already in the correct form (lists of words), we don't need to do anything here.

[Stage 21:>                                                         (0 + 1) / 1]

Sample: [(46070, (['det', 'gäller', 'framtiden.'], ['that', 'is', 'for', 'the', 'future.'])), (127055, (['jag', 'förklarar', 'debatten', 'avslutad.'], ['the', 'debate', 'is', 'closed.'])), (151865, (['ingen!'], ['no', 'one', 'could', 'say!'])), (199480, (['tack', 'så', 'mycket,', 'fru', 'kommissionär!'], ['thank', 'you,', 'commissioner.'])), (200945, (['.'], [''])), (253975, (['det', 'är', 'år', '1995.'], ['this', 'happened', 'in', '1995.'])), (281930, (['detta', 'får', 'inte', 'fortsätta.'], ['that', 'must', 'not', 'continue.'])), (304735, (['före', 'omröstningen:'], ['before', 'the', 'vote:'])), (320680, (['det', 'var', 'en', 'sidoanmärkning.'], ['that', 'was', 'an', 'aside.'])), (352995, (['så', 'ser', 'verkligheten', 'ut.'], ['that', 'is', 'the', 'reality.']))]


                                                                                

In [35]:
# Step 7: For each sentence pair, pair each word with its translation.
word_pairs = filter_small_words_swedish_english.flatMap(
    lambda x: zip(x[1][0], x[1][1])
)
print(f"Sample: {word_pairs.take(10)}")

[Stage 23:>                                                         (0 + 1) / 1]

Sample: [('.', ''), ('det', 'that'), ('är', 'is'), ('idrott.', 'sport.'), ('\xa0\xa0', '\xa0\xa0'), ('–', '\xa0the'), ('debatten', 'debate'), ('är', 'is'), ('avslutad.', 'closed.'), ('rådets', 'president-in-office')]


                                                                                

In [37]:
# Step 8: Use reduce to count the number of occurrences of each word-translation pair.
from operator import add

word_pair_counts = word_pairs.map(lambda x: (x, 1)).reduceByKey(add)
print(f"Sample: {word_pair_counts.take(10)}")



Sample: [(('det', 'that'), 2168), (('är', 'is'), 6057), (('råder', 'do'), 1), (('(parlamentet', '(parliament'), 1159), (('hänt?', 'since?'), 1), (('detta', 'all'), 9), (('emot', 'against'), 38), (('rekommendationen)', 'the'), 1), (('tack,', 'thank'), 1024), (('så', 'you,'), 516)]


                                                                                

In [38]:
# Step 9: Print some of the most frequently occurring pairs of words.
most_frequent_pairs = word_pair_counts.takeOrdered(10, key=lambda x: -x[1])
print(f"Most Frequent Pairs: {most_frequent_pairs}")

Most Frequent Pairs: [(('är', 'is'), 6057), (('.', '.'), 3976), (('\xa0\xa0', '\xa0\xa0'), 3792), (('avslutad.', 'closed.'), 2951), (('(applåder)', '(applause)'), 2546), (('vi', 'we'), 2227), (('.', ''), 2223), (('debatten', 'the'), 2194), (('är', 'debate'), 2172), (('det', 'that'), 2168)]


In [39]:
# Stop the spark context
spark_context.stop()