In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, lower

# Build (or reuse, via getOrCreate) the Spark session used by every cell below.
# The fluent chain is wrapped in parentheses so that no backslash continuations are needed.
spark_session = (
    SparkSession.builder
    # Standalone cluster master and the application name shown for this notebook.
    .master("spark://192.168.2.250:7077")
    .appName("Part_A_Simon_Pislar_A3")
    # Dynamic allocation is switched on together with shuffle tracking, while the
    # external shuffle service is explicitly switched off; idle executors time out after 30s.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    # Two cores per executor.
    .config("spark.executor.cores", 2)
    # Fixed driver and block-manager ports (presumably so the cluster can reach the
    # driver through a firewall -- confirm against the cluster setup).
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

def to_lower_split(rdd, split_by_char):
    """
    Lowercase every line of ``rdd`` and tokenize it on ``split_by_char``.

    rdd: an RDD (or any object with a compatible ``map`` method) of text lines.
    split_by_char: the separator handed to ``str.split``.

    Returns the result of ``rdd.map``: one list of lowercase tokens per input
    line. Because the separator is passed explicitly, consecutive separators
    produce empty-string tokens and an empty line becomes ``['']``.
    """
    def _tokenize(text):
        # Lowercase first, then split on the caller-supplied separator.
        lowered = text.lower()
        return lowered.split(split_by_char)

    return rdd.map(_tokenize)

# RDD API: take the SparkContext behind the session; the text files in the
# cells below are read through it (spark_context.textFile).
spark_context = spark_session.sparkContext

# Restrict Spark logging to ERROR level (presumably to keep cell output readable).
spark_context.setLogLevel("ERROR")

In [None]:
# A.1.1 Read the English transcripts with Spark, and count the number of lines
# A.2.1 Pre-process the text from both RDDs
# One RDD element per line of the English side of the sv-en corpus.
path_to_english_transcripts = "hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.en"
english_transcripts_rdd = spark_context.textFile(path_to_english_transcripts)
# Lowercase each line and split it on a single space -> one list of tokens per line.
pre_processed_english_text = to_lower_split(english_transcripts_rdd, ' ')
# Sanity check: show the first 10 tokenized lines.
print(f"Pre-process check: {pre_processed_english_text.take(10)}")
# The pre-processing is a one-to-one map, so counting the pre-processed RDD
# gives the number of lines in the file.
num_lines_sample = pre_processed_english_text.count()
print(f"Number of lines in the sample: {num_lines_sample}")

In [None]:
# A.1.2 Do the same with the other language (so that there is a separate lineage
# of RDDs for each language).
# A.2.1 Pre-process the text from both RDDs
# One RDD element per line of the Swedish side of the sv-en corpus.
path_to_swedish_transcripts = "hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.sv"
swedish_transcripts_rdd = spark_context.textFile(path_to_swedish_transcripts)
# Lowercase each line and split it on a single space -> one list of tokens per line.
pre_processed_swedish_text = to_lower_split(swedish_transcripts_rdd, ' ')
# Sanity check: show the first 10 tokenized lines.
print(f"Pre-process check: {pre_processed_swedish_text.take(10)}")
# NOTE: num_lines_sample is the same name the English cell uses, so after this
# cell runs it holds the Swedish line count.
num_lines_sample = pre_processed_swedish_text.count()
print(f"Number of lines in the sample: {num_lines_sample}")

In [None]:
# A.1.4 Count the number of partitions.
# NOTE: the path and the RDD are re-created here exactly as in the A.1.1 cell
# (same path, same textFile call), so this cell also runs on its own once
# spark_context exists.
path_to_english_transcripts = "hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.en"
english_transcripts_rdd = spark_context.textFile(path_to_english_transcripts)
# textFile is called without an explicit partition count, so this reports the
# partitioning Spark picked for the file.
num_partitions = english_transcripts_rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

In [None]:
# A.3.1 Use Spark to compute the 10 most frequently occurring words in the
# English language corpus. (The Swedish corpus is handled in its own cell.)

# Flatten: one record per token instead of one token list per line.
# NOTE: lines were split on a single space, so empty strings can appear as tokens.
flattened_pre_processed_english_text = pre_processed_english_text.flatMap(lambda tokens: tokens)

# Word count: emit (word, 1) for every token, then add up the ones per word.
english_word_tuples = flattened_pre_processed_english_text.map(lambda token: (token, 1))
english_word_occurence = english_word_tuples.reduceByKey(lambda total, increment: total + increment)

# Order the (word, count) records by count, largest first, and show the first 10.
english_word_occurence_sorted = english_word_occurence.sortBy(
    lambda entry: entry[1],
    ascending=False,
)
print(f"Most used english words: {english_word_occurence_sorted.take(10)}")

In [None]:
# A.3.1 (continued) Compute the 10 most frequently occurring words in the
# Swedish language corpus, using the same steps as for the English corpus.

# Flatten: one record per token instead of one token list per line.
# NOTE: lines were split on a single space, so empty strings can appear as tokens.
flattened_pre_processed_swedish_text = pre_processed_swedish_text.flatMap(lambda tokens: tokens)

# Word count: emit (word, 1) for every token, then add up the ones per word.
swedish_word_tuples = flattened_pre_processed_swedish_text.map(lambda token: (token, 1))
swedish_word_occurence = swedish_word_tuples.reduceByKey(lambda total, increment: total + increment)

# Order the (word, count) records by count, largest first, and show the first 10.
swedish_word_occurence_sorted = swedish_word_occurence.sortBy(
    lambda entry: entry[1],
    ascending=False,
)
print(f"Most used swedish words: {swedish_word_occurence_sorted.take(10)}")

In [None]:
# Word-translation pairs: align the Swedish and English corpora line by line and
# count how often a Swedish word and an English word sit at the same position in
# short sentences of equal length.
#
# pre_processed_swedish_text / pre_processed_english_text hold ONE LIST OF TOKENS
# per line (that is what to_lower_split returns), so sentences are measured with
# len() and zipped directly; calling str.split() on them would raise AttributeError.

# "Small" sentence = at most this many words on each side.
MAX_WORDS_PER_SENTENCE = 5

# 1. Key the lines by their line number.
#    zipWithIndex yields (tokens, line_number).
sv_zipped = pre_processed_swedish_text.zipWithIndex()
en_zipped = pre_processed_english_text.zipWithIndex()

# 2. Swap the key and value so that the line number becomes the key:
#    (line_number, tokens). The swap must happen exactly once; swapping twice
#    would key the join on the (unhashable) token list instead.
sv_swapped = sv_zipped.map(lambda x: (x[1], x[0]))
en_swapped = en_zipped.map(lambda x: (x[1], x[0]))

# 3. Join the RDDs on the line number -> (line_number, (sv_tokens, en_tokens)).
#    This relies on the two files being line-aligned translations of each other.
joined_RDD = sv_swapped.join(en_swapped)

# 4. Filter to exclude line pairs with missing sentences.
#    Splitting on ' ' turns an empty line into [''] and repeated spaces into ''
#    tokens, so empty tokens are dropped first; a sentence is "missing" when no
#    words remain. (An inner join never produces None, so a None check is not enough.)
filtered_missing = (
    joined_RDD
    .mapValues(lambda pair: ([w for w in pair[0] if w], [w for w in pair[1] if w]))
    .filter(lambda x: len(x[1][0]) > 0 and len(x[1][1]) > 0)
)

# 5. Filter to leave only pairs of sentences with a small number of words per sentence.
filtered_length = filtered_missing.filter(
    lambda x: len(x[1][0]) <= MAX_WORDS_PER_SENTENCE and len(x[1][1]) <= MAX_WORDS_PER_SENTENCE
)

# 6. Filter to leave only pairs of sentences with the same number of words.
filtered_same_length = filtered_length.filter(lambda x: len(x[1][0]) == len(x[1][1]))

# 7. For each sentence pair, pair up the words of the two sentences in order and
#    emit ((sv_word, en_word), 1) for every position.
word_pairs = filtered_same_length.flatMap(
    lambda x: [(pair, 1) for pair in zip(x[1][0], x[1][1])]
)

# 8. Use reduce to count the number of occurrences of the word-translation pairs.
word_pair_counts = word_pairs.reduceByKey(lambda x, y: x + y)

# 9. Print some of the most frequently occurring pairs of words
#    (negated count as the key gives descending order).
top_word_pairs = word_pair_counts.takeOrdered(10, key=lambda x: -x[1])
for pair in top_word_pairs:
    print(f"{pair[0][0]} - {pair[0][1]}: {pair[1]}")


In [None]:
# Stop the Spark context once the analysis above has finished.
spark_context.stop()