In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, lower

# Build (or reuse) the Spark session used by every cell below.
# Dynamic allocation is enabled with shuffle tracking and the external shuffle
# service switched off; idle executors are released after 30s, each executor
# gets 2 cores, and the driver / block manager listen on fixed ports.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.250:7077")
    .appName("Part_A_Simon_Pislar_A3")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

def to_lower_split(rdd, split_by_char):
    """
    Lowercase every line of ``rdd`` and tokenize it on ``split_by_char``.

    Args:
        rdd: Object exposing ``map`` whose elements are strings (one per line).
        split_by_char: Separator passed to ``str.split``. Consecutive
            separators therefore produce empty-string tokens, and an empty
            line becomes ``['']``.

    Returns:
        The result of ``rdd.map``: one list of lowercase tokens per input line.
    """
    def tokenize(line):
        # Lowercase first so tokens compare case-insensitively downstream.
        return line.lower().split(split_by_char)

    return rdd.map(tokenize)

# RDD API entry point: the SparkContext that backs spark_session.
spark_context = spark_session.sparkContext

# Restrict Spark's log output to ERROR level to keep cell output readable.
spark_session.sparkContext.setLogLevel("ERROR")

In [None]:
# A.1.1 Read the English transcripts with Spark, and count the number of lines
# A.2.1 Pre-process the text from both RDDs
path_to_english_transcripts = "hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.en"

# One RDD element per line of the file.
english_transcripts_rdd = spark_context.textFile(path_to_english_transcripts)

# Lowercase + split on a single space: each element becomes a list of tokens.
pre_processed_english_text = to_lower_split(english_transcripts_rdd, " ")

# Eyeball the first ten tokenized lines before counting.
english_preview = pre_processed_english_text.take(10)
print(f"Pre-process check: {english_preview}")

# map() keeps one element per line, so this is also the line count of the file.
num_lines_sample = pre_processed_english_text.count()
print(f"Number of lines in the sample: {num_lines_sample}")

In [None]:
# A.1.2 Do the same with the other language (so that you have a separate lineage of RDDs for
# each).
# A.2.1 Pre-process the text from both RDDs
path_to_swedish_transcripts = "hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.sv"

# One RDD element per line of the file.
swedish_transcripts_rdd = spark_context.textFile(path_to_swedish_transcripts)

# Lowercase + split on a single space: each element becomes a list of tokens.
pre_processed_swedish_text = to_lower_split(swedish_transcripts_rdd, " ")

# Eyeball the first ten tokenized lines before counting.
swedish_preview = pre_processed_swedish_text.take(10)
print(f"Pre-process check: {swedish_preview}")

# map() keeps one element per line, so this is also the line count of the file.
# NOTE: num_lines_sample is shared with the English cell and is overwritten here.
num_lines_sample = pre_processed_swedish_text.count()
print(f"Number of lines in the sample: {num_lines_sample}")

In [None]:
# A.1.4 Count the number of partitions.
# Reuse the English RDD built in the A.1.1 cell rather than re-declaring the
# same HDFS path and calling textFile() on it again: the duplicate only
# overwrote path_to_english_transcripts / english_transcripts_rdd with
# identical values.
num_partitions = english_transcripts_rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

In [None]:
# A.3.1 Use Spark to compute the 10 most frequently occurring words in the English language
# corpus. Repeat for the other language.

# Flatten the per-line token lists into one RDD of individual tokens.
flattened_pre_processed_english_text = pre_processed_english_text.flatMap(
    lambda tokens: tokens
)

# Word count: emit (token, 1) for every token, then sum the ones per token.
english_word_tuples = flattened_pre_processed_english_text.map(
    lambda token: (token, 1)
)
english_word_occurence = english_word_tuples.reduceByKey(
    lambda left, right: left + right
)

# Order by count, largest first, so take(10) returns the most frequent tokens.
english_word_occurence_sorted = english_word_occurence.sortBy(
    lambda pair: pair[1],
    ascending=False,
)
print(f"Most used english words: {english_word_occurence_sorted.take(10)}")

In [None]:
# A.3.1 (repeated for the other language) Use Spark to compute the 10 most
# frequently occurring words in the Swedish language corpus.

# Flatten the per-line token lists into one RDD of individual tokens.
flattened_pre_processed_swedish_text = pre_processed_swedish_text.flatMap(
    lambda tokens: tokens
)

# Word count: emit (token, 1) for every token, then sum the ones per token.
swedish_word_tuples = flattened_pre_processed_swedish_text.map(
    lambda token: (token, 1)
)
swedish_word_occurence = swedish_word_tuples.reduceByKey(
    lambda left, right: left + right
)

# Order by count, largest first, so take(10) returns the most frequent tokens.
swedish_word_occurence_sorted = swedish_word_occurence.sortBy(
    lambda pair: pair[1],
    ascending=False,
)
print(f"Most used swedish words: {swedish_word_occurence_sorted.take(10)}")

In [None]:
# Pair every Swedish line with the English line at the same position.
# zipWithIndex() yields (tokens, line_index) for each tokenized line.
zipped_english_text = pre_processed_english_text.zipWithIndex()
zipped_swedish_text = pre_processed_swedish_text.zipWithIndex()

# Swap to (line_index, tokens) so the line index becomes the join key.
swapped_key_value_english = zipped_english_text.map(lambda x: (x[1], x[0]))
swapped_key_value_swedish = zipped_swedish_text.map(lambda x: (x[1], x[0]))

# join() keeps only indices present on both sides; each record is
# (line_index, (swedish_tokens, english_tokens)).
joined_swedish_english = swapped_key_value_swedish.join(swapped_key_value_english)

# Drop pairs where either sentence is empty.
# The previous predicate, all(x[1]), tested the truthiness of the two token
# *lists*. An empty line tokenizes to [''] ("".split(' ')), which is a
# non-empty list and therefore truthy, so that check never rejected anything.
# Checking the tokens themselves requires at least one non-empty token on
# each side.
filter_structure_swedish_english = joined_swedish_english.filter(
    lambda x: any(x[1][0]) and any(x[1][1])
)

print(f"Sample: {joined_swedish_english.take(10)}")

In [None]:
# Keep only short sentence pairs: at most 5 tokens on each side.
# Records are (line_index, (swedish_tokens, english_tokens)) and the sentences
# are already lists of tokens, so measure them with len() directly; calling
# .split() on a list raises AttributeError once an action evaluates this RDD.
filter_small_words_swedish_english = filter_structure_swedish_english.filter(
    lambda x: len(x[1][0]) <= 5 and len(x[1][1]) <= 5
)

In [None]:
# Keep only pairs whose two sentences have the same number of tokens, so the
# tokens can later be paired position by position.
# Records are (line_index, (swedish_tokens, english_tokens)); the sentences are
# already token lists, so compare len() of the lists (lists have no .split()).
filter_same_length_english_swedish = filter_small_words_swedish_english.filter(
    lambda x: len(x[1][0]) == len(x[1][1])
)

In [None]:
# Pair the tokens of each sentence pair in order, emitting one
# (swedish_token, english_token) tuple per position.
# Records are (line_index, (swedish_tokens, english_tokens)); the sentences are
# already token lists, so zip them directly (lists have no .split()).
pair_of_words = filter_same_length_english_swedish.flatMap(
    lambda x: zip(x[1][0], x[1][1])
)

In [None]:
# Stop the SparkContext that backs spark_session.
spark_context.stop()