In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, lower

# Build (or reuse) the Spark session for this notebook against the standalone
# master. The chain is wrapped in parentheses so no line continuations are needed.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.250:7077")
    .appName("Part_A_Simon_Pislar_A3")
    # Dynamic allocation with shuffle tracking; external shuffle service disabled.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    # Two cores per executor.
    .config("spark.executor.cores", 2)
    # Fixed ports for the driver and the block manager
    # (presumably so they can be opened on the cluster network -- confirm).
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

def to_lower_split(rdd, split_by_char):
    """
    Lowercase every line of a text RDD and split it into tokens.

    Args:
        rdd: RDD whose elements are strings (one element per line of text).
        split_by_char: Separator handed to ``str.split`` for each line.

    Returns:
        A new RDD where each element is the list of lowercase tokens of the
        corresponding input line. Empty tokens (e.g. from repeated
        separators) are kept as produced by ``str.split``.
    """
    def _tokenize(text):
        # Normalise case first, then cut on the requested separator.
        lowered = text.lower()
        return lowered.split(split_by_char)

    return rdd.map(_tokenize)

# RDD API: take the SparkContext from the session; the cells below use it to create RDDs.
spark_context = spark_session.sparkContext

# Set the Spark log level to ERROR.
spark_context.setLogLevel("ERROR")

In [None]:
# A.1.1 Read the English transcripts with Spark, and count the number of lines
# A.2.1 Pre-process the text from both RDDs
# English-specific names are used for the derived RDD and the line count so
# that they stay available no matter which other cells have been run.
path_to_english_transcripts = "hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.en"
english_transcripts_rdd = spark_context.textFile(path_to_english_transcripts)

# Lowercase each line and split it on single spaces.
english_pre_processed_rdd = to_lower_split(english_transcripts_rdd, ' ')
print(f"Pre-process check: {english_pre_processed_rdd.take(10)}")

# One element per input line, so counting the tokenized RDD counts the lines.
num_english_lines = english_pre_processed_rdd.count()
print(f"Number of lines in the sample: {num_english_lines}")

In [None]:
# A.1.2 Do the same with the other language (so that you have a separate lineage of RDDs for
# each).
# A.2.1 Pre-process the text from both RDDs
# Swedish-specific names are used for the derived RDD and the line count so
# that this cell does not rebind names used for another language.
path_to_swedish_transcripts = "hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.sv"
swedish_transcripts_rdd = spark_context.textFile(path_to_swedish_transcripts)

# Lowercase each line and split it on single spaces.
swedish_pre_processed_rdd = to_lower_split(swedish_transcripts_rdd, ' ')
print(f"Pre-process check: {swedish_pre_processed_rdd.take(10)}")

# One element per input line, so counting the tokenized RDD counts the lines.
num_swedish_lines = swedish_pre_processed_rdd.count()
print(f"Number of lines in the sample: {num_swedish_lines}")

In [None]:
# A.1.4 Count the number of partitions.
# The path and RDD are (re)defined here so this cell only needs the setup cell;
# textFile is lazy, so this does not read the file.
path_to_english_transcripts = "hdfs://192.168.2.250:9000/europarl/europarl-v7.sv-en.en"
english_transcripts_rdd = spark_context.textFile(path_to_english_transcripts)
num_partitions = english_transcripts_rdd.getNumPartitions()
# The label previously read "Number of lines in the sample", which mislabeled
# the partition count.
print(f"Number of partitions: {num_partitions}")

In [None]:
# Stop the Spark context (run this last: the cells above all use spark_context).
spark_context.stop()