In [1]:
import sys
import platform
from pyspark.sql import SparkSession
import pyspark

spark = (
    SparkSession.builder
    .appName("BDA-PracticeLab01-2")
    .config("spark.sql.session.timeZone", "UTC")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")

print(f"Spark version: {spark.version}")
print(f"PySpark version: {pyspark.__version__}")
print(f"Python version: {sys.version.split()[0]}")
print(f"Session timezone: {spark.conf.get('spark.sql.session.timeZone')}")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/23 10:29:58 WARN Utils: Your hostname, Remi, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/10/23 10:29:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/23 10:29:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/23 10:29:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/10/23 10:29:58 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/10/23 10:29:58 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/10/23 10:29:58 WARN Utils: Service '

Spark version: 4.0.1
PySpark version: 4.0.1
Python version: 3.10.19
Session timezone: UTC


In [2]:
print(spark.sparkContext.uiWebUrl)

http://10.255.255.254:4044


In [3]:
from pathlib import Path
import urllib.request

BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
OUTPUTS_DIR = BASE_DIR / "outputs"
PROOF_DIR = BASE_DIR / "proof"
for directory in (DATA_DIR, OUTPUTS_DIR, PROOF_DIR):
    directory.mkdir(exist_ok=True)

SHAKESPEARE_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
TEXT_PATH = DATA_DIR / "tiny_shakespeare.txt"
if not TEXT_PATH.exists():
    urllib.request.urlretrieve(SHAKESPEARE_URL, TEXT_PATH)

raw_rdd = spark.sparkContext.textFile(str(TEXT_PATH)).cache()
lines_df = spark.read.text(str(TEXT_PATH)).withColumnRenamed("value", "line").cache()

# Materialize caches
raw_rdd.count()
lines_df.count()

print(f"Data loaded from: {TEXT_PATH}")
lines_df.show(5, truncate=False)


                                                                                

Data loaded from: /mnt/c/Users/rerel/OneDrive/Bureau/Esiee/Esiee/E5/BDA/Lab_1/data/tiny_shakespeare.txt
+---------------------------------------------+
|line                                         |
+---------------------------------------------+
|First Citizen:                               |
|Before we proceed any further, hear me speak.|
|                                             |
|All:                                         |
|Speak, speak.                                |
+---------------------------------------------+
only showing top 5 rows


In [5]:
import re
from operator import add
import pandas as pd

token_pattern = re.compile(r"[a-z]+")

word_counts_rdd = (
    raw_rdd
    .map(lambda line: token_pattern.findall(line.lower()))
    .flatMap(lambda tokens: tokens)
    .map(lambda token: (token, 1))
    .reduceByKey(add)
)
print(len(word_counts_rdd))
top10_rdd = word_counts_rdd.takeOrdered(10, key=lambda kv: (-kv[1], kv[0]))

pd.DataFrame(top10_rdd, columns=["token", "count"]).to_csv(OUTPUTS_DIR / "top10_rdd.csv", index=False)

print("Top 10 tokens (RDD):")
for token, count in top10_rdd:
    print(f"{token}: {count}")


TypeError: object of type 'PipelinedRDD' has no len()