# BDA — Practice Lab 01 Notebook
> Author : Badr TAJINI - Big Data Analytics - ESIEE 2025-2026

Evidence‑driven PySpark exercises: counts, “perfect x”, and PMI.

## 0. Setup

In [27]:
import sys
import platform
from pyspark.sql import SparkSession
import pyspark

# Create (or reuse) the lab's Spark session, pinning the SQL session time zone to UTC.
session_builder = SparkSession.builder.appName("BDA-PracticeLab01-2")
session_builder = session_builder.config("spark.sql.session.timeZone", "UTC")
spark = session_builder.getOrCreate()

# Only warnings and errors from Spark should reach the notebook output.
spark.sparkContext.setLogLevel("WARN")

# Record the runtime versions as evidence, one "label: value" line each.
version_report = [
    ("Spark version", spark.version),
    ("PySpark version", pyspark.__version__),
    ("Python version", sys.version.split()[0]),
    ("Session timezone", spark.conf.get("spark.sql.session.timeZone")),
]
for label, reported in version_report:
    print(f"{label}: {reported}")


Spark version: 4.0.1
PySpark version: 4.0.1
Python version: 3.10.19
Session timezone: UTC


In [28]:
# Print the address at which this session's Spark web UI is served.
print(spark.sparkContext.uiWebUrl)

http://10.255.255.254:4042


## 1. Load data

In [14]:
from pathlib import Path
import urllib.request

# Working directories live under the directory the notebook is run from.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
OUTPUTS_DIR = BASE_DIR / "outputs"
PROOF_DIR = BASE_DIR / "proof"
for directory in (DATA_DIR, OUTPUTS_DIR, PROOF_DIR):
    directory.mkdir(parents=True, exist_ok=True)

SHAKESPEARE_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
TEXT_PATH = DATA_DIR / "tiny_shakespeare.txt"
if not TEXT_PATH.exists():
    # Download to a temporary sibling and rename only on success: an interrupted
    # download must never leave a truncated file that later runs would accept
    # as the complete corpus because of the exists() guard above.
    partial_path = TEXT_PATH.with_name(TEXT_PATH.name + ".part")
    try:
        urllib.request.urlretrieve(SHAKESPEARE_URL, partial_path)
        partial_path.replace(TEXT_PATH)
    finally:
        partial_path.unlink(missing_ok=True)

# The same file is exposed twice: as an RDD of lines and as a one-column
# DataFrame whose column is renamed from "value" to "line".
raw_rdd = spark.sparkContext.textFile(str(TEXT_PATH)).cache()
lines_df = spark.read.text(str(TEXT_PATH)).withColumnRenamed("value", "line").cache()

# Materialize caches
raw_rdd.count()
lines_df.count()

# Report the path relative to BASE_DIR so the saved output does not expose the
# author's absolute home directory.
print(f"Data loaded from: {TEXT_PATH.relative_to(BASE_DIR)}")
lines_df.show(5, truncate=False)


Data loaded from: /mnt/c/Users/rerel/OneDrive/Bureau/Esiee/Esiee/E5/BDA/data/tiny_shakespeare.txt
+---------------------------------------------+
|line                                         |
+---------------------------------------------+
|First Citizen:                               |
|Before we proceed any further, hear me speak.|
|                                             |
|All:                                         |
|Speak, speak.                                |
+---------------------------------------------+
only showing top 5 rows


In [4]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting numpy>=1.22.4 (from pandas)
  Downloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m24.0 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hDownloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m24.7 MB/s[0m  [33m0:00:00[0mm0:00:01[0m00:01[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Installing collected packages: pytz, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

## 2. WordCount (RDD)

In [15]:
import re
from operator import add
import pandas as pd

# A token is a maximal run of ASCII lowercase letters (lines are lower-cased first).
token_pattern = re.compile(r"[a-z]+")

# line -> tokens -> (token, 1) -> summed per token
word_counts_rdd = (
    raw_rdd
    .flatMap(lambda line: token_pattern.findall(line.lower()))
    .map(lambda token: (token, 1))
    .reduceByKey(add)
)

# Highest count first; ties are broken alphabetically by token.
top10_rdd = word_counts_rdd.takeOrdered(10, key=lambda pair: (-pair[1], pair[0]))

top10_frame = pd.DataFrame(top10_rdd, columns=["token", "count"])
top10_frame.to_csv(OUTPUTS_DIR / "top10_rdd.csv", index=False)

print("Top 10 tokens (RDD):")
for word, occurrences in top10_rdd:
    print(f"{word}: {occurrences}")


Top 10 tokens (RDD):
the: 6287
and: 5690
i: 5111
to: 4934
of: 3760
you: 3211
my: 3120
a: 3018
that: 2664
in: 2403


## 3. WordCount (DataFrames)

In [29]:
from pyspark.sql import functions as F
from contextlib import redirect_stdout
from io import StringIO

# Normalise each line: lower-case it, collapse every run of non-letters into a
# single space, then split on whitespace. Separators at the start or end of a
# line leave empty strings in the array; they are filtered out below.
clean_tokens_df = (
    lines_df
    .select(
        F.split(
            F.regexp_replace(F.lower("line"), "[^a-z]+", " "),
            r"\s+",  # raw string: "\s" is an invalid escape in a plain literal
        ).alias("tokens")
    )
)

# One row per token, counted, ordered by count (desc) then token (asc),
# keeping the 100 most frequent tokens.
wordcount_df = (
    clean_tokens_df
    .select(F.explode(F.expr("filter(tokens, x -> x <> '')")).alias("token"))
    .groupBy("token")
    .count()
    .orderBy(F.desc("count"), F.asc("token"))
    .limit(100)
)

wordcount_df.show(truncate=False)

# The export is named top10_df.csv and is the DataFrame counterpart of
# top10_rdd.csv, so write exactly the ten leading rows rather than all 100.
wordcount_df.toPandas().head(10).to_csv(OUTPUTS_DIR / "top10_df.csv", index=False)

# explain() prints to stdout; capture it so the plan can be stored as evidence.
plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    wordcount_df.explain("formatted")
(PROOF_DIR / "plan_df.txt").write_text(plan_buffer.getvalue())


                                                                                

+-----+-----+
|token|count|
+-----+-----+
|the  |6287 |
|and  |5690 |
|i    |5111 |
|to   |4934 |
|of   |3760 |
|you  |3211 |
|my   |3120 |
|a    |3018 |
|that |2664 |
|in   |2403 |
|is   |2118 |
|not  |2015 |
|for  |1926 |
|s    |1859 |
|with |1813 |
|it   |1773 |
|me   |1769 |
|be   |1710 |
|your |1686 |
|he   |1606 |
+-----+-----+
only showing top 20 rows


3930

## 4. 'perfect x' follower counts

In [18]:
import re
from contextlib import redirect_stdout
from io import StringIO

# Tokens are maximal runs of ASCII lowercase letters.
pattern_perfect = re.compile(r"[a-z]+")


def followers(tokens):
    """Return every non-empty token that immediately follows "perfect".

    Args:
        tokens: sequence of tokens from a single line, in reading order.

    Returns:
        A list of the followers in the order they occur; empty when
        "perfect" is absent or is only the last token.
    """
    return [
        nxt
        for current, nxt in zip(tokens, tokens[1:])
        if current == "perfect" and nxt
    ]

# Tokenise each line and keep only the tokens that directly follow "perfect"
# within that line. The regex never yields empty strings, so no extra filter
# is needed here.
followers_rdd = (
    lines_df.rdd
    .map(lambda row: pattern_perfect.findall(row.line.lower()))
    .flatMap(followers)
)

# An explicit schema avoids inferring it from the data, which raises when the
# RDD is empty (e.g. a corpus in which "perfect" never occurs).
followers_df = spark.createDataFrame(
    followers_rdd.map(lambda token: (token,)),
    schema="follower string",
)

# Keep followers seen more than once, most frequent first (ties alphabetical).
perfect_counts_df = (
    followers_df
    .groupBy("follower")
    .count()
    .filter(F.col("count") > 1)
    .orderBy(F.desc("count"), F.asc("follower"))
)

perfect_counts_df.show(truncate=False)

perfect_counts_df.toPandas().to_csv(OUTPUTS_DIR / "perfect_followers.csv", index=False)

# explain() prints to stdout; capture it so the plan can be stored as evidence.
plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    perfect_counts_df.explain("formatted")
(PROOF_DIR / "plan_perfect.txt").write_text(plan_buffer.getvalue())


+--------+-----+
|follower|count|
+--------+-----+
|love    |3    |
+--------+-----+



3026

## 5. PMI — pairs (RDD)

In [19]:
import math
from itertools import combinations
from io import StringIO
from contextlib import redirect_stdout

# Only the first MAX_TOKENS tokens of each line are kept when the per-line
# token lists are built.
MAX_TOKENS = 40
# Despite its name this is a minimum co-occurrence COUNT: pairs whose count is
# below it are dropped, and no PMI value is computed for them.
PMI_THRESHOLD = 5

def dedupe_preserve(tokens):
    """Return the distinct items of ``tokens`` as a list, in first-seen order.

    Args:
        tokens: iterable of hashable items.

    Returns:
        A new list in which each item appears once, at the position of its
        first occurrence.
    """
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(tokens))

# Tokens are maximal runs of ASCII lowercase letters.
pmi_token_pattern = re.compile(r"[a-z]+")


def _line_vocabulary(row):
    """Distinct tokens among the first MAX_TOKENS tokens of one line, first-seen order."""
    leading = pmi_token_pattern.findall(row.line.lower())[:MAX_TOKENS]
    return dedupe_preserve([tok for tok in leading if tok])


# One de-duplicated token list per line; lines with fewer than two distinct
# tokens cannot form a pair and are dropped.
tokens_per_line = (
    lines_df.rdd
    .map(_line_vocabulary)
    .filter(lambda vocab: len(vocab) > 1)
    .cache()
)

# Number of retained lines; used as N in the PMI formula.
num_docs = tokens_per_line.count()

from operator import add

# Number of retained lines containing each token (tokens are unique per line).
marginal_counts = (
    tokens_per_line
    .flatMap(lambda tokens: [(token, 1) for token in tokens])
    .reduceByKey(add)
)

# Collect the marginals on the driver and broadcast them so PMI can be computed
# without a join.
marginal_dict = {token: total for token, total in marginal_counts.collect()}
marginal_bc = spark.sparkContext.broadcast(marginal_dict)


def _unordered_pairs(tokens):
    """Emit ((smaller, larger), 1) for every 2-combination of a line's tokens."""
    return [(tuple(sorted(pair)), 1) for pair in combinations(tokens, 2)]


# Co-occurrence counts per unordered pair, keeping pairs seen on at least
# PMI_THRESHOLD lines.
pair_counts = (
    tokens_per_line
    .flatMap(_unordered_pairs)
    .reduceByKey(add)
    .filter(lambda entry: entry[1] >= PMI_THRESHOLD)
)

def compute_pair_pmi(kv):
    """Turn one ((x, y), co_count) record into an (x, y, pmi, count) row.

    PMI is log10(co_count * num_docs / (count_x * count_y)), with the marginal
    counts read from the broadcast dictionary.

    Returns:
        The row tuple, or None when either token has no (non-zero) marginal
        count.
    """
    pair, co_count = kv
    x, y = pair
    marginals = marginal_bc.value
    count_x, count_y = marginals.get(x), marginals.get(y)
    if count_x and count_y:
        pmi = math.log10((co_count * num_docs) / (count_x * count_y))
        return (x, y, float(pmi), int(co_count))
    return None

# Rows for which a marginal count was missing come back as None; drop them.
pmi_pairs_rdd = pair_counts.map(compute_pair_pmi).filter(lambda row: row is not None)

# An explicit schema avoids inferring it from the data, which raises when no
# pair reaches the threshold (empty RDD) and otherwise costs an extra job.
pairs_df = spark.createDataFrame(
    pmi_pairs_rdd,
    schema="x string, y string, pmi double, `count` bigint",
).orderBy(F.desc("pmi"))

pairs_df.show(10, truncate=False)

pairs_df.toPandas().to_csv(OUTPUTS_DIR / "pmi_pairs_sample.csv", index=False)

# explain() prints to stdout; capture it so the plan can be stored as evidence.
plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    pairs_df.explain("formatted")
(PROOF_DIR / "plan_pmi_pairs.txt").write_text(plan_buffer.getvalue())


                                                                                

+--------+---------+------------------+-----+
|x       |y        |pmi               |count|
+--------+---------+------------------+-----+
|scroop  |stephen  |3.4748101890460235|7    |
|lartius |titus    |3.3779001760379668|7    |
|n       |stol     |3.356710876968029 |5    |
|shearing|sheep    |3.2598008639599723|5    |
|bite    |thumb    |3.1384964120067664|6    |
|bishop  |carlisle |3.0045283588566667|7    |
|earl    |wiltshire|2.9887340916734346|7    |
|en      |ta       |2.944530429181381 |24   |
|drums   |trumpets |2.8795896222483663|5    |
|bushy   |green    |2.870044304342136 |5    |
+--------+---------+------------------+-----+
only showing top 10 rows


1518

## 6. PMI — stripes (RDD)

In [20]:
from collections import Counter
from contextlib import redirect_stdout
from io import StringIO


def stripe_builder(tokens):
    """Yield one (x, Counter) stripe per token of a line.

    The Counter of a stripe counts every other token ``y != x`` of the same
    line. Tokens whose stripe would be empty are skipped.
    """
    for x in tokens:
        stripe = Counter(y for y in tokens if y != x)
        if stripe:
            yield (x, stripe)

def merge_counters(c1, c2):
    """Add the counts of ``c2`` into ``c1`` in place and return ``c1``.

    ``c2`` is left untouched.
    """
    for key, amount in c2.items():
        c1[key] += amount
    return c1

def stripe_to_rows(item):
    """Expand one (x, Counter) stripe into a list of (x, y, pmi, count) rows.

    Only neighbours whose co-occurrence count reaches PMI_THRESHOLD and that
    have a (non-zero) marginal count produce a row. An empty list is returned
    when ``x`` itself has no marginal count.
    """
    x, counter = item
    marginals = marginal_bc.value
    count_x = marginals.get(x)
    if not count_x:
        return []

    rows = []
    for y, co_count in counter.items():
        if co_count < PMI_THRESHOLD:
            continue
        count_y = marginals.get(y)
        if not count_y:
            continue
        pmi = math.log10((co_count * num_docs) / (count_x * count_y))
        rows.append((x, y, float(pmi), int(co_count)))
    return rows

# Build one stripe per (line, token) and merge the stripes of each token.
stripes_counts = (
    tokens_per_line
    .flatMap(stripe_builder)
    .reduceByKey(merge_counters)
)

pmi_stripes_rdd = stripes_counts.flatMap(stripe_to_rows)

# An explicit schema avoids inferring it from the data, which raises when no
# neighbour reaches the threshold (empty RDD) and otherwise costs an extra job.
stripes_df = spark.createDataFrame(
    pmi_stripes_rdd,
    schema="x string, y string, pmi double, `count` bigint",
).orderBy(F.desc("pmi"))

stripes_df.show(10, truncate=False)

stripes_df.toPandas().to_csv(OUTPUTS_DIR / "pmi_stripes_sample.csv", index=False)

# explain() prints to stdout; capture it so the plan can be stored as evidence.
plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    stripes_df.explain("formatted")
(PROOF_DIR / "plan_pmi_stripes.txt").write_text(plan_buffer.getvalue())


                                                                                

+--------+--------+------------------+-----+
|x       |y       |pmi               |count|
+--------+--------+------------------+-----+
|stephen |scroop  |3.4748101890460235|7    |
|scroop  |stephen |3.4748101890460235|7    |
|titus   |lartius |3.3779001760379668|7    |
|lartius |titus   |3.3779001760379668|7    |
|n       |stol    |3.356710876968029 |5    |
|stol    |n       |3.356710876968029 |5    |
|sheep   |shearing|3.2598008639599723|5    |
|shearing|sheep   |3.2598008639599723|5    |
|thumb   |bite    |3.1384964120067664|6    |
|bite    |thumb   |3.1384964120067664|6    |
+--------+--------+------------------+-----+
only showing top 10 rows


1515

## 7. Spark UI evidence
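Open the Spark UI at the URL printed by `spark.sparkContext.uiWebUrl` (http://localhost:4040 by default; this session was served on port 4042) and capture Files Read, Input Size, and Shuffle Read/Write.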

## 8. Environment and configs

In [21]:
import json
import subprocess

def get_java_version():
    """Return the first line printed by ``java -version``.

    stderr is merged into the captured output before it is decoded as UTF-8.
    Any failure (missing executable, non-zero exit, empty output, ...) is
    reported as the string ``"Unavailable (<error>)"`` instead of raising.
    """
    command = ["java", "-version"]
    try:
        raw = subprocess.check_output(command, stderr=subprocess.STDOUT)
        banner = raw.decode("utf-8").strip()
        return banner.splitlines()[0]
    except Exception as exc:
        return f"Unavailable ({exc})"

java_output = get_java_version()
print(f"Java: {java_output}")

# Every Spark configuration entry, sorted by key.
conf_items = sorted(spark.sparkContext.getConf().getAll())

print("Spark configuration (selected):")
for conf_key, conf_value in conf_items:
    print(f" - {conf_key} = {conf_value}")

python_full = sys.version
os_description = platform.platform()
spark_conf = dict(pair for pair in conf_items if pair[0].startswith("spark."))

env_summary = {
    "python": python_full,
    "spark": spark.version,
    "pyspark": pyspark.__version__,
    "java": java_output,
    "os": os_description,
    "spark_conf": spark_conf,
}

# Markdown report: a bullet list of versions followed by the Spark settings.
env_header = [
    "# Environment Summary",
    "",
    f"- Python: {python_full.split()[0]}",
    f"- Spark: {spark.version}",
    f"- PySpark: {pyspark.__version__}",
    f"- Java: {java_output}",
    f"- OS: {os_description}",
    "",
    "## Spark Configuration",
]
env_lines = env_header + [f"- {k} = {v}" for k, v in env_summary["spark_conf"].items()]

# Each report line is newline-terminated, including the last one.
ENV_PATH = Path("ENV.md")
ENV_PATH.write_text("".join(f"{line}\n" for line in env_lines))

print(f"Environment details saved to {ENV_PATH.resolve()}")


Java: openjdk version "21.0.6" 2025-01-21
Spark configuration (selected):
 - spark.app.id = local-1761204299112
 - spark.app.name = BDA-PracticeLab01
 - spark.app.startTime = 1761204297795
 - spark.app.submitTime = 1761204297532
 - spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-modules=jdk.incubator.vector --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED 

In [22]:
# Display the tables known to the session catalog (the cell's value is the output).
spark.catalog.listTables()

[]

In [24]:
# Print the Spark web UI address again, for the evidence captured at the end of the run.
print(spark.sparkContext.uiWebUrl)

http://10.255.255.254:4042
