# BDA — Practice Lab 03 Notebook
> Author : Badr TAJINI - Big Data Analytics - ESIEE 2025-2026

Personalized PageRank + SMS spam classification in PySpark.

## 0. Bootstrap

In [1]:
import sys
import platform
from pyspark.sql import SparkSession
import pyspark

# Build (or reuse) the notebook's SparkSession: UTC session timezone, 4 shuffle partitions.
session_builder = SparkSession.builder.appName("BDA-PracticeLab03")
session_builder = session_builder.config("spark.sql.session.timeZone", "UTC")
session_builder = session_builder.config("spark.sql.shuffle.partitions", "4")
spark = session_builder.getOrCreate()

# Set the SparkContext log level to WARN.
spark.sparkContext.setLogLevel("WARN")

# Echo the runtime versions and read back the two settings applied above.
python_version = sys.version.split()[0]
session_timezone = spark.conf.get("spark.sql.session.timeZone")
shuffle_partitions = spark.conf.get("spark.sql.shuffle.partitions")

print(f"Spark version: {spark.version}")
print(f"PySpark version: {pyspark.__version__}")
print(f"Python version: {python_version}")
print(f"Session timezone: {session_timezone}")
print(f"Shuffle partitions: {shuffle_partitions}")




Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/09 01:23:28 WARN Utils: Your hostname, btj-XPS-13-9380, resolves to a loopback address: 127.0.1.1; using 172.20.25.150 instead (on interface wlp2s0)
25/10/09 01:23:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/10/09 01:23:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark version: 4.0.1
PySpark version: 4.0.1
Python version: 3.13.5


Session timezone: UTC
Shuffle partitions: 4


## 1. Data acquisition

In [2]:
from pathlib import Path
import urllib.request
import zipfile

# All project folders hang off the current working directory.
BASE_DIR = Path.cwd()
DATA_DIR, OUTPUTS_DIR, PROOF_DIR = (BASE_DIR / name for name in ("data", "outputs", "proof"))

# Create whichever folders are missing; existing ones are left as they are.
for directory in (DATA_DIR, OUTPUTS_DIR, PROOF_DIR):
    directory.mkdir(exist_ok=True)

# Edge-list file for Part A. If it is absent, a small hand-written directed graph
# (one "u v" pair per line) is generated in its place.
karate_path = DATA_DIR / "karate_edges.txt"
if karate_path.exists():
    print(f"Found existing graph at {karate_path}")
else:
    synthetic_edges = [
        ("1", "2"), ("1", "3"), ("1", "4"),
        ("2", "3"), ("2", "5"), ("2", "6"),
        ("3", "4"), ("3", "6"), ("3", "7"),
        ("4", "5"), ("4", "7"), ("4", "8"),
        ("5", "6"), ("5", "8"),
        ("6", "7"), ("6", "9"),
        ("7", "8"), ("7", "10"),
        ("8", "1"), ("8", "9"),
        ("9", "10"),
        ("10", "1"),
        ("5", "1"),
        ("9", "2"),
    ]
    edge_text = "\n".join(" ".join(edge) for edge in synthetic_edges)
    karate_path.write_text(edge_text)
    print(f"Generated synthetic graph with {len(synthetic_edges)} directed edges at {karate_path}")

# Fetch the SMS Spam Collection archive and extract its single data file to sms.tsv.
# Both the download and the extraction are written to a ".part" file first and only
# renamed onto the final name once complete, so an interrupted run never leaves a
# truncated file that later runs would mistake for a valid one.
sms_path = DATA_DIR / "sms.tsv"
if not sms_path.exists():
    sms_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
    zip_path = DATA_DIR / "smsspamcollection.zip"
    if not zip_path.exists():
        print("Downloading SMS Spam Collection dataset...")
        partial_zip = zip_path.with_name(zip_path.name + ".part")
        try:
            urllib.request.urlretrieve(sms_url, partial_zip)
            partial_zip.replace(zip_path)
        finally:
            # No-op after a successful rename; removes the leftover after a failure.
            partial_zip.unlink(missing_ok=True)
    partial_sms = sms_path.with_name(sms_path.name + ".part")
    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            with zf.open("SMSSpamCollection") as src, partial_sms.open("wb") as dst:
                dst.write(src.read())
        partial_sms.replace(sms_path)
    finally:
        partial_sms.unlink(missing_ok=True)
    print(f"Extracted SMS dataset to {sms_path}")
else:
    print(f"Found SMS dataset at {sms_path}")

print(f"Data directory ready: {DATA_DIR}")


Found existing graph at /home/btj/data-engineering-course/DE0/labs-final/lab3-practice/data/karate_edges.txt
Found SMS dataset at /home/btj/data-engineering-course/DE0/labs-final/lab3-practice/data/sms.tsv
Data directory ready: /home/btj/data-engineering-course/DE0/labs-final/lab3-practice/data


## 2. Helpers: tokenizers and hashing

In [3]:
import re
import hashlib
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.linalg import SparseVector, VectorUDT

# A token is a maximal run of lowercase ASCII letters and digits.
TOKEN_PATTERN = re.compile(r"[a-z0-9]+")
# Number of hash buckets in the feature space (2**18).
FEATURE_HASHSIZE = 1 << 18

def tokenize(text: str):
    """Lower-case ``text`` and return its alphanumeric tokens in order.

    Falsy input (``None`` or the empty string) yields an empty list.
    """
    return TOKEN_PATTERN.findall(text.lower()) if text else []

def make_bigrams(tokens):
    """Return every pair of adjacent tokens joined by an underscore, in input order."""
    return [f"{left}_{right}" for left, right in zip(tokens, tokens[1:])]

def hash_token(token: str) -> int:
    """Map ``token`` to a bucket index in ``[0, FEATURE_HASHSIZE)``.

    The bucket is the MD5 digest of the token's UTF-8 bytes, read as a big-endian
    integer, modulo ``FEATURE_HASHSIZE``.
    """
    digest = hashlib.md5(token.encode("utf-8")).digest()
    return int.from_bytes(digest, "big") % FEATURE_HASHSIZE

def featurize(text: str) -> SparseVector:
    """Hash the unigrams and bigrams of ``text`` into a sparse count vector.

    Counting is delegated to ``featurize_counts`` so that this MLlib feature vector
    and the dictionary features used by the manual SGD cell come from the same code.

    Returns a ``SparseVector`` of size ``FEATURE_HASHSIZE`` whose indices are the
    bucket ids in ascending order and whose values are the per-bucket counts.
    Text that yields no tokens produces an all-zero vector.
    """
    counts = featurize_counts(text)
    indices = sorted(counts)
    return SparseVector(FEATURE_HASHSIZE, indices, [counts[i] for i in indices])

def featurize_counts(text: str):
    """Return ``{bucket index: count}`` for the hashed unigrams and bigrams of ``text``.

    Counts are floats; grams that hash to the same bucket are added together.
    """
    unigrams = tokenize(text)
    histogram = {}
    for term in unigrams + make_bigrams(unigrams):
        bucket = hash_token(term)
        histogram[bucket] = histogram.get(bucket, 0.0) + 1.0
    return histogram

# Column-level wrappers around the pure-Python helpers defined in this cell.
tokenize_udf = F.udf(tokenize, returnType=ArrayType(StringType()))
featurize_udf = F.udf(featurize, returnType=VectorUDT())


## 3. Part A — Multi-Source Personalized PageRank

In [4]:
from operator import add
from contextlib import redirect_stdout
from io import StringIO
from pyspark.sql import functions as F

# Read the edge list, dropping blank lines and '#' comment lines. The comment test
# ignores leading whitespace so an indented "# a b" line is not parsed as an edge.
edges_rdd = spark.sparkContext.textFile(str(karate_path)).filter(
    lambda line: line.strip() and not line.lstrip().startswith("#")
)
# Keep only lines with exactly two whitespace-separated fields: (source, target).
edges_pairs = (
    edges_rdd
    .map(lambda line: line.strip().split())
    .filter(lambda parts: len(parts) == 2)
    .map(lambda parts: (parts[0], parts[1]))
)

# NOTE(review): the order of `nodes` is whatever distinct().collect() returns;
# confirm that ordering is stable before relying on positional picks from it.
nodes = edges_pairs.flatMap(lambda kv: kv).distinct().collect()
# node -> list of unique out-neighbours, first-seen order preserved.
adjacency_map = edges_pairs.groupByKey().mapValues(lambda nbrs: list(dict.fromkeys(nbrs))).collectAsMap()
# Nodes that only ever appear as a target get an empty neighbour list.
for node in nodes:
    adjacency_map.setdefault(node, [])

adjacency_rdd = spark.sparkContext.parallelize(list(adjacency_map.items())).cache()
nodes_rdd = adjacency_rdd.keys().cache()

# Walk parameters: alpha weights the propagated mass, (1 - alpha) the teleport mass.
alpha = 0.85
num_iters = 10

# Sources: the 1st, 3rd and 5th collected node when at least five nodes exist,
# otherwise just the first node.
if len(nodes) >= 5:
    sources = [nodes[index] for index in (0, 2, 4)]
else:
    sources = nodes[:1]
source_set = set(sources)
initial_mass = 1.0 / len(source_set)
k = min(len(nodes), 10)

# All starting mass is split evenly across the sources; every other node starts at 0.
ranks = nodes_rdd.map(lambda node: (node, initial_mass) if node in source_set else (node, 0.0))
print(f"Running PPR with alpha={alpha}, iterations={num_iters}, sources={sources}")

# Power iteration for personalized PageRank.
# Each iteration runs several Spark actions (two sums and a takeOrdered). The RDDs
# feeding more than one action are cached so each action does not replay the whole
# lineage of every earlier iteration; caches from the previous step are released
# once the new ranks have been materialised.
for iteration in range(1, num_iters + 1):
    previous_ranks = ranks
    # (node, (neighbours, rank)); reused by the dangling sum and by the contributions.
    joined = adjacency_rdd.join(previous_ranks).cache()
    # Mass sitting on nodes without out-links; it is redistributed to the sources below.
    dangling_mass = joined.filter(lambda kv: len(kv[1][0]) == 0).map(lambda kv: kv[1][1]).sum()
    # Every node with out-links splits its rank evenly among its neighbours.
    contribs = (
        joined
        .flatMap(lambda kv: [] if len(kv[1][0]) == 0 else [(nbr, kv[1][1] / len(kv[1][0])) for nbr in kv[1][0]])
        .reduceByKey(add)
    )
    # Nodes that received nothing this round still need an entry (with 0.0).
    base = (
        nodes_rdd.map(lambda node: (node, 0.0))
        .leftOuterJoin(contribs)
        .mapValues(lambda pair: pair[1] if pair[1] is not None else 0.0)
    )
    teleport_mass = (1.0 - alpha) + alpha * dangling_mass
    jump_mass = teleport_mass / len(source_set)
    # Cached: consumed by the normalisation sum and again when building the new ranks.
    unnormalized = base.map(
        lambda kv: (kv[0], alpha * kv[1] + (jump_mass if kv[0] in source_set else 0.0))
    ).cache()
    total_mass = unnormalized.values().sum()
    ranks = unnormalized.mapValues(lambda value: value / total_mass).cache()
    # takeOrdered scans every partition, which also fills the cache for `ranks`.
    preview = ranks.takeOrdered(3, key=lambda kv: -kv[1])
    print(f"Iteration {iteration:02d} | total_mass={total_mass:.6f} | preview={preview}")
    # The new ranks are cached, so the intermediates of this step can be dropped.
    for stale_rdd in (joined, unnormalized, previous_ranks):
        stale_rdd.unpersist()

# Top-k nodes by score, as a DataFrame sorted by descending score.
ppr_topk = ranks.takeOrdered(k, key=lambda kv: -kv[1])
ppr_df = spark.createDataFrame(ppr_topk, schema=["node", "score"]).orderBy(F.desc("score"))

output_ppr_path = OUTPUTS_DIR / "ppr_topk.csv"
ppr_df.toPandas().to_csv(output_ppr_path, index=False)
print(f"Saved top-{k} PPR scores to {output_ppr_path}")

# Capture the text that explain() prints and store it under proof/.
plan_buffer = StringIO()
with redirect_stdout(plan_buffer):
    ppr_df.explain("formatted")
plan_path = PROOF_DIR / "plan_ppr.txt"
plan_path.write_text(plan_buffer.getvalue(), encoding="utf-8")
# Finish on a print so the cell does not display write_text's character count.
print(f"Saved PPR query plan to {plan_path}")



[Stage 0:>                                                          (0 + 2) / 2]


                                                                                

Running PPR with alpha=0.85, iterations=10, sources=['3', '5', '7']



[Stage 4:>                                                         (0 + 8) / 16]








                                                                                








                                                                                



                                                                                

Iteration 01 | total_mass=1.000000 | preview=[('8', 0.2361111111111111), ('6', 0.18888888888888888), ('7', 0.14444444444444446)]








                                                                                














                                                                                



                                                                                

Iteration 02 | total_mass=1.000000 | preview=[('1', 0.2349305555555556), ('9', 0.180625), ('7', 0.17120370370370372)]









                                                                                
















                                                                                






                                                                                

Iteration 03 | total_mass=1.000000 | preview=[('10', 0.14952719907407408), ('2', 0.14332928240740742), ('3', 0.1241454475308642)]














                                                                                

















                                                                                






                                                                                

Iteration 04 | total_mass=1.000000 | preview=[('1', 0.19179332883230454), ('7', 0.1319047582304527), ('3', 0.12387692579732512)]



















                                                                                

























                                                                                






                                                                                

Iteration 05 | total_mass=1.000000 | preview=[('7', 0.14502875795717596), ('1', 0.12347244909550756), ('3', 0.12045327369470166)]





















                                                                                































                                                                                








                                                                                

Iteration 06 | total_mass=1.000000 | preview=[('7', 0.14515809592549725), ('1', 0.14200011305753815), ('8', 0.11120454600225586)]



























                                                                                





































                                                                                











                                                                                

Iteration 07 | total_mass=1.000000 | preview=[('1', 0.1553613090023497), ('7', 0.13506524349475568), ('3', 0.10938526687363735)]

































                                                                                









































                                                                                











                                                                                

Iteration 08 | total_mass=1.000000 | preview=[('1', 0.15352265900708728), ('7', 0.1341868014205485), ('3', 0.11525402134675089)]








































                                                                                












































                                                                                














                                                                                

Iteration 09 | total_mass=1.000000 | preview=[('1', 0.14792829059900203), ('7', 0.13679026852550694), ('3', 0.11584732305605547)]





































                                                                                
















































                                                                                














                                                                                

Iteration 10 | total_mass=1.000000 | preview=[('1', 0.14799371768224318), ('7', 0.13877581618530058), ('3', 0.11381387830851719)]
















                                                                                

Saved top-10 PPR scores to /home/btj/data-engineering-course/DE0/labs-final/lab3-practice/outputs/ppr_topk.csv


1323

## 4. Part B — Spam classification (baseline with MLlib)

In [5]:
import os
from pyspark.sql import functions as F, types as T
from pyspark.sql.types import DoubleType
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Two string columns per line: the raw label and the message text.
schema = T.StructType([
    T.StructField("label_raw", T.StringType(), False),
    T.StructField("text", T.StringType(), False),
])

# The file is tab-separated. The separator is written as the "\t" escape rather than
# a literal tab character, which is invisible and easily turned into spaces by editors.
sms_df = spark.read.csv(str(sms_path), sep="\t", schema=schema)
sms_df = sms_df.filter(F.col("text").isNotNull())
# Binary target: 1.0 for "spam", 0.0 for every other label value.
sms_df = sms_df.withColumn("label", F.when(F.col("label_raw") == "spam", F.lit(1.0)).otherwise(F.lit(0.0)))

# Hash each message once and cache the result; both splits are derived from it.
features_df = sms_df.select("label", "text", featurize_udf("text").alias("features")).cache()

train_df, test_df = features_df.randomSplit([0.8, 0.2], seed=42)
print(f"Training instances: {train_df.count()}, Validation instances: {test_df.count()}")

# Estimator settings are kept together in one mapping so they are easy to find and tune.
lr_settings = dict(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    probabilityCol="probability",
    rawPredictionCol="rawPrediction",
    regParam=0.01,
    elasticNetParam=0.0,
    maxIter=80,
)
lr = LogisticRegression(**lr_settings)

# Fit on the training split; score the held-out split once and cache the scored rows.
lr_model = lr.fit(train_df)
predictions = lr_model.transform(test_df).cache()

evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="label",
    rawPredictionCol="rawPrediction",
)
auc = evaluator.evaluate(predictions)
print(f"Validation AUC: {auc:.4f}")

# Element 1 of the probability vector becomes prob_spam; a null vector maps to 0.0.
extract_prob_udf = F.udf(lambda v: float(v[1]) if v is not None else 0.0, DoubleType())
with_probs = predictions.withColumn("prob_spam", extract_prob_udf(F.col("probability")))

# Rows at or above the threshold are predicted 1.0, all others 0.0.
threshold = 0.5
is_flagged = F.col("prob_spam") >= threshold
scored = with_probs.withColumn("pred_label", F.when(is_flagged, F.lit(1.0)).otherwise(F.lit(0.0)))


def _count_rows(label_value, pred_value):
    """Aggregate expression counting rows with the given label / pred_label pair."""
    matches = (F.col("label") == label_value) & (F.col("pred_label") == pred_value)
    return F.sum(F.when(matches, 1).otherwise(0))


confusion_rows = scored.agg(
    _count_rows(1.0, 1.0).alias("tp"),
    _count_rows(0.0, 1.0).alias("fp"),
    _count_rows(1.0, 0.0).alias("fn"),
    F.count("*").alias("total"),
).collect()
agg = confusion_rows[0]

tp, fp, fn = (float(agg[name]) for name in ("tp", "fp", "fn"))

# Either metric falls back to 0.0 when its denominator is zero.
if tp + fp:
    precision = tp / (tp + fp)
else:
    precision = 0.0
if tp + fn:
    recall = tp / (tp + fn)
else:
    recall = 0.0

# Markdown report for the MLlib baseline.
metrics_lines = [
    "# SMS Spam Classification Metrics",
    "",
    f"AUC: {auc:.4f}",
    f"Threshold: {threshold}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    "",
    "## Logistic Regression Summary",
    f"Intercept: {lr_model.intercept:.4f}",
    f"Non-zero coefficients: {sum(1 for v in lr_model.coefficients if v != 0.0)}",
    f"Feature space size: {FEATURE_HASHSIZE}",
]

metrics_path = OUTPUTS_DIR / "sms_metrics.md"
# Text-mode writes and print() already translate "\n" to the platform line ending,
# so joining with os.linesep would produce "\r\r\n" on Windows.
separator = "\n"
# UTF-8 is given explicitly because the manual-SGD cell appends to this file as UTF-8.
metrics_path.write_text(separator.join(metrics_lines) + separator, encoding="utf-8")
print(separator.join(metrics_lines))



[Stage 544:>                                                        (0 + 1) / 1]


                                                                                

Training instances: 4503, Validation instances: 1071



[Stage 551:>                                                        (0 + 1) / 1]

                                                                                


[Stage 619:>                                                        (0 + 1) / 1]


                                                                                

Validation AUC: 0.9960


# SMS Spam Classification Metrics

AUC: 0.9960
Threshold: 0.5
Precision: 1.0000
Recall: 0.6667

## Logistic Regression Summary
Intercept: -6.7134
Non-zero coefficients: 40793
Feature space size: 262144


## 5. Part B — Spam classification (manual SGD, optional)

In [6]:
import os
import math
import random

def _collect_records(frame):
    """Collect ``frame`` to the driver as ``(float label, hashed-count dict)`` pairs."""
    rows = frame.select("label", "text").collect()
    return [(float(row.label), featurize_counts(row.text)) for row in rows]


train_records = _collect_records(train_df)
test_records = _collect_records(test_df)

# Model state (sparse weights keyed by bucket index, plus a bias) and SGD settings.
weights, bias = {}, 0.0
learning_rate = 0.1
reg = 1e-5
epochs = 5


def sigmoid(x):
    """Logistic function ``1 / (1 + e**-x)``, safe for arbitrarily large ``|x|``.

    ``math.exp`` raises ``OverflowError`` for arguments above roughly 709, so the
    exponential is always taken of a non-positive number: ``exp(-x)`` when
    ``x >= 0`` and ``exp(x)`` otherwise. The two forms are algebraically equal.
    """
    if x >= 0:
        return 1.0 / (1.0 + math.exp(-x))
    z = math.exp(x)
    return z / (1.0 + z)


# Dedicated, seeded generator so the per-epoch shuffle order, and therefore the
# learned weights and reported metrics, are the same on every run.
shuffle_rng = random.Random(42)

# Plain SGD on the logistic loss, one example at a time.
for epoch in range(epochs):
    shuffle_rng.shuffle(train_records)
    for label, features in train_records:
        # Linear score over the features present in this example only.
        dot = bias
        for idx, value in features.items():
            dot += weights.get(idx, 0.0) * value
        pred = sigmoid(dot)
        error = pred - label
        # Gradient step with an L2 term (reg * w) on each touched weight.
        for idx, value in features.items():
            w = weights.get(idx, 0.0)
            grad = error * value + reg * w
            weights[idx] = w - learning_rate * grad
        # The bias receives the same update, including the reg term.
        bias -= learning_rate * (error + reg * bias)
    # Shrink the step size by 10% after every epoch.
    learning_rate *= 0.9
    print(f"Epoch {epoch+1} completed; learning_rate={learning_rate:.4f}")

def _linear_score(features):
    """Bias plus the weighted sum of the given ``{index: value}`` features."""
    total = bias
    for idx, value in features.items():
        total += weights.get(idx, 0.0) * value
    return total


# (true label, predicted probability) for every held-out record.
predictions_manual = [
    (label, sigmoid(_linear_score(features)))
    for label, features in test_records
]

# Confusion counts at a fixed probability cut-off.
threshold_manual = 0.5
tp = fp = fn = tn = 0
for label, prob in predictions_manual:
    flagged = prob >= threshold_manual
    if flagged and label == 1.0:
        tp += 1
    elif flagged and label == 0.0:
        fp += 1
    elif not flagged and label == 1.0:
        fn += 1
    else:
        tn += 1

# Either metric falls back to 0.0 when its denominator is zero.
predicted_positive = tp + fp
actual_positive = tp + fn
if predicted_positive:
    precision_manual = tp / predicted_positive
else:
    precision_manual = 0.0
if actual_positive:
    recall_manual = tp / actual_positive
else:
    recall_manual = 0.0

# AUC from the rank-sum (Mann-Whitney U) statistic over the ascending scores.
# Records with equal scores share the average of the ranks they occupy, so the
# result does not depend on the order in which tied records happen to appear.
sorted_scores = sorted(predictions_manual, key=lambda pair: pair[1])
pos = sum(1 for label, _ in sorted_scores if label == 1.0)
neg = len(sorted_scores) - pos
rank_sum = 0.0
group_start = 0
while group_start < len(sorted_scores):
    # Extend group_end to one past the last record tied with group_start.
    group_end = group_start
    while group_end < len(sorted_scores) and sorted_scores[group_end][1] == sorted_scores[group_start][1]:
        group_end += 1
    # The group occupies 1-based ranks group_start + 1 .. group_end.
    average_rank = (group_start + 1 + group_end) / 2.0
    tied_positives = sum(1 for label, _ in sorted_scores[group_start:group_end] if label == 1.0)
    rank_sum += average_rank * tied_positives
    group_start = group_end
# Undefined when either class is absent; reported as 0.0 in that case.
auc_manual = (rank_sum - pos * (pos + 1) / 2.0) / (pos * neg) if pos and neg else 0.0

# Section appended to the metrics report for the hand-written SGD model.
manual_lines = [
    "",
    "## Manual SGD Summary",
    f"Epochs: {epochs}",
    f"Learning rate (final): {learning_rate:.4f}",
    f"AUC: {auc_manual:.4f}",
    f"Precision (threshold {threshold_manual}): {precision_manual:.4f}",
    f"Recall (threshold {threshold_manual}): {recall_manual:.4f}",
]

# The file is opened in text mode, which already translates "\n" to the platform
# line ending; joining with os.linesep would produce "\r\r\n" on Windows.
separator = "\n"
with open(OUTPUTS_DIR / "sms_metrics.md", "a", encoding="utf-8") as handle:
    handle.write(separator.join(manual_lines) + separator)

print(separator.join(manual_lines))


Epoch 1 completed; learning_rate=0.0900


Epoch 2 completed; learning_rate=0.0810


Epoch 3 completed; learning_rate=0.0729


Epoch 4 completed; learning_rate=0.0656


Epoch 5 completed; learning_rate=0.0590

## Manual SGD Summary
Epochs: 5
Learning rate (final): 0.0590
AUC: 0.9863
Precision (threshold 0.5): 0.9926
Recall (threshold 0.5): 0.9184


## 6. Spark UI evidence
Open http://localhost:4040 during runs. Capture Files Read, Input Size, Shuffle Read/Write and store screenshots under `proof/`.

## 7. Environment and reproducibility

In [7]:
import os
import subprocess


def get_java_version():
    """Return the first line reported by ``java -version``.

    stderr is merged into stdout before decoding. Any failure (launching the
    command, decoding, or empty output) is reported as an
    ``"Unavailable (<error>)"`` string instead of raising.
    """
    command = ["java", "-version"]
    try:
        raw = subprocess.check_output(command, stderr=subprocess.STDOUT)
        report = raw.decode("utf-8").strip()
        first_line = report.splitlines()[0]
        return first_line
    except Exception as exc:
        return f"Unavailable ({exc})"

# Report the Java runtime, then every SparkConf entry in sorted order.
java_version = get_java_version()
print(f"Java: {java_version}")

print("Spark configuration (selected):")
conf_items = list(spark.sparkContext.getConf().getAll())
conf_items.sort()
for conf_key, conf_value in conf_items:
    print(f" - {conf_key} = {conf_value}")

# Markdown snapshot of the software environment and the full Spark configuration.
env_lines = [
    "# Environment Summary",
    "",
    f"- Python: {sys.version.split()[0]}",
    f"- Spark: {spark.version}",
    f"- PySpark: {pyspark.__version__}",
    f"- Java: {java_version}",
    f"- OS: {platform.platform()}",
    "",
    "## Spark Configuration",
]

env_lines.extend(f"- {k} = {v}" for k, v in conf_items)

# write_text opens the file in text mode, which already translates "\n" to the
# platform line ending; joining with os.linesep would produce "\r\r\n" on Windows.
newline = "\n"
ENV_PATH = BASE_DIR / "ENV.md"
ENV_PATH.write_text(newline.join(env_lines) + newline, encoding="utf-8")

print(f"Environment summary saved to {ENV_PATH}")


Java: openjdk version "21.0.8" 2025-07-15
Spark configuration (selected):


 - spark.app.id = local-1759965811807
 - spark.app.name = BDA-PracticeLab03
 - spark.app.startTime = 1759965810192
 - spark.app.submitTime = 1759965809434
 - spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-modules=jdk.incubator.vector --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.secur