<a href="https://colab.research.google.com/github/Rachelllle/Spark-Core/blob/main/Session03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
from pyspark import SparkContext

In [20]:
from pyspark import SparkConf # Import SparkConf

# getOrCreate is used so that re-running this cell copes with a context
# having already been created.
conf = (
    SparkConf()
    .setAppName("MonApp")
    .setMaster("local")
)
sc = SparkContext.getOrCreate(conf=conf)

# Sample (user, amount) records, held as a plain list of tuples.
data = [
    ("Alice", 100), ("Bob", 50),
    ("Alice", 25), ("Bob", 75),
    ("Charlie", 200),
]

# Create the RDD from the list of tuples.
rdd = sc.parallelize(data)

In [21]:
# Sum of the amounts per user: group all values under their key,
# then add up each group's values.
grouped_by_user = rdd.groupByKey()
rdd_group = grouped_by_user.mapValues(sum)

print(rdd_group.collect())

[('Alice', 125), ('Bob', 125), ('Charlie', 200)]


In [22]:
# Sum of the amounts per user, merging the values of a key two at a time.
rdd_reduce = rdd.reduceByKey(
    lambda total, amount: total + amount
)
print(rdd_reduce.collect())


[('Alice', 125), ('Bob', 125), ('Charlie', 200)]


In [23]:
# 2. Sort by key (alphabetical order, i.e. ascending)
rdd_sorted = rdd_reduce.sortByKey(ascending=True)

# Display the result
print(rdd_sorted.collect())

[('Alice', 125), ('Bob', 125), ('Charlie', 200)]


In [24]:
# Lineage (DAG) of the groupByKey pipeline; toDebugString() yields bytes,
# so it is decoded before printing.
print("=== groupByKey DAG ===")
group_lineage = rdd_group.toDebugString()
print(group_lineage.decode("utf-8"))

# Lineage (DAG) of the reduceByKey pipeline, printed the same way.
print("=== reduceByKey DAG ===")
reduce_lineage = rdd_reduce.toDebugString()
print(reduce_lineage.decode("utf-8"))

=== groupByKey DAG ===
(1) PythonRDD[57] at collect at /tmp/ipython-input-3665508038.py:4 []
 |  MapPartitionsRDD[56] at mapPartitions at PythonRDD.scala:168 []
 |  ShuffledRDD[55] at partitionBy at NativeMethodAccessorImpl.java:0 []
 +-(1) PairwiseRDD[54] at groupByKey at /tmp/ipython-input-3665508038.py:2 []
    |  PythonRDD[53] at groupByKey at /tmp/ipython-input-3665508038.py:2 []
    |  ParallelCollectionRDD[52] at readRDDFromFile at PythonRDD.scala:297 []
=== reduceByKey DAG ===
(1) PythonRDD[62] at collect at /tmp/ipython-input-3212256562.py:3 []
 |  MapPartitionsRDD[61] at mapPartitions at PythonRDD.scala:168 []
 |  ShuffledRDD[60] at partitionBy at NativeMethodAccessorImpl.java:0 []
 +-(1) PairwiseRDD[59] at reduceByKey at /tmp/ipython-input-3212256562.py:2 []
    |  PythonRDD[58] at reduceByKey at /tmp/ipython-input-3212256562.py:2 []
    |  ParallelCollectionRDD[52] at readRDDFromFile at PythonRDD.scala:297 []


In [25]:
# Compute the average per user (without groupByKey)

# Pair every amount with a count of 1 -> (amount, 1).
rdd_tuples = rdd.mapValues(lambda amount: (amount, 1))
# Add the pairs component-wise per key -> (sum of amounts, number of amounts).
rdd_combined = rdd_tuples.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
# Divide the sum by the count to get the mean.
rdd_avg = rdd_combined.mapValues(lambda sum_count: sum_count[0] / sum_count[1])

print(rdd_avg.sortByKey().collect())

[('Alice', 62.5), ('Bob', 62.5), ('Charlie', 200.0)]
