<a href="https://colab.research.google.com/github/Rachelllle/Spark-Core/blob/main/Session06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark import SparkContext

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as F

# Get the active Spark session, or create one named for these join exercises.
spark = (
    SparkSession.builder
    .appName("ExercicesJointures")
    .getOrCreate()
)


# Users table: one (user_id, country) row per user; both columns are non-nullable.
users_schema = (
    StructType()
    .add("user_id", IntegerType(), False)
    .add("country", StringType(), False)
)

users_data = [(1, "FR"), (2, "FR"), (3, "UK"), (4, "DE"), (5, "ES")]

users = spark.createDataFrame(users_data, schema=users_schema)

# Transactions table, deliberately unbalanced: user 1 gets 50 identical rows,
# every other user gets 5. Each spec below is (user_id, amount, number_of_rows).
transactions_data = [
    (user_id, amount)
    for user_id, amount, n_rows in [
        (1, 100, 50),
        (2, 200, 5),
        (3, 150, 5),
        (4, 50, 5),
        (5, 300, 5),
    ]
    for _ in range(n_rows)
]

# Both columns are non-nullable integers.
transactions_schema = (
    StructType()
    .add("user_id", IntegerType(), False)
    .add("amount", IntegerType(), False)
)

transactions = spark.createDataFrame(transactions_data, schema=transactions_schema)

In [30]:
# Inner join on the shared key: every transaction row is paired with its user's country.
joined_df = users.join(transactions, on="user_id", how="inner")
# DataFrame.show() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output.
joined_df.show(5)

+-------+-------+------+
|user_id|country|amount|
+-------+-------+------+
|      1|     FR|   100|
|      1|     FR|   100|
|      1|     FR|   100|
|      1|     FR|   100|
|      1|     FR|   100|
+-------+-------+------+
only showing top 5 rows
None


In [31]:
# Total transaction amount per country, aggregated over the joined rows.
res_exo = joined_df.groupBy("country").agg(F.sum("amount").alias("total_amount"))
# show() does its own printing and returns None; no print() wrapper needed.
res_exo.show(5)

+-------+------------+
|country|total_amount|
+-------+------------+
|     DE|         250|
|     ES|        1500|
|     FR|        6000|
|     UK|         750|
+-------+------------+

None


In [22]:
# Broadcast join: hint Spark to broadcast the small `users` table (5 rows)
# rather than the larger, skewed `transactions` table. `users` stays on the
# left side of the join, so the output columns remain user_id, country, amount.
broadcast_joined_df = F.broadcast(users).join(transactions, on="user_id", how="inner")
# show() prints the rows itself and returns None; print() around it only adds "None".
broadcast_joined_df.show(5)

+-------+-------+------+
|user_id|country|amount|
+-------+-------+------+
|      1|     FR|   100|
|      1|     FR|   100|
|      1|     FR|   100|
|      1|     FR|   100|
|      1|     FR|   100|
+-------+-------+------+
only showing top 5 rows
None


In [32]:
# Skew diagnostic: number of transactions per user_id, heaviest keys first.
rows_per_user = transactions.groupBy("user_id").count()
skew_check = rows_per_user.sort(F.col("count").desc())
skew_check.show()

+-------+-----+
|user_id|count|
+-------+-----+
|      1|   50|
|      3|    5|
|      5|    5|
|      4|    5|
|      2|    5|
+-------+-----+



In [34]:
# Repartition the transactions on the join key, then join them back to users.
# NOTE(review): partitioning by user_id keeps all rows of a given user_id in the
# same partition, so this presumably does not spread out the heavy user_id seen
# in skew_check; confirm whether salting or AQE skew handling was intended.
transactions_repartitioned = transactions.repartition(F.col("user_id"))
final_join = users.join(transactions_repartitioned, "user_id", "inner")
final_join.show(5)

+-------+-------+------+
|user_id|country|amount|
+-------+-------+------+
|      1|     FR|   100|
|      1|     FR|   100|
|      1|     FR|   100|
|      1|     FR|   100|
|      1|     FR|   100|
+-------+-------+------+
only showing top 5 rows
