In [13]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf

# Build the notebook's Spark session (or reuse one that is already active),
# requesting spark.driver.memory = 5g.
spark = (
    SparkSession.builder
    .appName('Jupyter')
    .config('spark.driver.memory', '5g')
    .getOrCreate()
)

In [14]:
# Read back the driver-memory setting the running session reports.
driver_memory_setting = spark.conf.get('spark.driver.memory')
driver_memory_setting

'5g'

In [15]:
# Load the matches CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
matches = spark.read.csv(
    '/home/iceberg/data/matches.csv',
    header=True,
    inferSchema=True,
)

# Display the inferred schema as the cell output.
matches.schema


StructType([StructField('match_id', StringType(), True), StructField('mapid', StringType(), True), StructField('is_team_game', BooleanType(), True), StructField('playlist_id', StringType(), True), StructField('game_variant_id', StringType(), True), StructField('is_match_over', BooleanType(), True), StructField('completion_date', TimestampType(), True), StructField('match_duration', StringType(), True), StructField('game_mode', StringType(), True), StructField('map_variant_id', StringType(), True)])

In [16]:
# Load the per-player match details CSV: the first row supplies the column
# names and Spark infers each column's type from the data.
match_details = spark.read.csv(
    '/home/iceberg/data/match_details.csv',
    header=True,
    inferSchema=True,
)

# Display the inferred schema as the cell output.
match_details.schema

                                                                                

StructType([StructField('match_id', StringType(), True), StructField('player_gamertag', StringType(), True), StructField('previous_spartan_rank', IntegerType(), True), StructField('spartan_rank', IntegerType(), True), StructField('previous_total_xp', IntegerType(), True), StructField('total_xp', IntegerType(), True), StructField('previous_csr_tier', IntegerType(), True), StructField('previous_csr_designation', IntegerType(), True), StructField('previous_csr', IntegerType(), True), StructField('previous_csr_percent_to_next_tier', IntegerType(), True), StructField('previous_csr_rank', IntegerType(), True), StructField('current_csr_tier', IntegerType(), True), StructField('current_csr_designation', IntegerType(), True), StructField('current_csr', IntegerType(), True), StructField('current_csr_percent_to_next_tier', IntegerType(), True), StructField('current_csr_rank', IntegerType(), True), StructField('player_rank_on_team', IntegerType(), True), StructField('player_finished', BooleanType(

In [17]:
# Recreate the Iceberg target table from scratch so this cell can be re-run.
# The table is partitioned by month of completion_date and by a 16-way bucket
# of match_id.
spark.sql('DROP TABLE IF EXISTS bootcamp.matches_bucketed')
spark.sql('''
    CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
        match_id STRING,
        is_team_game BOOLEAN,
        playlist_id STRING,
        completion_date TIMESTAMP
    )
    USING iceberg
    PARTITIONED BY (months(completion_date), bucket(16, match_id))
''')

# Append only the four columns the table declares.
# The partition layout comes from the PARTITIONED BY clause above:
# DataFrameWriterV2.partitionedBy() only affects create()/replace()/
# createOrReplace(), so it is deliberately not chained before append().
(
    matches
    .select('match_id', 'is_team_game', 'playlist_id', 'completion_date')
    .writeTo('bootcamp.matches_bucketed')
    .append()
)


                                                                                

In [18]:
# Drop any previous copy, then create the table directly from the DataFrame,
# partitioned by a 16-way bucket of match_id.
spark.sql('DROP TABLE IF EXISTS bootcamp.match_details_bucketed')
(
    match_details
    .writeTo('bootcamp.match_details_bucketed')
    .partitionedBy(sf.bucket(16, 'match_id'))
    .create()
)

                                                                                

In [None]:
%%sql
-- Inspect the `files` metadata table of bootcamp.matches_bucketed.
SELECT *
FROM bootcamp.matches_bucketed.files

In [20]:
# Turn off automatic broadcast joins (-1 disables the size threshold), then
# read the setting back to confirm it.
BROADCAST_THRESHOLD_KEY = 'spark.sql.autoBroadcastJoinThreshold'
spark.conf.set(BROADCAST_THRESHOLD_KEY, '-1')
spark.conf.get(BROADCAST_THRESHOLD_KEY)

'-1'

In [24]:
# Print the physical plan for joining the two bucketed tables on match_id,
# restricted to matches whose completion_date equals 2016-01-01.
#
# The date predicate only touches `mb`, so it lives in WHERE rather than ON:
# for this inner join the result and plan are the same, but the join key stays
# separate from row filtering and the filter keeps its meaning if the join
# type is ever changed to an outer join.
#
# NOTE(review): completion_date is declared TIMESTAMP, so this equality only
# matches rows stamped exactly 2016-01-01 00:00:00 — confirm that is intended
# rather than "any time on that day".
spark.sql('''
    SELECT *
    FROM bootcamp.matches_bucketed mb
    JOIN bootcamp.match_details_bucketed mdb
        ON mb.match_id = mdb.match_id
    WHERE mb.completion_date = DATE('2016-01-01')
''').explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [match_id#1213], [match_id#1217], Inner
   :- Sort [match_id#1213 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(match_id#1213, 200), ENSURE_REQUIREMENTS, [plan_id=403]
   :     +- Filter (completion_date#1216 = 2016-01-01 00:00:00)
   :        +- BatchScan demo.bootcamp.matches_bucketed[match_id#1213, is_team_game#1214, playlist_id#1215, completion_date#1216] demo.bootcamp.matches_bucketed (branch=null) [filters=completion_date IS NOT NULL, completion_date = 1451606400000000, match_id IS NOT NULL, groupedBy=] RuntimeFilters: []
   +- Sort [match_id#1217 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(match_id#1217, 200), ENSURE_REQUIREMENTS, [plan_id=404]
         +- BatchScan demo.bootcamp.match_details_bucketed[match_id#1217, player_gamertag#1218, previous_spartan_rank#1219, spartan_rank#1220, previous_total_xp#1221, total_xp#1222, previous_csr_tier#1223, previous_csr_designation#12

In [25]:
# Print the physical plan for joining the two bucketed tables with no join
# condition at all.
# NOTE(review): with no ON clause Spark plans this as a CartesianProduct (see
# the plan printed by this cell) — confirm the missing condition is intentional.
unconditioned_join_sql = '''
    SELECT *
    FROM bootcamp.matches_bucketed mb
    JOIN bootcamp.match_details_bucketed mdb
'''
spark.sql(unconditioned_join_sql).explain()

== Physical Plan ==
CartesianProduct
:- *(1) ColumnarToRow
:  +- BatchScan demo.bootcamp.matches_bucketed[match_id#1344, is_team_game#1345, playlist_id#1346, completion_date#1347] demo.bootcamp.matches_bucketed (branch=null) [filters=, groupedBy=] RuntimeFilters: []
+- *(2) ColumnarToRow
   +- BatchScan demo.bootcamp.match_details_bucketed[match_id#1348, player_gamertag#1349, previous_spartan_rank#1350, spartan_rank#1351, previous_total_xp#1352, total_xp#1353, previous_csr_tier#1354, previous_csr_designation#1355, previous_csr#1356, previous_csr_percent_to_next_tier#1357, previous_csr_rank#1358, current_csr_tier#1359, current_csr_designation#1360, current_csr#1361, current_csr_percent_to_next_tier#1362, current_csr_rank#1363, player_rank_on_team#1364, player_finished#1365, player_average_life#1366, player_total_kills#1367, player_total_headshots#1368, player_total_weapon_damage#1369, player_total_shots_landed#1370, player_total_melee_kills#1371, ... 12 more fields] demo.bootcamp.match_

In [29]:
# Count the rows currently stored in the bucketed matches table.
matches_bucketed_table = spark.table('bootcamp.matches_bucketed')
matches_bucketed_table.count()

24025

In [36]:
# Print the physical plan for joining match details to matches when the
# matches side carries an explicit broadcast hint.
details_side = spark.table('bootcamp.match_details_bucketed').alias('mdb')
broadcast_matches = sf.broadcast(spark.table('bootcamp.matches_bucketed')).alias('mb')
same_match = sf.col('mb.match_id') == sf.col('mdb.match_id')

(
    details_side
    .join(broadcast_matches, same_match)
    .where("mb.completion_date = DATE('2016-01-01')")
    .explain()
)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastHashJoin [match_id#2164], [match_id#2236], Inner, BuildRight, false
   :- BatchScan demo.bootcamp.match_details_bucketed[match_id#2164, player_gamertag#2165, previous_spartan_rank#2166, spartan_rank#2167, previous_total_xp#2168, total_xp#2169, previous_csr_tier#2170, previous_csr_designation#2171, previous_csr#2172, previous_csr_percent_to_next_tier#2173, previous_csr_rank#2174, current_csr_tier#2175, current_csr_designation#2176, current_csr#2177, current_csr_percent_to_next_tier#2178, current_csr_rank#2179, player_rank_on_team#2180, player_finished#2181, player_average_life#2182, player_total_kills#2183, player_total_headshots#2184, player_total_weapon_damage#2185, player_total_shots_landed#2186, player_total_melee_kills#2187, ... 12 more fields] demo.bootcamp.match_details_bucketed (branch=null) [filters=match_id IS NOT NULL, groupedBy=] RuntimeFilters: [dynamicpruningexpression(match_id#2164 IN dynamicpruning#2369)