In [49]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import sys

#--------------
#   QUERY 3
#--------------

#The chances of winning given the first hand
#It helps to maximize the profit and minimize the loss by evaluating the initial hand

spark = SparkSession.builder \
    .appName("TGVD_GenericQuery") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

path_training = "CardsParquetData/trained_blackjack.parquet"
path_match = "CardsParquetData/played_blackjack.parquet"

df_train = spark.read.parquet(path_training)
df_play = spark.read.parquet(path_match)

print(df_play.count())
print(df_train.count())

1000
2000


In [50]:
from pyspark.sql.functions import floor
from pyspark.sql.functions import monotonically_increasing_id

df_train = df_train.withColumn("index", (monotonically_increasing_id() + 1))
df_play = df_play.withColumn("index", (monotonically_increasing_id() + 1))

#We do the chunk part just for the train (to analize the behaviour change through the iterations of the training)
CHUNK_SIZE = 500
df_chunks_train = df_train.withColumn("Chunk Number", floor(col("index")/CHUNK_SIZE))

In [51]:
#Funtion to detect if we have a win, draw or a lose
from pyspark.sql.functions import coalesce, when

rev_hand_cols = ["Hand 7", "Hand 6", "Hand 5", "Hand 4", "Hand 3", "Hand 2", "Hand 1", "Hand 0"]
df_result_train = df_chunks_train.withColumn("Final_Hand", coalesce(*[col(c) for c in rev_hand_cols]))
df_result_train.show()

df_result_train = df_result_train.withColumn(
    "Result",
    when(col("Final_Hand").isNull(), "Unknown")
    .when(col("Final_Hand")[0] > 21, "Lose")
    .when(col("Final_Hand")[1] > 21, "Win")
    .when(col("Final_Hand")[0] > col("Final_Hand")[1], "Win")
    .when(col("Final_Hand")[0] < col("Final_Hand")[1], "Lose")
    .otherwise("Draw")
)
df_result_train.show()

rev_hand_cols = ["Hand 5", "Hand 4", "Hand 3", "Hand 2", "Hand 1", "Hand 0"]
df_result_play = df_play.withColumn("Final_Hand", coalesce(*[col(c) for c in rev_hand_cols]))
df_result_play.show()

df_result_play = df_result_play.withColumn(
    "Result",
    when(col("Final_Hand").isNull(), "Unknown")
    .when(col("Final_Hand")[0] > 21, "Lose")
    .when(col("Final_Hand")[1] > 21, "Win")
    .when(col("Final_Hand")[0] > col("Final_Hand")[1], "Win")
    .when(col("Final_Hand")[0] < col("Final_Hand")[1], "Lose")
    .otherwise("Draw")
)
df_result_play.show()

+-------------------+-----------+--------+--------+--------+--------+------+------+------+------+-----+------------+----------+
|          Timestamp|Shown_cards|  Hand 0|  Hand 1|  Hand 2|  Hand 3|Hand 4|Hand 5|Hand 6|Hand 7|index|Chunk Number|Final_Hand|
+-------------------+-----------+--------+--------+--------+--------+------+------+------+------+-----+------------+----------+
|2025-05-12 19:24:20|    [10, 4]| [10, 7]|[10, 11]|[10, 21]|[10, 21]|  NULL|  NULL|  NULL|  NULL|    1|           0|  [10, 21]|
|2025-05-12 19:24:20|     [4, 7]|[13, 12]|[23, 21]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|    2|           0|  [23, 21]|
|2025-05-12 19:24:20|   [-1, 10]|[14, 18]|[14, 29]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|    3|           0|  [14, 29]|
|2025-05-12 19:24:20|    [8, 12]|[20, 12]|[22, 12]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|    4|           0|  [22, 12]|
|2025-05-12 19:24:20|     [3, 6]| [3, 15]| [3, 23]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|    5| 

In [52]:
df_clean_play = df_result_play.withColumn("1st Hand Card", col("Shown_cards")[0]).select("Final_Hand", "Result", "1st Hand Card")
df_clean_play.show()
df_clean_train = df_result_train.withColumn("1st Hand Card", col("Shown_cards")[0]).select("Chunk Number", "Final_Hand", "Result", "1st Hand Card")
df_clean_train.show()

+----------+------+-------------+
|Final_Hand|Result|1st Hand Card|
+----------+------+-------------+
|  [19, 21]|  Lose|           10|
|  [15, 26]|   Win|            4|
|  [12, 26]|   Win|           12|
|  [21, 20]|   Win|           11|
|  [10, 22]|   Win|           10|
|  [10, 30]|   Win|           10|
|  [19, 14]|   Win|            8|
|  [19, 21]|  Lose|           11|
|  [15, 24]|   Win|            7|
|  [12, 28]|   Win|            8|
|  [12, 27]|   Win|            6|
|  [22, 18]|  Lose|            8|
|  [20, 10]|   Win|            8|
|  [20, 18]|   Win|            5|
|  [26, 17]|  Lose|            8|
|  [11, 26]|   Win|            6|
|  [18, 25]|   Win|            5|
|  [17, 25]|   Win|            5|
|  [12, 23]|   Win|           12|
|  [20, 21]|  Lose|           -1|
+----------+------+-------------+
only showing top 20 rows

+------------+----------+------+-------------+
|Chunk Number|Final_Hand|Result|1st Hand Card|
+------------+----------+------+-------------+
|           0|  [

In [53]:
df_clean_play = df_clean_play.groupBy("1st Hand Card", "Result").count().orderBy(["count", "1st Hand Card"], ascending = False)
df_clean_play.show()

df_clean_train = df_clean_train.groupBy("1st Hand Card", "Result", "Chunk Number").count().orderBy(["Chunk Number", "count", "1st Hand Card"], ascending = False)
df_clean_train.show()

+-------------+------+-----+
|1st Hand Card|Result|count|
+-------------+------+-----+
|            8|   Win|   66|
|           10|   Win|   63|
|           10|  Lose|   51|
|            5|  Lose|   49|
|           11|   Win|   46|
|           12|  Lose|   45|
|            6|  Lose|   42|
|            4|  Lose|   42|
|           12|   Win|   41|
|            6|   Win|   41|
|            8|  Lose|   40|
|            7|  Lose|   39|
|           11|  Lose|   34|
|            7|   Win|   34|
|            3|  Lose|   33|
|            9|  Lose|   31|
|            8|  Draw|   30|
|            4|   Win|   30|
|            3|   Win|   30|
|            2|  Lose|   30|
+-------------+------+-----+
only showing top 20 rows

+-------------+------+------------+-----+
|1st Hand Card|Result|Chunk Number|count|
+-------------+------+------------+-----+
|            2|   Win|           4|    1|
|            8|   Win|           3|   41|
|           10|   Win|           3|   39|
|            7|  Lose|    

In [54]:
from pyspark.sql.functions import sum, count

total_df = df_clean_train.groupBy("1st Hand Card", "Chunk Number").agg(sum("count").alias("TotalGames"))
wins_df = df_clean_train.filter(col("Result") == "Win").groupBy("1st Hand Card", "Chunk Number").agg(sum("count").alias("Wins"))
df_winrate_train = total_df.join(wins_df, on=["1st Hand Card", "Chunk Number"], how="left").fillna(0, subset=["Wins"]).withColumn("Winning Rate Proportion", col("Wins") / col("TotalGames")).orderBy("Winning Rate Proportion", ascending = False)
df_winrate_train.show(10)

total_df = df_clean_play.groupBy("1st Hand Card").agg(sum("count").alias("TotalGames"))
wins_df = df_clean_play.filter(col("Result") == "Win").groupBy("1st Hand Card").agg(sum("count").alias("Wins"))
df_winrate_play = total_df.join(wins_df, on=["1st Hand Card"], how="left").fillna(0, subset=["Wins"]).withColumn("Winning Rate Proportion", col("Wins") / col("TotalGames")).orderBy("Winning Rate Proportion", ascending = False)
df_winrate_play.show(1000)

+-------------+------------+----------+----+-----------------------+
|1st Hand Card|Chunk Number|TotalGames|Wins|Winning Rate Proportion|
+-------------+------------+----------+----+-----------------------+
|            2|           4|         1|   1|                    1.0|
|            0|           0|         7|   5|     0.7142857142857143|
|            1|           3|        15|  10|     0.6666666666666666|
|           11|           3|        38|  25|     0.6578947368421053|
|            1|           2|        14|   9|     0.6428571428571429|
|           12|           1|        38|  24|      0.631578947368421|
|           10|           3|        63|  39|     0.6190476190476191|
|            1|           0|        20|  12|                    0.6|
|            8|           3|        69|  41|     0.5942028985507246|
|            9|           2|        33|  19|     0.5757575757575758|
+-------------+------------+----------+----+-----------------------+
only showing top 10 rows

+-------

In [55]:
#We prepare the data in order to do a collect_set aggregation.
#With this function we will be able to see how many unique move values (n_moves) there are per first hand, indicating variability in the strategy.
from functools import reduce
from pyspark.sql import functions as F

df_moves_train = df_result_train.select(
    col("index"),
    col("Shown_cards").alias("Hand_-1"),
    col("Hand 0").alias("Hand_0"),
    col("Hand 1").alias("Hand_1"),
    col("Hand 2").alias("Hand_2"),
    col("Hand 3").alias("Hand_3"),
    col("Hand 4").alias("Hand_4"),
    col("Hand 5").alias("Hand_5"),
    col("Hand 6").alias("Hand_6"),
    col("Hand 7").alias("Hand_7"),
    col("Chunk Number")
)
hand_cols = ["Hand_0", "Hand_1", "Hand_2", "Hand_3", "Hand_4", "Hand_5", "Hand_6", "Hand_7"]
df_moves_train = df_moves_train.withColumn("n_moves",
    1 + reduce(
        lambda a, b: a + b,
        [
            when((col(f"Hand_{i-1}")[0].isNotNull()) & (col(f"Hand_{i-1}")[0] != col(f"Hand_{i-2}")[0]), F.lit(1)).otherwise(F.lit(0))
            for i in range(1, len(hand_cols))
        ]
    )
)

df_moves_play = df_result_play.select(
    col("index"),
    col("Shown_cards").alias("Hand_-1"),
    col("Hand 0").alias("Hand_0"),
    col("Hand 1").alias("Hand_1"),
    col("Hand 2").alias("Hand_2"),
    col("Hand 3").alias("Hand_3"),
    col("Hand 4").alias("Hand_4"),
    col("Hand 5").alias("Hand_5")
)
hand_cols = ["Hand_0", "Hand_1", "Hand_2", "Hand_3", "Hand_4", "Hand_5"]
df_moves_play = df_moves_play.withColumn("n_moves",
    1 + reduce(
        lambda a, b: a + b,
        [
            when((col(f"Hand_{i-1}")[0].isNotNull()) & (col(f"Hand_{i-1}")[0] != col(f"Hand_{i-2}")[0]), F.lit(1)).otherwise(F.lit(0))
            for i in range(1, len(hand_cols))
        ]
    )
)

df_moves_train.show()
df_moves_play.show()

+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+-------+
|index| Hand_-1|  Hand_0|  Hand_1|  Hand_2|  Hand_3|Hand_4|Hand_5|Hand_6|Hand_7|Chunk Number|n_moves|
+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+-------+
|    1| [10, 4]| [10, 7]|[10, 11]|[10, 21]|[10, 21]|  NULL|  NULL|  NULL|  NULL|           0|      1|
|    2|  [4, 7]|[13, 12]|[23, 21]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      3|
|    3|[-1, 10]|[14, 18]|[14, 29]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      2|
|    4| [8, 12]|[20, 12]|[22, 12]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      3|
|    5|  [3, 6]| [3, 15]| [3, 23]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      1|
|    6| [7, 10]|[16, 20]|[26, 20]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      3|
|    7|  [7, 5]|[13, 17]|[13, 17]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|  

In [56]:
#We prepare the data in order to do a collect_set aggregation.
#With this function we will be able to see how many unique move values (n_moves) there are per first hand, indicating variability in the strategy.
from pyspark.sql.functions import count, collect_set, round

df_fin_train = df_moves_train.select("Chunk Number", "Hand_-1", "n_moves") \
    .groupBy("Chunk Number", "Hand_-1") \
    .agg(
        count("*").alias("count"),
        round((count("*") * 100 / CHUNK_SIZE), 2).alias("Proportion"),
        collect_set("n_moves").alias("Unique_Moves")
    ) \
    .orderBy("Chunk Number", "count", ascending = False)

df_fin_train.show(truncate=False)

df_fin_play = df_moves_play.select("Hand_-1", "n_moves") \
    .groupBy("Hand_-1") \
    .agg(
        count("*").alias("count"),
        round((count("*") * 100 / CHUNK_SIZE), 2).alias("Proportion"),
        collect_set("n_moves").alias("Unique_Moves")
    ) \
    .orderBy("count", ascending = False)

df_fin_play.show(truncate = False)

+------------+--------+-----+----------+------------+
|Chunk Number|Hand_-1 |count|Proportion|Unique_Moves|
+------------+--------+-----+----------+------------+
|4           |[2, 2]  |1    |0.2       |[3]         |
|3           |[12, 8] |10   |2.0       |[1, 2, 3]   |
|3           |[8, 3]  |9    |1.8       |[2, 3, 4]   |
|3           |[8, 12] |8    |1.6       |[2, 3, 4]   |
|3           |[10, 8] |8    |1.6       |[1, 2, 3]   |
|3           |[6, 5]  |7    |1.4       |[2, 3]      |
|3           |[3, 8]  |7    |1.4       |[2, 3, 4]   |
|3           |[5, 9]  |7    |1.4       |[3, 4]      |
|3           |[6, 4]  |7    |1.4       |[2, 3]      |
|3           |[7, 11] |7    |1.4       |[2, 3, 4]   |
|3           |[8, 4]  |7    |1.4       |[2, 3]      |
|3           |[8, 5]  |7    |1.4       |[2, 3, 4]   |
|3           |[8, 6]  |7    |1.4       |[2]         |
|3           |[8, 8]  |7    |1.4       |[1]         |
|3           |[10, 6] |7    |1.4       |[1]         |
|3           |[10, 11]|7    