In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import monotonically_increasing_id

# Query: how far the agent's final hand is from 21 at the end of each match.
# This helps estimate the winning chance of a given hand total.

# Local imports: only this cell needs them for the dense row index below.
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# Create (or reuse) the Spark session used by the whole notebook.
spark = (
    SparkSession.builder
    .appName("TGVD_GenericQuery")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")

path_training = "CardsParquetData/trained_blackjack.parquet"
path_match = "CardsParquetData/played_blackjack.parquet"


def _with_row_index(df):
    """Return `df` with a dense, 1-based "index" column (1, 2, 3, ...).

    monotonically_increasing_id() alone is unique and increasing but NOT
    consecutive: the partition id is stored in the upper bits, so rows of the
    second partition start in the billions. Chunking by index / CHUNK_SIZE
    needs consecutive values, so the id is used only as the ordering key for
    row_number().

    Note: a window without partitionBy moves all rows to a single partition.
    That is acceptable for these small match logs; revisit if they grow.
    """
    read_order = Window.orderBy(monotonically_increasing_id())
    # Cast keeps the column type (long) that the previous implementation had.
    return df.withColumn("index", row_number().over(read_order).cast("long"))


df_train = _with_row_index(spark.read.parquet(path_training))
df_play = _with_row_index(spark.read.parquet(path_match))

df_train.show(5)
df_play.show(5)

+-------------------+-----------+--------+--------+--------+--------+------+------+------+------+-----+
|          Timestamp|Shown_cards|  Hand 0|  Hand 1|  Hand 2|  Hand 3|Hand 4|Hand 5|Hand 6|Hand 7|index|
+-------------------+-----------+--------+--------+--------+--------+------+------+------+------+-----+
|2025-05-12 19:24:20|    [10, 4]| [10, 7]|[10, 11]|[10, 21]|[10, 21]|  NULL|  NULL|  NULL|  NULL|    1|
|2025-05-12 19:24:20|     [4, 7]|[13, 12]|[23, 21]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|    2|
|2025-05-12 19:24:20|   [-1, 10]|[14, 18]|[14, 29]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|    3|
|2025-05-12 19:24:20|    [8, 12]|[20, 12]|[22, 12]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|    4|
|2025-05-12 19:24:20|     [3, 6]| [3, 15]| [3, 23]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|    5|
+-------------------+-----------+--------+--------+--------+--------+------+------+------+------+-----+
only showing top 5 rows

+-------------------+-----------+------

In [8]:
from pyspark.sql.functions import floor, coalesce, when

# Chunking is done only for the training data, to analyze how the behaviour
# changes through the iterations of the training.
CHUNK_SIZE = 500


def _with_result(df, hand_cols_latest_first):
    """Add "Final_Hand" and "Result" columns to `df`.

    hand_cols_latest_first: hand column names ordered from the last possible
    hand to the first; the first non-null one is taken as the final hand.

    Final_Hand[0] is the agent's total; Final_Hand[1] is the opposing total
    (presumably the dealer's -- confirm against the data producer).
    The rules are evaluated in order, so an agent bust is a loss even when
    the opposing total is also above 21.
    """
    final_hand = col("Final_Hand")
    agent_total = final_hand[0]
    opponent_total = final_hand[1]
    result = (
        when(final_hand.isNull(), "Unknown")
        .when(agent_total > 21, "Lose")
        .when(opponent_total > 21, "Win")
        .when(agent_total > opponent_total, "Win")
        .when(agent_total < opponent_total, "Lose")
        .otherwise("Draw")
    )
    return (
        df.withColumn("Final_Hand", coalesce(*[col(c) for c in hand_cols_latest_first]))
        .withColumn("Result", result)
    )


# "index" is 1-based, so subtract 1 before dividing; otherwise chunk 0 would
# hold only CHUNK_SIZE - 1 rows and every later chunk boundary would be shifted.
df_chunks_train = df_train.withColumn("Chunk Number", floor((col("index") - 1) / CHUNK_SIZE))

rev_hand_cols = ["Hand 7", "Hand 6", "Hand 5", "Hand 4", "Hand 3", "Hand 2", "Hand 1", "Hand 0"]
df_result_train = _with_result(df_chunks_train, rev_hand_cols).select("Result", "Final_Hand", "Chunk Number")
df_result_train.show(10)

# The played-matches data is coalesced over "Hand 5" .. "Hand 0" only.
rev_hand_cols = ["Hand 5", "Hand 4", "Hand 3", "Hand 2", "Hand 1", "Hand 0"]
df_result_play = _with_result(df_play, rev_hand_cols).select("Result", "Final_Hand")
df_result_play.show(10)

+------+----------+------------+
|Result|Final_Hand|Chunk Number|
+------+----------+------------+
|  Lose|  [10, 21]|           0|
|  Lose|  [23, 21]|           0|
|   Win|  [14, 29]|           0|
|  Lose|  [22, 12]|           0|
|   Win|   [3, 23]|           0|
|  Lose|  [26, 20]|           0|
|  Lose|  [13, 17]|           0|
|  Lose|  [23, 28]|           0|
|   Win|  [10, 27]|           0|
|  Lose|   [3, 19]|           0|
+------+----------+------------+
only showing top 10 rows

+------+----------+
|Result|Final_Hand|
+------+----------+
|  Lose|  [19, 21]|
|   Win|  [15, 26]|
|   Win|  [12, 26]|
|   Win|  [21, 20]|
|   Win|  [10, 22]|
|   Win|  [10, 30]|
|   Win|  [19, 14]|
|  Lose|  [19, 21]|
|   Win|  [15, 24]|
|   Win|  [12, 28]|
+------+----------+
only showing top 10 rows



In [9]:
# Maybe we could plot a histogram of the distance to 21 per result.
from pyspark.sql.functions import sum

def _count_by_distance(results, *group_keys):
    """Count matches per `group_keys` for hands that did not exceed 21.

    Adds "Agent_hand" (Final_Hand[0]) and "Distance_from_21" (Agent_hand - 21,
    so values are <= 0 after the filter), then groups and counts.
    """
    agent_total = col("Final_Hand")[0]
    return (
        results
        .withColumn("Agent_hand", agent_total)
        .withColumn("Distance_from_21", agent_total - 21)
        .filter(col("Distance_from_21") <= 0)
        .groupBy(*group_keys)
        .count()
    )


# NOTE: this rebinds df_train / df_play from the raw match frames to the
# aggregated counts; the following cell reads them under these names.
df_train = _count_by_distance(
    df_result_train, "Result", "Agent_hand", "Chunk Number", "Distance_from_21"
)
df_train.show(10)

df_play = _count_by_distance(
    df_result_play, "Result", "Agent_hand", "Distance_from_21"
)
df_play.show(10)

+------+----------+------------+----------------+-----+
|Result|Agent_hand|Chunk Number|Distance_from_21|count|
+------+----------+------------+----------------+-----+
|  Lose|         5|           3|             -16|    2|
|  Draw|         8|           0|             -13|    3|
|  Lose|        19|           3|              -2|   11|
|  Lose|        15|           2|              -6|   12|
|  Lose|         7|           1|             -14|    3|
|   Win|         1|           3|             -20|    1|
|   Win|        15|           0|              -6|    8|
|  Lose|        17|           0|              -4|    5|
|  Lose|        17|           3|              -4|   20|
|   Win|        17|           2|              -4|   17|
+------+----------+------------+----------------+-----+
only showing top 10 rows

+------+----------+----------------+-----+
|Result|Agent_hand|Distance_from_21|count|
+------+----------+----------------+-----+
|  Draw|        15|              -6|    2|
|  Draw|        18

In [10]:
# Local import keeps this cell self-contained and avoids relying on a bare
# `sum` name that shadows Python's builtin.
from pyspark.sql import functions as F


def _win_rate(counts_df, keys):
    """Return games, wins and win proportion per `keys`, best rate first.

    counts_df must have "Result" and "count" columns plus every column in
    `keys`. Both totals are computed in a single aggregation: "Wins" sums
    "count" only for rows whose Result is "Win" (0 otherwise), so groups
    without any win get Wins = 0 without needing a join and fillna.
    """
    won_games = F.when(F.col("Result") == "Win", F.col("count")).otherwise(0)
    return (
        counts_df
        .groupBy(*keys)
        .agg(
            F.sum("count").alias("TotalGames"),
            F.sum(won_games).alias("Wins"),
        )
        .withColumn("Winning Rate Proportion", F.col("Wins") / F.col("TotalGames"))
        .orderBy("Winning Rate Proportion", ascending=False)
    )


df_winrate_train = _win_rate(df_train, ["Agent_hand", "Chunk Number", "Distance_from_21"])
df_winrate_train.show(10)

df_winrate_play = _win_rate(df_play, ["Agent_hand", "Distance_from_21"])
df_winrate_play.show(1000)

+----------+------------+----------------+----------+----+-----------------------+
|Agent_hand|Chunk Number|Distance_from_21|TotalGames|Wins|Winning Rate Proportion|
+----------+------------+----------------+----------+----+-----------------------+
|         1|           3|             -20|         1|   1|                    1.0|
|        14|           4|              -7|         1|   1|                    1.0|
|         7|           3|             -14|         1|   1|                    1.0|
|         3|           2|             -18|         1|   1|                    1.0|
|         6|           3|             -15|         1|   1|                    1.0|
|        21|           3|               0|        44|  41|     0.9318181818181818|
|        21|           1|               0|        27|  25|     0.9259259259259259|
|        21|           0|               0|        20|  18|                    0.9|
|         5|           1|             -16|         6|   5|     0.8333333333333334|
|   