In [64]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

#The five types of plays are classified by their riskiness.
#It helps to identify the tactics of the AI

spark = SparkSession.builder \
    .appName("TGVD_GenericQuery") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

path_training = "CardsParquetData/trained_blackjack.parquet"
path_match = "CardsParquetData/played_blackjack.parquet"

df = spark.read.parquet(path_training)
df_ = df.rdd.zipWithIndex().toDF(["data", "index"])

print(df.count())

2000


In [65]:
#We can try to see how the AI agent evolves along iterations
#For that we will divide the dataset in parts (as we did 100000 matches and
#we only took a match for each 50 matches we will divide it into 5 parts)

CHUNK_SIZE = 440
print(df_.show(truncate =  False))
df_.select("data").printSchema()

+-----------------------------------------------------------------------------------------------+-----+
|data                                                                                           |index|
+-----------------------------------------------------------------------------------------------+-----+
|{2025-05-12 19:24:20, [10, 4], [10, 7], [10, 11], [10, 21], [10, 21], NULL, NULL, NULL, NULL}  |0    |
|{2025-05-12 19:24:20, [4, 7], [13, 12], [23, 21], NULL, NULL, NULL, NULL, NULL, NULL}          |1    |
|{2025-05-12 19:24:20, [-1, 10], [14, 18], [14, 29], NULL, NULL, NULL, NULL, NULL, NULL}        |2    |
|{2025-05-12 19:24:20, [8, 12], [20, 12], [22, 12], NULL, NULL, NULL, NULL, NULL, NULL}         |3    |
|{2025-05-12 19:24:20, [3, 6], [3, 15], [3, 23], NULL, NULL, NULL, NULL, NULL, NULL}            |4    |
|{2025-05-12 19:24:20, [7, 10], [16, 20], [26, 20], NULL, NULL, NULL, NULL, NULL, NULL}         |5    |
|{2025-05-12 19:24:20, [7, 5], [13, 17], [13, 17], NULL, NULL, N

In [66]:
from pyspark.sql.functions import floor

df_clean = df_.select(
    col("index"),
    #col("data.Timestamp").alias("Timestamp"),
    col("data.Shown_cards").alias("Hand_-1"),
    col("data.`Hand 0`").alias("Hand_0"),
    col("data.`Hand 1`").alias("Hand_1"),
    col("data.`Hand 2`").alias("Hand_2"),
    col("data.`Hand 3`").alias("Hand_3"),
    col("data.`Hand 4`").alias("Hand_4"),
    col("data.`Hand 5`").alias("Hand_5"),
    col("data.`Hand 6`").alias("Hand_6"),
    col("data.`Hand 7`").alias("Hand_7")
)

df_clean.show(5)

df_chunks = df_clean.withColumn("Chunk Number", floor(col("index")/CHUNK_SIZE))

df_chunks.show(5)

+-----+--------+--------+--------+--------+--------+------+------+------+------+
|index| Hand_-1|  Hand_0|  Hand_1|  Hand_2|  Hand_3|Hand_4|Hand_5|Hand_6|Hand_7|
+-----+--------+--------+--------+--------+--------+------+------+------+------+
|    0| [10, 4]| [10, 7]|[10, 11]|[10, 21]|[10, 21]|  NULL|  NULL|  NULL|  NULL|
|    1|  [4, 7]|[13, 12]|[23, 21]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|
|    2|[-1, 10]|[14, 18]|[14, 29]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|
|    3| [8, 12]|[20, 12]|[22, 12]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|
|    4|  [3, 6]| [3, 15]| [3, 23]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|
+-----+--------+--------+--------+--------+--------+------+------+------+------+
only showing top 5 rows

+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+
|index| Hand_-1|  Hand_0|  Hand_1|  Hand_2|  Hand_3|Hand_4|Hand_5|Hand_6|Hand_7|Chunk Number|
+-----+--------+--------+--------+--------+--------+------

In [67]:
# Contar jugadas no nulas
from pyspark.sql.functions import expr, when

hand_cols = ["Hand_0", "Hand_1", "Hand_2", "Hand_3", "Hand_4", "Hand_5", "Hand_6", "Hand_7"]
df_moves = df_chunks.withColumn("n_moves",
    1 + sum([
        when((col(f"Hand_{i-1}")[0].isNotNull()) & (col(f"Hand_{i-1}")[0] != col(f"Hand_{i-2}")[0]), 1).otherwise(0)
        for i in range(1, 8)
    ])
)

df_moves.show(5)

+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+-------+
|index| Hand_-1|  Hand_0|  Hand_1|  Hand_2|  Hand_3|Hand_4|Hand_5|Hand_6|Hand_7|Chunk Number|n_moves|
+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+-------+
|    0| [10, 4]| [10, 7]|[10, 11]|[10, 21]|[10, 21]|  NULL|  NULL|  NULL|  NULL|           0|      1|
|    1|  [4, 7]|[13, 12]|[23, 21]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      3|
|    2|[-1, 10]|[14, 18]|[14, 29]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      2|
|    3| [8, 12]|[20, 12]|[22, 12]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      3|
|    4|  [3, 6]| [3, 15]| [3, 23]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      1|
+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+-------+
only showing top 5 rows



In [68]:
df_risk = df_moves.withColumn(
    "Risk Level",
    when(col("n_moves") <= 1, "Safe")
    .when(col("n_moves") == 2, "Tactical")
    .when(col("n_moves") == 3, "Aggressive")
    .when(col("n_moves") == 4, "Risky")
    .otherwise("Are you crazy?")
)

#hacerlo por quartiles con el numero de moves para ganar

df_risk.show(5)

+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+-------+----------+
|index| Hand_-1|  Hand_0|  Hand_1|  Hand_2|  Hand_3|Hand_4|Hand_5|Hand_6|Hand_7|Chunk Number|n_moves|Risk Level|
+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+-------+----------+
|    0| [10, 4]| [10, 7]|[10, 11]|[10, 21]|[10, 21]|  NULL|  NULL|  NULL|  NULL|           0|      1|      Safe|
|    1|  [4, 7]|[13, 12]|[23, 21]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      3|Aggressive|
|    2|[-1, 10]|[14, 18]|[14, 29]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      2|  Tactical|
|    3| [8, 12]|[20, 12]|[22, 12]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      3|Aggressive|
|    4|  [3, 6]| [3, 15]| [3, 23]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      1|      Safe|
+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+---

In [78]:
#The types of strategies, (safe, tactical, risky…) and it’s proportion
#It helps to identify which strategy is the common one.

df_fin = df_risk.select(col("Chunk Number"), col("n_moves"), col("Risk Level")).groupBy("Chunk Number", "Risk Level").count().orderBy("Chunk Number", "count")
print(df_fin.show(30))

df_query2 = df_fin.withColumn("Proportion", round(col("count")*100/CHUNK_SIZE, 2))
df_query2.show(30)

+------------+--------------+-----+
|Chunk Number|    Risk Level|count|
+------------+--------------+-----+
|           0|Are you crazy?|    3|
|           0|         Risky|   10|
|           0|    Aggressive|   79|
|           0|      Tactical|  170|
|           0|          Safe|  178|
|           1|Are you crazy?|    4|
|           1|         Risky|   27|
|           1|    Aggressive|  101|
|           1|          Safe|  126|
|           1|      Tactical|  182|
|           2|Are you crazy?|    2|
|           2|         Risky|   30|
|           2|          Safe|   90|
|           2|    Aggressive|  134|
|           2|      Tactical|  184|
|           3|Are you crazy?|    5|
|           3|         Risky|   28|
|           3|          Safe|   59|
|           3|    Aggressive|  165|
|           3|      Tactical|  183|
|           4|Are you crazy?|    3|
|           4|         Risky|    9|
|           4|          Safe|   45|
|           4|    Aggressive|   76|
|           4|      Tactical