In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# ---------------
#    QUERY 1
# ---------------
#
# Plays are classified by how risky they are, which helps to identify the
# tactics followed by the AI.

# Spark session shared by the cells below; the driver is given 4 GB of memory.
spark = (
    SparkSession.builder
    .appName("TGVD_GenericQuery")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Only warnings and errors from Spark itself are logged.
spark.sparkContext.setLogLevel("WARN")

# Parquet inputs (training matches and played matches, judging by the file names).
path_training = "CardsParquetData/trained_blackjack.parquet"
path_match = "CardsParquetData/played_blackjack.parquet"

df = spark.read.parquet(path_training)

# Pair every row with its position: the original row becomes the struct
# column "data" and its zero-based position the column "index".
indexed_rows = df.rdd.zipWithIndex()
df_ = indexed_rows.toDF(["data", "index"])

print(df.count())

25/05/17 12:05:05 WARN Utils: Your hostname, ASUS-DIEGO resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/05/17 12:05:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/17 12:05:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/17 12:05:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

2000


In [2]:
# We can try to see how the AI agent evolves across iterations.
# To do so the dataset is split into parts: 100000 matches were played and one
# match out of every 50 was kept, so the 2000 stored matches give 4 chunks of
# CHUNK_SIZE = 500 matches each.

CHUNK_SIZE = 500

# show() prints the table itself and returns None, so wrapping it in print()
# would only append a stray "None" line to the output.
df_.show(truncate=False)
df_.select("data").printSchema()

+-----------------------------------------------------------------------------------------------+-----+
|data                                                                                           |index|
+-----------------------------------------------------------------------------------------------+-----+
|{2025-05-12 19:24:20, [10, 4], [10, 7], [10, 11], [10, 21], [10, 21], NULL, NULL, NULL, NULL}  |0    |
|{2025-05-12 19:24:20, [4, 7], [13, 12], [23, 21], NULL, NULL, NULL, NULL, NULL, NULL}          |1    |
|{2025-05-12 19:24:20, [-1, 10], [14, 18], [14, 29], NULL, NULL, NULL, NULL, NULL, NULL}        |2    |
|{2025-05-12 19:24:20, [8, 12], [20, 12], [22, 12], NULL, NULL, NULL, NULL, NULL, NULL}         |3    |
|{2025-05-12 19:24:20, [3, 6], [3, 15], [3, 23], NULL, NULL, NULL, NULL, NULL, NULL}            |4    |
|{2025-05-12 19:24:20, [7, 10], [16, 20], [26, 20], NULL, NULL, NULL, NULL, NULL, NULL}         |5    |
|{2025-05-12 19:24:20, [7, 5], [13, 17], [13, 17], NULL, NULL, N

In [3]:
from pyspark.sql.functions import floor

# Flatten the nested "data" struct into one column per hand:
#   data.Shown_cards -> Hand_-1
#   data.`Hand i`    -> Hand_i   (i = 0..7)
# The timestamp field of the struct is intentionally left out.
hand_fields = [col(f"data.`Hand {i}`").alias(f"Hand_{i}") for i in range(8)]

df_clean = df_.select(
    col("index"),
    col("data.Shown_cards").alias("Hand_-1"),
    *hand_fields,
)

df_clean.show(5)

# Integer chunk id: rows 0..CHUNK_SIZE-1 fall in chunk 0, the next CHUNK_SIZE
# rows in chunk 1, and so on.
chunk_number = floor(col("index") / CHUNK_SIZE)
df_chunks = df_clean.withColumn("Chunk Number", chunk_number)

df_chunks.show(5)

+-----+--------+--------+--------+--------+--------+------+------+------+------+
|index| Hand_-1|  Hand_0|  Hand_1|  Hand_2|  Hand_3|Hand_4|Hand_5|Hand_6|Hand_7|
+-----+--------+--------+--------+--------+--------+------+------+------+------+
|    0| [10, 4]| [10, 7]|[10, 11]|[10, 21]|[10, 21]|  NULL|  NULL|  NULL|  NULL|
|    1|  [4, 7]|[13, 12]|[23, 21]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|
|    2|[-1, 10]|[14, 18]|[14, 29]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|
|    3| [8, 12]|[20, 12]|[22, 12]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|
|    4|  [3, 6]| [3, 15]| [3, 23]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|
+-----+--------+--------+--------+--------+--------+------+------+------+------+
only showing top 5 rows

+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+
|index| Hand_-1|  Hand_0|  Hand_1|  Hand_2|  Hand_3|Hand_4|Hand_5|Hand_6|Hand_7|Chunk Number|
+-----+--------+--------+--------+--------+--------+------

In [4]:
# Count the non-null plays (moves) of each match
from pyspark.sql.functions import expr, when, coalesce

# Element [0] of a hand is the total of the side whose result is reported
# below (presumably the AI player); element [1] is the opponent's total.
# Hand_-1 holds the cards shown at the start, so every Hand_i is compared with
# the hand right before it.
hand_cols = ["Hand_0", "Hand_1", "Hand_2", "Hand_3", "Hand_4", "Hand_5", "Hand_6", "Hand_7"]
prev_hand_cols = ["Hand_-1"] + hand_cols[:-1]

# n_moves = 1 + number of hands in which total [0] changed with respect to the
# previous hand. All eight hands are checked: iterating over
# range(1, len(hand_cols)) stopped at Hand_6 and never counted a change
# between Hand_6 and Hand_7.
# `sum` is the Python builtin, which folds the Column expressions with `+`.
df_moves = df_chunks.withColumn(
    "n_moves",
    1 + sum(
        when(
            col(current)[0].isNotNull() & (col(current)[0] != col(previous)[0]),
            1,
        ).otherwise(0)
        for previous, current in zip(prev_hand_cols, hand_cols)
    ),
)

df_moves.show(5)

# The final hand is the last non-null hand, so coalesce scans from Hand_7 down
# to Hand_0.
rev_hand_cols = hand_cols[::-1]
df_result_train = df_moves.withColumn("Final_Hand", coalesce(*[col(c) for c in rev_hand_cols]))
df_result_train.show()

# Outcome from the point of view of total [0]. Branch order matters: going
# over 21 with total [0] is a loss even if total [1] is over 21 as well.
final_hand = col("Final_Hand")
df_result_train = df_result_train.withColumn(
    "Result",
    when(final_hand.isNull(), "Unknown")
    .when(final_hand[0] > 21, "Lose")
    .when(final_hand[1] > 21, "Win")
    .when(final_hand[0] > final_hand[1], "Win")
    .when(final_hand[0] < final_hand[1], "Lose")
    .otherwise("Draw")
)
df_result_train.show()

+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+-------+
|index| Hand_-1|  Hand_0|  Hand_1|  Hand_2|  Hand_3|Hand_4|Hand_5|Hand_6|Hand_7|Chunk Number|n_moves|
+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+-------+
|    0| [10, 4]| [10, 7]|[10, 11]|[10, 21]|[10, 21]|  NULL|  NULL|  NULL|  NULL|           0|      1|
|    1|  [4, 7]|[13, 12]|[23, 21]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      3|
|    2|[-1, 10]|[14, 18]|[14, 29]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      2|
|    3| [8, 12]|[20, 12]|[22, 12]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      3|
|    4|  [3, 6]| [3, 15]| [3, 23]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      1|
+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+-------+
only showing top 5 rows

+-----+--------+--------+--------+--------+--------+-----

In [16]:
# Cut points of n_moves at the 25th, 50th and 75th percentiles
# (approximate quantiles, relative error 0.01).
probabilities = [0.25, 0.5, 0.75]
quartiles = df_result_train.select("n_moves").approxQuantile("n_moves", probabilities, 0.01)
q_low, q_mid, q_high = quartiles[0], quartiles[1], quartiles[2]

# Branches are evaluated in order, so each one only needs its upper bound: a
# row reaching the second branch is already known to be above q_low, etc.
# When two cut points coincide the label between them matches no row
# (e.g. "Tactical" if q_low == q_mid).
moves = col("n_moves")
risk_level = (
    when(moves <= q_low, "Safe")
    .when(moves <= q_mid, "Tactical")
    .when(moves <= q_high, "Risky")
    .otherwise("Suicidal")
)

df_risk = df_moves.withColumn("Risk Level", risk_level)

df_risk.show()

+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+-------+----------+
|index| Hand_-1|  Hand_0|  Hand_1|  Hand_2|  Hand_3|Hand_4|Hand_5|Hand_6|Hand_7|Chunk Number|n_moves|Risk Level|
+-----+--------+--------+--------+--------+--------+------+------+------+------+------------+-------+----------+
|    0| [10, 4]| [10, 7]|[10, 11]|[10, 21]|[10, 21]|  NULL|  NULL|  NULL|  NULL|           0|      1|      Safe|
|    1|  [4, 7]|[13, 12]|[23, 21]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      3|     Risky|
|    2|[-1, 10]|[14, 18]|[14, 29]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      2|      Safe|
|    3| [8, 12]|[20, 12]|[22, 12]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      3|     Risky|
|    4|  [3, 6]| [3, 15]| [3, 23]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|      1|      Safe|
|    5| [7, 10]|[16, 20]|[26, 20]|    NULL|    NULL|  NULL|  NULL|  NULL|  NULL|           0|   

In [24]:
# Number of matches per (chunk, risk level), listed chunk by chunk from the
# least to the most frequent level.
# NOTE: df_risk is rebound to the aggregated table here, so re-running this
# cell without first re-running the cell that builds df_risk would aggregate
# the already aggregated counts.
group_keys = ["Chunk Number", "Risk Level"]
risk_counts = df_risk.groupBy(*group_keys).count()
df_risk = risk_counts.orderBy("Chunk Number", "count")
df_risk.show(truncate=False)

+------------+----------+-----+
|Chunk Number|Risk Level|count|
+------------+----------+-----+
|0           |Suicidal  |16   |
|0           |Risky     |90   |
|0           |Safe      |394  |
|1           |Suicidal  |41   |
|1           |Risky     |121  |
|1           |Safe      |338  |
|2           |Suicidal  |30   |
|2           |Risky     |166  |
|2           |Safe      |304  |
|3           |Suicidal  |34   |
|3           |Risky     |178  |
|3           |Safe      |288  |
+------------+----------+-----+

