In [1]:
'''
We will try to evaluate the game decisions of the model by using FPGrowth, BinaryClassificationEvaluator and Cross Validation.
With its results, we will take a look at the frequent decisions made by the model in each case and evaluate if those decisions are being done correctly or if the model is taking unnecessary risks.

We will also use the Random Forest implementation from Spark MLlib in order to make game decisions.
Once we have this model, we will compare its results to the ones from the deep learning model to check which one gives better results based on its resources.
'''

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import monotonically_increasing_id

spark = SparkSession.builder \
    .appName("TGVD_GenericQuery") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

path_match = "CardsParquetData/played_blackjack.parquet"

df_play = spark.read.parquet(path_match)
df_play = df_play.withColumn("index", (monotonically_increasing_id() + 1))

df_play.show(5)

25/05/17 18:00:28 WARN Utils: Your hostname, ASUS-DIEGO resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/05/17 18:00:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/17 18:00:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/17 18:00:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/17 18:00:32 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/05/17 18:00:32 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/05/17 18:00:32 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
                                                                                

+-------------------+-----------+--------+--------+--------+--------+--------+------+-----+
|          Timestamp|Shown_cards|  Hand 0|  Hand 1|  Hand 2|  Hand 3|  Hand 4|Hand 5|index|
+-------------------+-----------+--------+--------+--------+--------+--------+------+-----+
|2025-05-12 19:25:11|    [10, 1]| [14, 3]|[19, 15]|[19, 21]|[19, 21]|    NULL|  NULL|    1|
|2025-05-12 19:25:11|     [4, 5]| [8, 13]|[15, 15]|[15, 26]|    NULL|    NULL|  NULL|    2|
|2025-05-12 19:25:11|    [12, 3]|[12, 14]|[12, 19]|[12, 15]|[12, 18]|[12, 26]|  NULL|    3|
|2025-05-12 19:25:11|   [11, 12]|[21, 20]|[21, 20]|    NULL|    NULL|    NULL|  NULL|    4|
|2025-05-12 19:25:11|    [10, 3]|[10, 12]|[10, 14]|[10, 17]|[10, 22]|    NULL|  NULL|    5|
+-------------------+-----------+--------+--------+--------+--------+--------+------+-----+
only showing top 5 rows



In [2]:
df_sep = df_play.withColumn("Agents_1st", col("Shown_cards")[0]).withColumn("Dealers_1st", col("Shown_cards")[1]).select("Shown_cards", "Hand 0", "Hand 1", "Hand 2", "Hand 3", "Hand 4", "Hand 5", "Agents_1st", "Dealers_1st", "index")
df_sep.show(10)

+-----------+--------+--------+--------+--------+--------+------+----------+-----------+-----+
|Shown_cards|  Hand 0|  Hand 1|  Hand 2|  Hand 3|  Hand 4|Hand 5|Agents_1st|Dealers_1st|index|
+-----------+--------+--------+--------+--------+--------+------+----------+-----------+-----+
|    [10, 1]| [14, 3]|[19, 15]|[19, 21]|[19, 21]|    NULL|  NULL|        10|          1|    1|
|     [4, 5]| [8, 13]|[15, 15]|[15, 26]|    NULL|    NULL|  NULL|         4|          5|    2|
|    [12, 3]|[12, 14]|[12, 19]|[12, 15]|[12, 18]|[12, 26]|  NULL|        12|          3|    3|
|   [11, 12]|[21, 20]|[21, 20]|    NULL|    NULL|    NULL|  NULL|        11|         12|    4|
|    [10, 3]|[10, 12]|[10, 14]|[10, 17]|[10, 22]|    NULL|  NULL|        10|          3|    5|
|   [10, 10]|[10, 18]|[10, 30]|    NULL|    NULL|    NULL|  NULL|        10|         10|    6|
|     [8, 4]|[19, 14]|[19, 14]|    NULL|    NULL|    NULL|  NULL|         8|          4|    7|
|    [11, 9]|[19, 21]|[19, 21]|    NULL|    NULL| 

In [7]:
from pyspark.sql import Row

hand_cols = ["Shown_cards", "Hand 0", "Hand 1", "Hand 2", "Hand 3", "Hand 4", "Hand 5"]

def extract_hand_sequence(row):
    result = []
    dealer_card = row["Dealers_1st"]

    hands = []
    for col in hand_cols:
        hand = row[col]
        if hand is not None:
            hands.append(hand)
        else:
            break

    for i in range(len(hands)):
        player_total = hands[i][0]

        # Look ahead to see if the next total is different
        if i + 1 < len(hands):
            next_total = hands[i + 1][0]
            action = "hit" if next_total != player_total else "stand"
        else:
            # No more hands -> last move is stand
            action = "stand"

        # Si el siguiente movimiento no cambia, asumimos que ya se plantó
        result.append(Row(items=[player_total, dealer_card, action]))

        if action == "stand":
            break

    return result

df_rows = df_sep.rdd.flatMap(extract_hand_sequence).toDF()
df_rows.show(truncate=False)

from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType

# Convertimos los items a strings
@udf(ArrayType(StringType()))
def stringify_items(items):
    return [f"player_{items[0]}", f"dealer_{items[1]}", f"{items[2]}"]

df_fpgrowth = df_rows.withColumn("features", stringify_items(col("items")))
df_fpgrowth.select("features").show(truncate=False)

+---------------+
|items          |
+---------------+
|[10, 1, hit]   |
|[14, 1, hit]   |
|[19, 1, stand] |
|[4, 5, hit]    |
|[8, 5, hit]    |
|[15, 5, stand] |
|[12, 3, stand] |
|[11, 12, hit]  |
|[21, 12, stand]|
|[10, 3, stand] |
|[10, 10, stand]|
|[8, 4, hit]    |
|[19, 4, stand] |
|[11, 9, hit]   |
|[19, 9, stand] |
|[7, 3, hit]    |
|[15, 3, stand] |
|[8, 7, stand]  |
|[6, 10, hit]   |
|[12, 10, stand]|
+---------------+
only showing top 20 rows

+-----------------------------+
|features                     |
+-----------------------------+
|[player_10, dealer_1, hit]   |
|[player_14, dealer_1, hit]   |
|[player_19, dealer_1, stand] |
|[player_4, dealer_5, hit]    |
|[player_8, dealer_5, hit]    |
|[player_15, dealer_5, stand] |
|[player_12, dealer_3, stand] |
|[player_11, dealer_12, hit]  |
|[player_21, dealer_12, stand]|
|[player_10, dealer_3, stand] |
|[player_10, dealer_10, stand]|
|[player_8, dealer_4, hit]    |
|[player_19, dealer_4, stand] |
|[player_11, dealer_9, hit]   

In [19]:
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import size

fp = FPGrowth(itemsCol="features", minSupport=0.02, minConfidence=0.6)
model = fp.fit(df_fpgrowth)

print("Frequent Itemsets:")
model.freqItemsets.filter(size(col("items")) >= 2).show(30)

print("Association Rules:")
model.associationRules.orderBy("confidence", ascending = False).show()
#Interpretation when the player has 20, in 98% of cases the model stands.
#Interpretation when the dealer has 1, in 66% of cases the model hits.

Frequent Itemsets:
+------------------+----+
|             items|freq|
+------------------+----+
|[player_19, stand]|  89|
|[player_21, stand]|  67|
| [dealer_6, stand]|  78|
|   [dealer_6, hit]|  94|
|   [dealer_1, hit]|  82|
|   [player_5, hit]|  84|
|[player_10, stand]|  74|
|  [player_10, hit]|  85|
|[player_17, stand]|  56|
|  [player_17, hit]|  50|
| [dealer_8, stand]| 130|
|   [dealer_8, hit]| 159|
|[dealer_12, stand]|  80|
|  [dealer_12, hit]| 123|
|[dealer_11, stand]|  75|
|  [dealer_11, hit]|  73|
| [dealer_4, stand]|  92|
|   [dealer_4, hit]| 111|
| [dealer_2, stand]|  67|
|   [dealer_2, hit]|  85|
|[player_20, stand]|  68|
|[player_15, stand]|  63|
| [dealer_9, stand]|  59|
|   [dealer_9, hit]| 115|
| [dealer_3, stand]|  72|
|   [dealer_3, hit]|  62|
|   [player_6, hit]|  90|
|   [player_8, hit]| 126|
|   [player_4, hit]|  77|
|   [player_9, hit]|  86|
+------------------+----+
only showing top 30 rows

Association Rules:
+-----------+----------+------------------+---------

In [20]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Extraer columnas
df_ml = df_rows.withColumn("player_total", col("items")[0].cast("int")) \
               .withColumn("dealer_card", col("items")[1].cast("int")) \
               .withColumn("label_str", col("items")[2])

# Codificar acción como 0/1
indexer = StringIndexer(inputCol="label_str", outputCol="label")
df_ml_indexed = indexer.fit(df_ml).transform(df_ml)

# Vector de características
assembler = VectorAssembler(inputCols=["player_total", "dealer_card"], outputCol="features")
df_final = assembler.transform(df_ml_indexed).select("features", "label")


In [None]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50)
evaluator = BinaryClassificationEvaluator()
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth, [3, 5, 7]).build()

cv = CrossValidator(estimator=rf,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)

cv_model = cv.fit(df_final)
predictions = cv_model.transform(df_final)

auc = evaluator.evaluate(predictions)
print(f"Random Forest AUC: {auc}")

Random Forest AUC: 0.9666481334392376
