In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql import SparkSession
import seaborn as sns

# Build the Spark session used by every plotting cell below; getOrCreate()
# hands back an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("TGVD_GenericQuery")
    .config("spark.driver.memory", "4g")  # request 4g for the driver
    .getOrCreate()
)

# Keep the notebook output readable: only WARN and above from Spark.
spark.sparkContext.setLogLevel("WARN")

In [None]:
# PLOT QUERY 1

# Parquet dataset written by query 1. The path used to be an empty string,
# which makes spark.read.parquet fail and leaves df_risk undefined.
# Change it only if the query output was written somewhere else.
el_path_que_sea = "hdfs:///user/ec2-user/output/query1/type_strategy.parquet"
df_risk = spark.read.parquet(el_path_que_sea)

pdf = df_risk.toPandas()
# Reshape to one row per chunk and one column per risk level so pandas draws
# one group of side-by-side bars per chunk.
pivot_df = pdf.pivot(index="Chunk Number", columns="Risk Level", values="count")
pivot_df.plot(kind='bar', stacked=False)

plt.title("Types of plays classified by their riskiness.")
plt.xlabel("Chunk Number")
plt.ylabel("Cantidad")
plt.xticks(rotation=0)
plt.legend(title="Risk Level")
plt.tight_layout()
plt.show()

NameError: name 'df_risk' is not defined

In [None]:
# PLOT QUERY 2 (training set)

# Parquet dataset written by query 2 for the training phase.
# Change the path only if the query output was written somewhere else.
el_path_que_sea = "hdfs:///user/ec2-user/output/query2/strategy_winrate_train.parquet"
df_fin_train = spark.read.parquet(el_path_que_sea)

pdf_train = df_fin_train.toPandas()
# One row per chunk, one column per risk level; each cell is the play count.
pivot_train = pdf_train.pivot(index="Chunk Number", columns="Risk Level", values="count")

ax = pivot_train.plot(kind="bar", stacked=True, figsize=(10, 6))
plt.title("Distribution of risk levels per chunk in the training of the model")
plt.xlabel("Chunk Number")
plt.ylabel("Quantity")
plt.xticks(rotation=0)
plt.legend(title="Risk Level")

# pandas draws the bar for the i-th index entry at x == i (the chunk number is
# only the tick label), and with stacked=True a segment's top is the running
# sum over the risk-level columns, not the segment's own count. Missing
# combinations are drawn as zero-height segments, hence fillna(0).
segment_tops = pivot_train.fillna(0).cumsum(axis=1)

# Label every segment with its Unique_Moves value, centred inside the segment
# so it cannot be confused with the segment stacked on top of it.
for chunk, risk, count, unique_moves in zip(
    pdf_train["Chunk Number"],
    pdf_train["Risk Level"],
    pdf_train["count"],
    pdf_train["Unique_Moves"],
):
    x = pivot_train.index.get_loc(chunk)
    y = segment_tops.loc[chunk, risk] - count / 2
    ax.annotate(f"{unique_moves}", (x, y), ha='center', va='center', fontsize=8)

# Lay out after annotating so the labels are taken into account.
plt.tight_layout()
plt.show()


# Parquet dataset written by query 2 for the simulated (self-play) matches:
# "hdfs:///user/ec2-user/output/query2/strategy_winrate_play.parquet"
el_path_que_sea_2 = "hdfs:///user/ec2-user/output/query2/strategy_winrate_play.parquet"  # <--- change if the output lives elsewhere
df_fin_play = spark.read.parquet(el_path_que_sea_2)

pdf_play = df_fin_play.toPandas()
ax = pdf_play.plot(kind="bar", x="Risk Level", y="count", legend=False, figsize=(7, 5))

ax.set_title("Count of risk level in simmulated matches by the model against itself")
ax.set_xlabel("Risk Level")
ax.set_ylabel("Quantity")
plt.xticks(rotation=0)

# Write each row's Unique_Moves value 5 data units above its bar.
# NOTE(review): the row index label is used as the x position; this assumes
# pdf_play keeps the default 0..n-1 index so label and bar position coincide.
for bar_pos, play_row in pdf_play.iterrows():
    label_xy = (bar_pos, play_row["count"] + 5)
    ax.annotate(f"{play_row['Unique_Moves']}", label_xy, ha='center', fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# PLOT QUERY 3 (training set)

# Parquet dataset written by query 3 for the training phase:
# "hdfs:///user/ec2-user/output/query3/winrate_per1sthand_train_results.parquet"
el_path_que_sea = "hdfs:///user/ec2-user/output/query3/winrate_per1sthand_train_results.parquet"  # <--- change if the output lives elsewhere
df_winrate_train = spark.read.parquet(el_path_que_sea)

pdf_train_winrate = df_winrate_train.toPandas()

fig, ax = plt.subplots(figsize=(10, 6))

# One scatter series per chunk, in ascending chunk order, so every chunk gets
# its own colour and legend entry.
chunk_column = pdf_train_winrate["Chunk Number"]
for chunk_id in sorted(chunk_column.unique()):
    chunk_rows = pdf_train_winrate.loc[chunk_column == chunk_id]
    ax.scatter(
        chunk_rows["1st Hand Card"],
        chunk_rows["Winning Rate Proportion"],
        label=f"Chunk {chunk_id}",
        s=60,
        alpha=0.7,
    )

ax.set_title("The chances of winning given the first hand (Train Set)")
ax.set_xlabel("1st Hand Card")
ax.set_ylabel("Winning Rate Proportion")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


# Parquet dataset written by query 3 for the play phase:
# "hdfs:///user/ec2-user/output/query3/winrate_per1sthand_play_results.parquet"
el_path_que_sea_2 = "hdfs:///user/ec2-user/output/query3/winrate_per1sthand_play_results.parquet"  # <--- change if the output lives elsewhere
df_winrate_play = spark.read.parquet(el_path_que_sea_2)

pdf_play_winrate = df_winrate_play.toPandas()
# Highest win rate first, so the line runs from best to worst first-hand card.
pdf_play_winrate_sorted = pdf_play_winrate.sort_values(by="Winning Rate Proportion", ascending=False)

# Cards are cast to strings so the x axis is categorical and keeps the sorted
# order instead of being laid out numerically.
card_labels = pdf_play_winrate_sorted["1st Hand Card"].astype(str)
win_rates = pdf_play_winrate_sorted["Winning Rate Proportion"]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(card_labels, win_rates, marker='o')

# Labels
ax.set_title("The chances of winning given the first hand (Play Set)")
ax.set_xlabel("1st Hand Card")
ax.set_ylabel("Winning Rate Proportion")
plt.xticks(rotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# PLOT QUERY 4 (training set)

# Parquet dataset written by query 4 for the training phase:
# "hdfs:///user/ec2-user/output/query4/winrate_perdistance_from21_train.parquet"
el_path_que_sea = "hdfs:///user/ec2-user/output/query4/winrate_perdistance_from21_train.parquet"  # <--- change if the output lives elsewhere
df_winrate_train = spark.read.parquet(el_path_que_sea)

pdf_train_wr = df_winrate_train.toPandas()

# Bubble plot: position is (distance from 21, win rate), bubble area scales
# with the number of games and colour encodes the chunk number.
bubble_sizes = pdf_train_wr["TotalGames"]*10
fig, ax = plt.subplots(figsize=(14, 8))
scatter = ax.scatter(
    pdf_train_wr["Distance_from_21"],
    pdf_train_wr["Winning Rate Proportion"],
    s=bubble_sizes,
    c=pdf_train_wr["Chunk Number"],
    cmap='viridis',
    alpha=0.6,
    edgecolors='w',
)

fig.colorbar(scatter, ax=ax, label="Chunk Number")
ax.set_title("Bubble Plot - Train Set: Win Rate vs. Distance from 21")
ax.set_xlabel("Distance from 21")
ax.set_ylabel("Winning Rate Proportion")
ax.grid(True)
fig.tight_layout()
plt.show()


# Parquet dataset written by query 4 for the play phase, loaded through
# "el_path_que_sea_2":
# "hdfs:///user/ec2-user/output/query4/winrate_perdistance_from21_play.parquet"
el_path_que_sea_2 = "hdfs:///user/ec2-user/output/query4/winrate_perdistance_from21_play.parquet"  # <--- change if the output lives elsewhere
df_winrate_play = spark.read.parquet(el_path_que_sea_2)

pdf_play_wr = df_winrate_play.toPandas()
# Rows are distances from 21, columns are agent hands, cells are win rates.
pivot_play = pdf_play_wr.pivot(
    index="Distance_from_21",
    columns="Agent_hand",
    values="Winning Rate Proportion",
)

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(
    pivot_play,
    annot=True,
    fmt=".2f",
    cmap="YlGnBu",
    linewidths=.5,
    cbar_kws={'label': 'Winning Rate'},
    ax=ax,
)
ax.set_title("Heatmap victory rate (Play Set)")
ax.set_xlabel("Agent Hand")
ax.set_ylabel("Distance from 21")
fig.tight_layout()
plt.show()