In [None]:
import os
from urllib.parse import quote_plus

import pandas as pd
import pymysql # Para ver la versión
import sqlalchemy # Para ver la versión
import torch
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy import text
from transformers import TapasTokenizer, TapasForQuestionAnswering

In [None]:
# Read the DB_* settings from the local .env file into the process environment.
load_dotenv()

host = os.getenv("DB_HOST")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
database = os.getenv("DB_NAME")

# Fail fast when a setting is absent: os.getenv returns None for unset
# variables, and interpolating None into the URL would silently produce the
# literal text "None" as user, password, host or database.
missing = [
    name
    for name, value in {
        "DB_HOST": host,
        "DB_USER": user,
        "DB_PASSWORD": password,
        "DB_NAME": database,
    }.items()
    if value is None
]
if missing:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")

# User and password are URL-escaped so characters such as '@', '/' or ':' do
# not corrupt the connection URL. The .env file must therefore hold the raw,
# unencoded values.
engine = create_engine(
    f"postgresql+psycopg2://{quote_plus(user)}:{quote_plus(password)}@{host}/{database}"
)

# Smoke test: open a connection and run a trivial query. The with-block
# guarantees the connection is closed even if the check raises.
with engine.connect() as connection:
    connection.execute(text("SELECT 1"))

In [14]:
# One row per game, LEFT JOINed with three per-game aggregates: the sorted,
# de-duplicated arrays of its store, genre and platform ids. Games without
# any link in a given table get an empty int array instead of NULL.
sql = """SELECT
  g.*,
  COALESCE(s.store_ids,    ARRAY[]::int[])   AS store_ids,
  COALESCE(ge.genre_ids,   ARRAY[]::int[])   AS genre_ids,
  COALESCE(p.platform_ids, ARRAY[]::int[])   AS platform_ids
FROM public.games g
LEFT JOIN (
  SELECT gs.game_id,
         ARRAY_AGG(DISTINCT gs.store_id ORDER BY gs.store_id) AS store_ids
  FROM public.game_stores gs
  GROUP BY gs.game_id
) s  ON s.game_id = g.game_id
LEFT JOIN (
  SELECT gg.game_id,
         ARRAY_AGG(DISTINCT gg.genre_id ORDER BY gg.genre_id) AS genre_ids
  FROM public.game_genres gg
  GROUP BY gg.game_id
) ge ON ge.game_id = g.game_id
LEFT JOIN (
  SELECT gp.game_id,
         ARRAY_AGG(DISTINCT gp.platform_id ORDER BY gp.platform_id) AS platform_ids
  FROM public.game_platforms gp
  GROUP BY gp.game_id
) p  ON p.game_id = g.game_id
ORDER BY g.game_id;
"""
df_games = pd.read_sql_query(sql, con=engine)

# Keep only games that already carry a score signal, i.e. a metacritic or a
# rating strictly greater than zero (missing values count as zero).
has_metacritic = df_games['metacritic'].fillna(0).gt(0)
has_rating = df_games['rating'].fillna(0).gt(0)
df_games_signal = df_games.loc[has_metacritic | has_rating]
df = df_games_signal
df.shape


(1578, 23)

In [15]:
# Narrow the frame to the identifier, name, score and id-array columns.
keep_columns = [
    'game_id',
    'name',
    'rating',
    'added',
    'metacritic',
    'genre_ids',
    'store_ids',
    'platform_ids',
    'esrb_rating_id',
]
df = df.loc[:, keep_columns]

In [16]:
# Fill the gaps on a fresh frame so the filtered source is never written to:
#   * esrb_rating_id -> 6 (presumably a catch-all ESRB id; TODO confirm)
#   * metacritic     -> column mean, then rounded to two decimals
metacritic_mean = df['metacritic'].mean()
df = df.assign(
    esrb_rating_id=df['esrb_rating_id'].fillna(6),
    metacritic=df['metacritic'].fillna(metacritic_mean).round(2),
)

In [17]:
# Preview the first three rows of the cleaned frame.
df.iloc[:3]

Unnamed: 0,game_id,name,rating,added,metacritic,genre_ids,store_ids,platform_ids,esrb_rating_id
0,1,D/Generation HD,1.86,128,78.6,"[3, 7]","[1, 2, 3, 6]","[1, 4, 5, 7, 18]",2.0
1,25,Middle-earth: Shadow of War,3.86,8701,82.0,"[4, 5]","[1, 2, 3, 4, 5, 8]","[1, 4, 18, 21]",4.0
2,28,Red Dead Redemption 2,4.59,16547,96.0,[4],"[1, 2, 3, 11]","[1, 4, 18]",4.0


In [18]:
# Show the column names that survived the selection step.
list(df.columns)


['game_id',
 'name',
 'rating',
 'added',
 'metacritic',
 'genre_ids',
 'store_ids',
 'platform_ids',
 'esrb_rating_id']

In [19]:
# Load the tokenizer and the question-answering model from the same
# checkpoint so their vocabulary and configuration match.
model_name = "google/tapas-large-finetuned-wtq"
tokenizer = TapasTokenizer.from_pretrained(model_name)
model = TapasForQuestionAnswering.from_pretrained(model_name)

In [44]:
# The columns have to be sanitised before tokenising; the original author
# notes that the tokenizer blows up otherwise.

# Only simple scalar columns are kept; the array-valued *_ids columns are
# left out (the author's note links them to worse model results).
cols = ["name", "metacritic", "rating"]

# Cap the table at 200 rows: per the original note, larger tables give worse
# answers or can crash the model.
df_copy = df[cols].head(200).reset_index(drop=True).copy()

# Blank out missing values BEFORE casting to str. Casting first would turn
# NaN/None into the literal strings "nan"/"None", which a later fillna can no
# longer detect. Going through object dtype avoids any numeric upcasting.
table = df_copy.astype(object).where(df_copy.notna(), "").astype(str)

# Safety check in case the filters upstream left nothing to query.
assert table.shape[0] > 0 and table.shape[1] > 0, f"Tabla vacia: shape={table.shape}"

# Column labels must be strings as well.
table.columns = [str(c) for c in table.columns]

In [None]:
# Cell-selection question: the answer is read straight from the chosen cells.
queries = ["Which game has the highest metacritic?"]
inputs = tokenizer(table=table, queries=queries, return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

# With only the cell logits passed in, the result is a 1-tuple whose single
# item holds, per query, the predicted (row, column) cell coordinates.
pred_coords = tokenizer.convert_logits_to_predictions(inputs, outputs.logits)[0]

coords = pred_coords[0]  # (row, column) pairs for the only query
if not coords:
    print("No se ha encontrado respuesta")
else:
    answers = [table.iat[row_idx, col_idx] for row_idx, col_idx in coords]
    print("Respuesta, el juego con mejor calificacion metacritic es: ", ", ".join(answers))



  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


Respuesta, el juego con mejor calificacion metacritic es:  Red Dead Redemption 2


In [None]:
# Aggregation question: TAPAS picks the cells and an aggregation operator;
# the number itself is computed here from the numeric frame.
queries = ["What is the average rating?"]
inputs = tokenizer(table=table, queries=queries, return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Passing the aggregation logits too yields, per query, both the selected
# (row, column) cells and the index of the predicted aggregation operator.
pred_coords, pred_aggs = tokenizer.convert_logits_to_predictions(
    inputs, outputs.logits, outputs.logits_aggregation
)
aggregation_map = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
predicted_aggregator = aggregation_map.get(pred_aggs[0], "UNKNOWN")
print("aggregator:", predicted_aggregator)

# The mean below is computed regardless of the predicted operator, so make a
# mismatch visible instead of presenting the figure as the model's own answer.
if predicted_aggregator != "AVERAGE":
    print("Aviso: TAPAS no ha predicho AVERAGE; la media se calcula igualmente sobre las filas seleccionadas")

# Unique rows only: if several cells of the same row are selected, that game
# must still count once in the mean.
rows = sorted({r for (r, c) in pred_coords[0]})

if rows:
    avg_rating = pd.to_numeric(df_copy.loc[rows, "rating"], errors="coerce").mean()
    if pd.isna(avg_rating):
        # Every selected rating was missing or non-numeric.
        print("No se ha encontrado respuesta")
    else:
        print("Respuesta, average rating es: ", round(avg_rating, 2))
else:
    print("No se ha encontrado respuesta")


aggregator: AVERAGE
Respuesta average rating es:  3.78


In [57]:
# Counting question: the answer is the number of distinct rows TAPAS selects.
queries = ["How many games have a metacritic greater than 90?"]
inputs = tokenizer(table=table, queries=queries, return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

# The aggregation logits are passed as well so the predicted operator can be
# reported next to the selected (row, column) cells.
pred_coords, pred_aggs = tokenizer.convert_logits_to_predictions(
    inputs,
    outputs.logits,
    outputs.logits_aggregation,
)
aggregation_map = dict(enumerate(["NONE", "SUM", "AVERAGE", "COUNT"]))
print("aggregator:", aggregation_map.get(pred_aggs[0], "UNKNOWN"))

# Collapse the selected cells to their row indices; a set keeps each row once.
rows = set(row_idx for row_idx, _ in pred_coords[0])
print("Respuesta, juegos con metacritic mayor que 90 son: ", len(rows))


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


aggregator: COUNT
Respuesta, juegos con metacritic mayor que 90 son:  3
