# Análisis de los vectores de banners

In [None]:
# Módulos

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Project root is taken as two levels above the current working directory;
# the input files live in its 'data' folder.
base_dir = Path().resolve().parents[1]
data_dir = base_dir / 'data'

ruta = data_dir / 'info_imagenes.jsonl.gz'
ruta_b = data_dir / 'games_info.jsonl.gz'

# Validate both inputs up front so a missing file fails with the same
# explicit message regardless of which one it is.
for ruta_entrada in (ruta, ruta_b):
    if not ruta_entrada.exists():
        raise FileNotFoundError(f'No se encuentra la ruta: {ruta_entrada}')

# read_json already returns a DataFrame; one JSON object per line,
# compression inferred from the file extension.
df1 = pd.read_json(ruta, lines=True, compression='infer')
df2 = pd.read_json(ruta_b, lines=True, compression='infer')

df1.head()

In [None]:
# Add the number of reviews.
# Keep an untouched copy of the games table first: it is reused further down
# to extract the price from the "appdetails" column.
df3 = df2.copy()
# Expand each "appreviewhistogram" entry into its own columns and attach
# them to the games table.
histograma = df2["appreviewhistogram"].apply(pd.Series)
df2 = df2.join(histograma)

def get_review(x):
    """Return the "total_recommendations" value stored in *x*.

    Args:
        x: Mapping-like object exposing ``.get`` (e.g. a dict). Any other
            value (``None``, ``NaN`` from a missing cell, ...) is tolerated.

    Returns:
        The value under "total_recommendations", or ``None`` when the key is
        absent or *x* is not mapping-like.
    """
    try:
        return x.get("total_recommendations")
    except AttributeError:
        # Missing cells show up as NaN/None, which have no .get().
        return None

# Pull the review count out of each "rollups" entry and keep only the
# columns needed for the join.
df2["total_recommendations"] = df2["rollups"].apply(get_review)
df2 = df2[["id", "total_recommendations"]]

# Default (inner) merge: only ids present in both tables are kept.
df = df1.merge(df2, on="id")

# Add the price: expand each "appdetails" entry into its own columns.
df3 = df3.join(df3["appdetails"].apply(pd.Series))

def get_price(x):
    """Return the "initial" value stored in *x*.

    Args:
        x: Mapping-like object exposing ``.get`` (e.g. a dict). Any other
            value (``None``, ``NaN`` from a missing cell, ...) is tolerated.

    Returns:
        The value under "initial", or ``None`` when the key is absent or *x*
        is not mapping-like.
    """
    try:
        return x.get("initial")
    except AttributeError:
        # Rows without price information show up as NaN/None, which have
        # no .get().
        return None

# Extract the "initial" value from each "price_overview" entry and reduce
# the table to the join key plus that value.
precios_iniciales = df3["price_overview"].apply(get_price)
df3["initial"] = precios_iniciales
df3 = df3[["id", "initial"]]

# Attach the price to the table that already carries the review counts.
df = pd.merge(df, df3, on="id")

df.head()

In [None]:
modelos = ['v_resnet', 'v_convnext', 'v_clip']

# For every embedding column, compute a 2-D PCA projection and a 2-D t-SNE
# projection and store them as new columns prefixed with the method name.
for mod in modelos:
    print(f"Procesando reducción de dimensionalidad para: {mod}...")

    # Stack the per-row vectors into a single 2-D array.
    matrix = np.vstack(df[mod].values)

    # PCA (seeded so randomized solvers give the same result on every run)
    pca = PCA(n_components=2, random_state=42)
    coords_pca = pca.fit_transform(matrix)

    df[f'pca_{mod}_1'] = coords_pca[:, 0]
    df[f'pca_{mod}_2'] = coords_pca[:, 1]

    # TSNE: pre-reduce with PCA first. The number of components cannot
    # exceed either dimension of the matrix, so cap the usual 50 accordingly.
    n_pre = min(50, *matrix.shape)
    pca_pre = PCA(n_components=n_pre, random_state=42)
    matrix_reduced = pca_pre.fit_transform(matrix)

    tsne = TSNE(n_components=2, perplexity=30, random_state=42, init='pca')
    coords_tsne = tsne.fit_transform(matrix_reduced)

    # Store the results with the model name as part of the column name
    df[f'tsne_{mod}_1'] = coords_tsne[:, 0]
    df[f'tsne_{mod}_2'] = coords_tsne[:, 1]

df.head()

In [None]:
# Keep only games with at least 6 recommendations.
df = df[df["total_recommendations"] >= 6]


def _muestra_por_precio(datos, precio_min, precio_max, n):
    """Sample up to *n* rows whose "initial" lies in [precio_min, precio_max].

    Both bounds are inclusive. If the band holds fewer than *n* rows, all of
    them are returned instead of raising. The seed is fixed for
    reproducibility.
    """
    tramo = datos[(datos["initial"] >= precio_min) & (datos["initial"] <= precio_max)]
    return tramo.sample(n=min(n, len(tramo)), random_state=1)


# Stratified sample over three price bands, weighted towards the cheapest.
# NOTE(review): the last upper bound is 8001 while the other bands end on
# round values (2000, 4000) - confirm whether 8000 was intended.
df_sample1 = _muestra_por_precio(df, 1, 2000, 400)
df_sample2 = _muestra_por_precio(df, 2001, 4000, 300)
df_sample3 = _muestra_por_precio(df, 4001, 8001, 100)
df_sample = pd.concat([df_sample1, df_sample2, df_sample3], axis=0).reset_index(drop=True)

modelos_nombres = ['v_resnet', 'v_convnext', 'v_clip']
titulos_modelos = ['ResNet', 'ConvNeXt', 'CLIP']
# (column prefix, display label) for each projection; one subplot row each.
metodos = [('pca', 'PCA'), ('tsne', 't-SNE')]

# 2 x 3 grid: rows are projection methods, columns are embedding models.
fig = make_subplots(
    rows=len(metodos), cols=len(modelos_nombres),
    subplot_titles=[
        f'{etiqueta} - {titulo}'
        for _, etiqueta in metodos
        for titulo in titulos_modelos
    ],
    horizontal_spacing=0.02,
    vertical_spacing=0.05
)

# Log-scaled review count (the +1 keeps zero counts finite).
df_sample['log_recs'] = np.log10(df_sample['total_recommendations'] + 1)

for columna, mod in enumerate(modelos_nombres, start=1):
    for fila, (prefijo, etiqueta) in enumerate(metodos, start=1):
        fig.add_trace(
            go.Scatter(
                x=df_sample[f'{prefijo}_{mod}_1'],
                y=df_sample[f'{prefijo}_{mod}_2'],
                mode='markers',
                # Marker colour encodes the "initial" (price) column.
                marker=dict(size=10, color=df_sample['initial'], colorscale='Viridis', opacity=0.8),
                text=df_sample['id'],
                hoverinfo='text',
                name=f'{etiqueta} {mod}'
            ),
            row=fila, col=columna
        )

fig.update_layout(
    title_text='Embeddings de imágenes reducidos a 2 dimensiones donde el color es el precio de los juegos',
    height=900,
    width=1300,
    showlegend=False
)

# The subplots are 2-D cartesian axes, not 3-D scenes, so tick labels have to
# be hidden through the x/y axes (update_scenes would not touch them).
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)

fig.show()

In [None]:
# Stack the sampled rows' vectors of each model into one array per model
# (presumably one row per sampled game - confirm the stored vectors are 1-D).
matrix_resnet, matrix_convnext, matrix_clip = (
    np.vstack(df_sample[columna].values)
    for columna in ('v_resnet', 'v_convnext', 'v_clip')
)

# 2. Helper that works directly on the pre-built embedding matrices
def obtener_vecinos_modelo(matrix, query_idx_local, n_vecinos=4):
    """Return the nearest neighbours (cosine distance) of one row of *matrix*.

    Args:
        matrix: 2-D array of embeddings whose row order matches the
            positional order of ``df_sample``.
        query_idx_local: Positional index of the query row in *matrix*.
        n_vecinos: Maximum number of neighbours to return (default 4).

    Returns:
        A list of ``[id, initial]`` pairs taken from ``df_sample``, ordered
        from closest to farthest, never containing the query row itself.
    """
    # Ask for one extra neighbour because the query normally matches itself,
    # but never request more points than the matrix holds.
    k = min(n_vecinos + 1, len(matrix))
    nn = NearestNeighbors(n_neighbors=k, metric='cosine')
    nn.fit(matrix)
    _, indices = nn.kneighbors(matrix[query_idx_local].reshape(1, -1))
    # Drop the query by index rather than by position: with duplicate vectors
    # the query is not guaranteed to come back first.
    vecinos = [i for i in indices[0] if i != query_idx_local][:n_vecinos]
    return df_sample.iloc[vecinos][['id', 'initial']].values.tolist()

print("COMPARATIVA DE VECINOS POR MODELO (ESPACIO ORIGINAL)\n" + "="*60)

# Pick 10 random query points from the sample (unseeded, so the selection
# changes between runs) and list their neighbours under each model.
puntos_test = df_sample.sample(10)
for idx_pandas in puntos_test.index:
    idx_local = df_sample.index.get_loc(idx_pandas)
    id_objetivo = df_sample.iloc[idx_local][['id', 'initial']].values

    # Compute the neighbours for each architecture
    vecinos_resnet = obtener_vecinos_modelo(matrix_resnet, idx_local)
    vecinos_convnext = obtener_vecinos_modelo(matrix_convnext, idx_local)
    vecinos_clip = obtener_vecinos_modelo(matrix_clip, idx_local)

    # Add up the "initial" value of the two closest neighbours of every
    # model. A dedicated accumulator name is used so the builtin sum() is
    # not overwritten for the rest of the session.
    suma_precios = 0
    for k in range(2):
        for vecinos in (vecinos_resnet, vecinos_convnext, vecinos_clip):
            suma_precios += vecinos[k][1]
    # 6 values are added, so dividing by 600 gives their mean divided by 100
    # (presumably converting cents to currency units - confirm).
    print(suma_precios/600)
    print(f"ID Objetivo: {id_objetivo}")
    print(f"  Vecinos (ResNet18 - Formas/Color):   {vecinos_resnet}")
    print(f"  Vecinos (ConvNeXt - Textura/Calidad): {vecinos_convnext}")
    print(f"  Vecinos (CLIP - Semántica/Estilo):    {vecinos_clip}")
    print("-" * 60)