# Análisis de los vectores de banners

In [1]:
# Módulos

import pandas as pd

import numpy as np
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [15]:
# Project root = two levels above the current working directory, so this cell
# only finds the data when the notebook is run from its usual folder.
base_dir = Path().resolve().parents[1]
data_dir = base_dir / 'data'

ruta = data_dir / 'info_imagenes.jsonl.gz'   # loaded into df1
ruta_b = data_dir / 'games_info.jsonl.gz'    # loaded into df2

# Fail early, with the same message, if either input file is missing
# (previously only the first one was checked).
for ruta_entrada in (ruta, ruta_b):
    if not ruta_entrada.exists():
        raise FileNotFoundError(f'No se encuentra la ruta: {ruta_entrada}')

# With lines=True read_json already returns a DataFrame; compression is
# inferred from the .gz extension.
df1 = pd.read_json(ruta, lines=True, compression='infer')
df2 = pd.read_json(ruta_b, lines=True, compression='infer')

df1.head()

Unnamed: 0,id,brillo,v_resnet,v_convnext,v_clip
0,938810,50.3692,"[2.654, 0.6285000000000001, 3.1121, 0.60330000...","[-0.07, 0.2227, -0.044500000000000005, 0.01270...","[0.22790000000000002, 0.0175, -0.1262, -0.5289..."
1,938820,158.4242,"[0.0692, 1.7278, 1.7555, 1.0887, 0.36060000000...","[0.056, 0.0854, 0.014, -0.2368, -0.0456, 0.036...","[-0.4122, 0.0058000000000000005, -0.1035000000..."
2,938840,150.786,"[0.2124, 2.8073, 1.6505, 0.1507, 0.0497, 0.625...","[0.0531, 0.1552, 0.1587, -0.2351, -0.1252, 0.0...","[-0.3695, 0.176, 0.0325, -0.0047, 0.2241000000..."
3,938860,142.5227,"[0.4778, 1.3384, 0.6664, 0.652, 0.0, 0.2158000...","[-0.023, 0.0989, 0.2969, -0.0159, 0.021, -0.01...","[0.18150000000000002, 0.3265, 0.3819, 0.1557, ..."
4,938870,74.7785,"[1.0681, 2.7420999999999998, 3.5699, 2.543, 0....","[-0.033100000000000004, 0.0868, 0.2021, 0.0387...","[-0.0719, 0.5630000000000001, -0.2323, 0.1353,..."


In [16]:
# Add the number of reviews.
# Keep an untouched copy of the games table first: df2 is overwritten here,
# and the copy is what the price step later in this cell works on.
df3 = df2.copy()

# Turn every "appreviewhistogram" entry into its own set of columns and
# append those columns to the games table.
_histograma = df2["appreviewhistogram"].apply(pd.Series)
df2 = df2.join(_histograma)

def get_review(x):
    """Return the ``"total_recommendations"`` entry of a dict-like value.

    Parameters
    ----------
    x : object
        Normally a dict. Any object exposing ``.get`` is queried exactly as
        before.

    Returns
    -------
    object or None
        The value stored under ``"total_recommendations"``. ``None`` when the
        key is absent, or when ``x`` has no ``.get`` method (for example
        ``None`` or a float NaN standing in for a missing record) -- that case
        used to raise ``AttributeError``.
    """
    if not hasattr(x, "get"):
        return None
    return x.get("total_recommendations")

# Pull the review count out of each "rollups" entry, then keep only the key
# and the new column. (The function is passed directly; the former
# mid-cell `df2.head()` was a discarded expression and has been dropped.)
df2["total_recommendations"] = df2["rollups"].apply(get_review)
df2 = df2[["id", "total_recommendations"]]

# Attach the review count to the image table; merge defaults to an inner
# join, so ids missing on either side are dropped.
df = df1.merge(df2, on="id")

# Add the price

# Expand each "appdetails" entry into columns on the untouched copy.
df3 = df3.join(df3["appdetails"].apply(pd.Series))

def get_price(x):
    """Return the ``"initial"`` entry of a dict-like value.

    Parameters
    ----------
    x : object
        Normally a dict. Any object exposing ``.get`` is queried exactly as
        before.

    Returns
    -------
    object or None
        The value stored under ``"initial"``. ``None`` when the key is absent,
        or when ``x`` has no ``.get`` method (for example ``None`` or a float
        NaN standing in for a missing record) -- that case used to raise
        ``AttributeError``.
    """
    if not hasattr(x, "get"):
        return None
    return x.get("initial")

# Pull the "initial" value out of each "price_overview" entry, then reduce
# the table to the key plus that single column.
df3["initial"] = df3["price_overview"].apply(get_price)
df3 = df3.loc[:, ["id", "initial"]]

# Attach the price to the table built so far (inner join on "id").
df = df.merge(right=df3, on="id")

df.head()

Unnamed: 0,id,brillo,v_resnet,v_convnext,v_clip,total_recommendations,initial
0,938810,50.3692,"[2.654, 0.6285000000000001, 3.1121, 0.60330000...","[-0.07, 0.2227, -0.044500000000000005, 0.01270...","[0.22790000000000002, 0.0175, -0.1262, -0.5289...",24,0
1,938820,158.4242,"[0.0692, 1.7278, 1.7555, 1.0887, 0.36060000000...","[0.056, 0.0854, 0.014, -0.2368, -0.0456, 0.036...","[-0.4122, 0.0058000000000000005, -0.1035000000...",0,499
2,938840,150.786,"[0.2124, 2.8073, 1.6505, 0.1507, 0.0497, 0.625...","[0.0531, 0.1552, 0.1587, -0.2351, -0.1252, 0.0...","[-0.3695, 0.176, 0.0325, -0.0047, 0.2241000000...",2,499
3,938860,142.5227,"[0.4778, 1.3384, 0.6664, 0.652, 0.0, 0.2158000...","[-0.023, 0.0989, 0.2969, -0.0159, 0.021, -0.01...","[0.18150000000000002, 0.3265, 0.3819, 0.1557, ...",16,2450
4,938870,74.7785,"[1.0681, 2.7420999999999998, 3.5699, 2.543, 0....","[-0.033100000000000004, 0.0868, 0.2021, 0.0387...","[-0.0719, 0.5630000000000001, -0.2323, 0.1353,...",1,329


In [17]:
# Embedding columns to project; each cell of these columns holds one vector.
modelos = ['v_resnet', 'v_convnext', 'v_clip']

for mod in modelos:
    print(f"Procesando reducción de dimensionalidad para: {mod}...")

    # Stack the per-row vectors into one 2-D matrix (one row per game).
    matrix = np.vstack(df[mod].values)

    # PCA straight down to 2 components. The seed makes the projection
    # repeatable if scikit-learn picks its randomized SVD solver.
    pca = PCA(n_components=2, random_state=42)
    coords_pca = pca.fit_transform(matrix)

    df[f'pca_{mod}_1'] = coords_pca[:, 0]
    df[f'pca_{mod}_2'] = coords_pca[:, 1]

    # t-SNE: compress with PCA first (50 components, or fewer when the matrix
    # has fewer rows or columns than that), then embed in 2-D.
    n_pre = min(50, *matrix.shape)
    pca_pre = PCA(n_components=n_pre, random_state=42)
    matrix_reduced = pca_pre.fit_transform(matrix)

    tsne = TSNE(n_components=2, perplexity=30, random_state=42, init='pca')
    coords_tsne = tsne.fit_transform(matrix_reduced)

    # Store the results under a per-model prefix.
    df[f'tsne_{mod}_1'] = coords_tsne[:, 0]
    df[f'tsne_{mod}_2'] = coords_tsne[:, 1]

df.head()

Procesando reducción de dimensionalidad para: v_resnet...
Procesando reducción de dimensionalidad para: v_convnext...
Procesando reducción de dimensionalidad para: v_clip...


Unnamed: 0,id,brillo,v_resnet,v_convnext,v_clip,total_recommendations,initial,pca_v_resnet_1,pca_v_resnet_2,tsne_v_resnet_1,tsne_v_resnet_2,pca_v_convnext_1,pca_v_convnext_2,tsne_v_convnext_1,tsne_v_convnext_2,pca_v_clip_1,pca_v_clip_2,tsne_v_clip_1,tsne_v_clip_2
0,938810,50.3692,"[2.654, 0.6285000000000001, 3.1121, 0.60330000...","[-0.07, 0.2227, -0.044500000000000005, 0.01270...","[0.22790000000000002, 0.0175, -0.1262, -0.5289...",24,0,-5.608611,-0.374921,-58.326565,-12.443451,-1.110072,-0.388986,-26.621969,-16.005161,-4.209123,2.514019,80.048164,-2.197586
1,938820,158.4242,"[0.0692, 1.7278, 1.7555, 1.0887, 0.36060000000...","[0.056, 0.0854, 0.014, -0.2368, -0.0456, 0.036...","[-0.4122, 0.0058000000000000005, -0.1035000000...",0,499,-3.241229,2.767292,-11.941306,0.884904,-0.352941,-0.43999,-2.769573,-2.020938,0.109754,-0.885637,21.525339,-3.232559
2,938840,150.786,"[0.2124, 2.8073, 1.6505, 0.1507, 0.0497, 0.625...","[0.0531, 0.1552, 0.1587, -0.2351, -0.1252, 0.0...","[-0.3695, 0.176, 0.0325, -0.0047, 0.2241000000...",2,499,-1.519655,2.821155,-11.910262,0.887087,-0.119455,-0.248684,-2.669312,-1.947996,1.095139,-1.253841,21.539543,-3.232238
3,938860,142.5227,"[0.4778, 1.3384, 0.6664, 0.652, 0.0, 0.2158000...","[-0.023, 0.0989, 0.2969, -0.0159, 0.021, -0.01...","[0.18150000000000002, 0.3265, 0.3819, 0.1557, ...",16,2450,0.081206,5.533376,-28.613152,23.832205,1.606081,-0.921107,41.992264,-19.045879,0.276395,1.27605,35.078888,23.007469
4,938870,74.7785,"[1.0681, 2.7420999999999998, 3.5699, 2.543, 0....","[-0.033100000000000004, 0.0868, 0.2021, 0.0387...","[-0.0719, 0.5630000000000001, -0.2323, 0.1353,...",1,329,1.728616,-3.291079,-26.507664,-47.746994,-0.498221,0.627352,-54.995869,17.158297,0.742414,-1.240778,-21.118685,30.395973


In [68]:
# Keep only games with at least 6 recommendations.
df = df.loc[df["total_recommendations"] >= 6]

# Seeded, fixed-size draws from three bands of "initial" (both bounds inclusive).
df_sample1 = df.loc[df["initial"].between(1, 2000)].sample(n=400, random_state=1)
df_sample2 = df.loc[df["initial"].between(2001, 4000)].sample(n=300, random_state=1)
df_sample3 = df.loc[df["initial"].between(4001, 8001)].sample(n=100, random_state=1)

# Stack the three draws and renumber the rows from 0.
df_sample = pd.concat(
    [df_sample1, df_sample2, df_sample3],
    axis=0,
    ignore_index=True,
)

modelos_nombres = ['v_resnet', 'v_convnext', 'v_clip']
titulos_modelos = ['ResNet', 'ConvNeXt', 'CLIP']

# 2 x 3 grid of 2-D scatter plots: PCA on the top row, t-SNE on the bottom
# row, one column per model.
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=(
        [f'PCA - {titulo}' for titulo in titulos_modelos]
        + [f't-SNE - {titulo}' for titulo in titulos_modelos]
    ),
    horizontal_spacing=0.02,
    vertical_spacing=0.05
)

# Log-scaled recommendation count; stored on the sample but not used by the
# traces below, which are coloured by "initial".
df_sample['log_recs'] = np.log10(df_sample['total_recommendations'] + 1)

# (column prefix, legend label) for each row of the grid, top to bottom.
proyecciones = (('pca', 'PCA'), ('tsne', 't-SNE'))

for col, mod in enumerate(modelos_nombres, start=1):
    for fila, (prefijo, etiqueta) in enumerate(proyecciones, start=1):
        fig.add_trace(
            go.Scatter(
                x=df_sample[f'{prefijo}_{mod}_1'],
                y=df_sample[f'{prefijo}_{mod}_2'],
                mode='markers',
                marker=dict(size=10, color=df_sample['initial'], colorscale='Viridis', opacity=0.8),
                text=df_sample['id'],   # hovering a point shows the game id only
                hoverinfo='text',
                name=f'{etiqueta} {mod}'
            ),
            row=fila, col=col
        )

fig.update_layout(
    title_text='Embeddings de imágenes reducidos a 2 dimensiones donde el color es el precio de los juegos',
    height=900,
    width=1300,
    showlegend=False
)

# Hide tick labels on every subplot. These are 2-D cartesian axes, so the
# x/y axis updaters are required; update_scenes only reaches 3-D scenes and
# left the labels visible.
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)

fig.show()

In [67]:
# One 2-D matrix per model, built by stacking the sample's per-row vectors.
matrix_resnet, matrix_convnext, matrix_clip = (
    np.vstack(df_sample[columna].values)
    for columna in ('v_resnet', 'v_convnext', 'v_clip')
)

# 2. Helper that works directly on a precomputed embedding matrix
def obtener_vecinos_modelo(matrix, query_idx_local, n_vecinos=4):
    """Return the nearest neighbours of one sample row by cosine distance.

    Parameters
    ----------
    matrix : array-like, 2-D
        Embedding matrix whose rows are positionally aligned with the rows of
        the module-level ``df_sample``.
    query_idx_local : int
        Positional index (into ``matrix`` / ``df_sample``) of the query row.
    n_vecinos : int, default 4
        Number of neighbours to return. The default matches the previous
        hard-coded behaviour (5 nearest points minus the query itself).

    Returns
    -------
    list[list]
        ``[id, initial]`` pairs of the neighbours, closest first.
    """
    # Ask for one extra point because the query normally matches itself at
    # distance 0; never request more points than the matrix holds.
    nn = NearestNeighbors(n_neighbors=min(n_vecinos + 1, len(matrix)), metric='cosine')
    nn.fit(matrix)
    _, indices = nn.kneighbors(matrix[query_idx_local].reshape(1, -1))

    # Drop the query by index rather than by position: with duplicate
    # embeddings the query is not guaranteed to be the first hit.
    vecinos = [idx for idx in indices[0] if idx != query_idx_local][:n_vecinos]
    return df_sample.iloc[vecinos][['id', 'initial']].values.tolist()

print("COMPARATIVA DE VECINOS POR MODELO (ESPACIO ORIGINAL)\n" + "="*60)

# Seeded so the same 10 query games are inspected on every run.
puntos_test = df_sample.sample(10, random_state=1)
for idx_pandas in puntos_test.index:
    idx_local = df_sample.index.get_loc(idx_pandas)
    id_objetivo = df_sample.iloc[idx_local][['id', 'initial']].values

    # Neighbours of the query under each architecture
    vecinos_resnet = obtener_vecinos_modelo(matrix_resnet, idx_local)
    vecinos_convnext = obtener_vecinos_modelo(matrix_convnext, idx_local)
    vecinos_clip = obtener_vecinos_modelo(matrix_clip, idx_local)

    # Add up "initial" for the 2 closest neighbours of each of the 3 models
    # (6 values) and divide by 600, as before.
    # NOTE(review): 600 looks like 6 values x 100 (cents to currency units) -- confirm.
    # The accumulator is no longer called `sum`, which shadowed the builtin
    # for the rest of the kernel session.
    precio_total = 0
    for i in range(2):
        for vecinos in (vecinos_resnet, vecinos_convnext, vecinos_clip):
            precio_total += vecinos[i][1]
    print(precio_total / 600)

    # Output formatting
    print(f"ID Objetivo: {id_objetivo}")
    print(f"  Vecinos (ResNet18 - Formas/Color):   {vecinos_resnet}")
    print(f"  Vecinos (ConvNeXt - Textura/Calidad): {vecinos_convnext}")
    print(f"  Vecinos (CLIP - Semántica/Estilo):    {vecinos_clip}")
    print("-" * 60)

COMPARATIVA DE VECINOS POR MODELO (ESPACIO ORIGINAL)
16.95
ID Objetivo: [np.int64(1153980) np.int64(299)]
  Vecinos (ResNet18 - Formas/Color):   [[1498110, 2099], [1146510, 499], [1435160, 199], [1293990, 79]]
  Vecinos (ConvNeXt - Textura/Calidad): [[1159540, 2995], [1506230, 99], [1498110, 2099], [1083190, 1479]]
  Vecinos (CLIP - Semántica/Estilo):    [[1380910, 2999], [1338840, 1479], [1159540, 2995], [1451830, 299]]
------------------------------------------------------------
10.075
ID Objetivo: [np.int64(1380910) np.int64(2999)]
  Vecinos (ResNet18 - Formas/Color):   [[1289720, 2450], [1361010, 999], [1322730, 499], [1280930, 2450]]
  Vecinos (ConvNeXt - Textura/Calidad): [[1511450, 99], [1076020, 2099], [1058560, 975], [1153980, 299]]
  Vecinos (CLIP - Semántica/Estilo):    [[1051990, 99], [1153980, 299], [977720, 2999], [1430760, 1579]]
------------------------------------------------------------
8.156666666666666
ID Objetivo: [np.int64(1443950) np.int64(99)]
  Vecinos (ResNet1