### SISTEMA DE RECOMENDACION

In [58]:
# Se cargan las librerias necesarias
import pandas as pd 
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import scipy as sp
import pickle
import gzip

In [59]:
# Cargar los datos
training = pd.read_parquet('training.parquet')
training

Unnamed: 0,item_id,user_id,Rating,item_name
0,1250,76561197970982479,5,Killing Floor
1,1250,EndAtHallow,1,Killing Floor
2,1250,76561198077432581,5,Killing Floor
3,1250,76561198057958244,5,Killing Floor
4,1250,46366536564574576346346546,5,Killing Floor
...,...,...,...,...
51034,73010,ButtBurger2,5,Cities in Motion
51035,378930,76561198064526566,2,Pesadelo - Regressão
51036,16600,haungaraho,5,Trials 2: Second Edition
51037,232950,UnseenPrecision,5,Bridge Project


Se hace una copia de la tabla para trabajar con ella

In [60]:
training_1 = training.drop(columns=['item_name']).copy()
training_1

Unnamed: 0,item_id,user_id,Rating
0,1250,76561197970982479,5
1,1250,EndAtHallow,1
2,1250,76561198077432581,5
3,1250,76561198057958244,5
4,1250,46366536564574576346346546,5
...,...,...,...
51034,73010,ButtBurger2,5
51035,378930,76561198064526566,2
51036,16600,haungaraho,5
51037,232950,UnseenPrecision,5


### Funcion  recomendacion de juegos

Se normalizan los datos para mejorar los resultados

In [61]:
# Se normaliza la columna Rating para mejorar los resultados
scaler = MinMaxScaler()
training_1['Norm_Rating'] = scaler.fit_transform(training_1[['Rating']])

# Elimino la columna Rating
training_1.drop(columns=['Rating'], inplace=True)

Se pivotea la tabla para dejar los datos de la forma adecuada para que puedan ser tratados por la funcion cosine_similarity

In [62]:
# Se crea la matriz usuario-juego para poder usar cosine_similarity
matriz_1 = training_1.pivot_table(index=['user_id'], columns=['item_id'], values='Norm_Rating').fillna(0)
matriz_1

item_id,10,20,30,40,50,60,70,80,130,220,...,512300,512470,512630,514520,516040,521340,521430,521570,521990,527340
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--000--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--ace--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--ionex--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-2SV-vuLB-Kg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-Azsael-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zv_odd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zvanik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zwanzigdrei,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zyr0n1c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Se convierte la matriz a fortmato matriz dispersa (sparse matrix) para redfucir la memoria utilizada y mejorar la eficencia

In [63]:
recomendacion_sparse = sp.sparse.csr_matrix(matriz_1.values)

Se instancia cosine_similarity en los datos, teniendo en cuenta que para item es la traspuesta de la matriz que se tiene hasta el momento

In [64]:
item_cos = cosine_similarity(recomendacion_sparse.T)

Se organiza y estructura los resultados en un dataframe

In [65]:
df_item_cos = pd.DataFrame(item_cos, index = matriz_1.columns, columns = matriz_1.columns)

Se hace la funcion para el sistema de recomendacion para juegos similares

In [66]:
def recomendacion_juego(game: str):
    recomendacion = []
    count = 1
    if game not in df_item_cos.columns:
        return {"Mensaje": f"El juego con el ID {game} no está en el listado de juegos."}

    print('Juegos similares a {} :\n'.format(game))
    juegos_similares = df_item_cos.sort_values(by=game, ascending=False).index[1:6]
    
    for item in juegos_similares:
        item_name = training[training['item_id'] == item]['item_name'].values[0]
        recomendacion.append({'No.': count, 'Juego': item_name, 'Id del juego': item})
        count += 1
    
    return {"Juegos similares a {}".format(game): recomendacion}

Prueba del codigo

In [67]:
recomendacion_juego(10)
# http://127.0.0.1:8000/recomendacion_juego/?game=10

Juegos similares a 10 :



{'Juegos similares a 10': [{'No.': 1,
   'Juego': 'Serious Sam Classic: The First Encounter',
   'Id del juego': 41050},
  {'No.': 2, 'Juego': 'Half-Life Deathmatch: Source', 'Id del juego': 360},
  {'No.': 3, 'Juego': 'Days Under Custody', 'Id del juego': 412730},
  {'No.': 4, 'Juego': 'Obscure 2', 'Id del juego': 254480},
  {'No.': 5, 'Juego': 'Pro Evolution Soccer 2013', 'Id del juego': 221430}]}

In [68]:
df_item_cos.to_csv('df_item_cos.csv', index=True, header=True)
training.to_csv('training.csv', index=False)
print(f'Se guardó el archivo')

Se guardó el archivo


In [69]:
df_item_cos.shape

(3296, 3296)

### Funcion para recomendar juegos a un usuario

Ahora se hace la funcion para recomendar a un usuario juegos similares a los que juega

In [70]:
# Se filtran los valores para tener una matriz mas pequeña y se pueda lograr el deploy en render
# Se buscan 3000 usuarios al azar
np.random.seed(42)
user_ids_aleatorios = np.random.choice(training_1['user_id'].unique(), size=3000, replace=False)
training_1 = training_1[training_1['user_id'].isin(user_ids_aleatorios)]
training_1

Unnamed: 0,item_id,user_id,Norm_Rating
34,1250,fruity8,0.50
35,1250,RandomSounds,0.00
44,1250,76561198042993298,0.00
46,1250,DickTickler,1.00
54,1250,Grizzle-_-99,1.00
...,...,...,...
50979,427980,tishoo,0.75
50990,212390,76561198071484916,1.00
50994,412940,76561198103362354,1.00
51006,249330,leherplederp,1.00


In [71]:
# Se convierte  el user_id a valores numericos usando LabelEncoder
label_encoder = LabelEncoder()
#training_1['user_id_encoded'] = label_encoder.fit_transform(training_1['user_id'])
training_1.loc[:, 'user_id_encoded'] = label_encoder.fit_transform(training_1['user_id'])

# Se crea la matriz usuario-juego para poder usar cosine_similarity
matriz_2 = training_1.pivot_table(index='user_id_encoded', columns='item_id', values='Norm_Rating', fill_value=0)
matriz_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_1.loc[:, 'user_id_encoded'] = label_encoder.fit_transform(training_1['user_id'])


item_id,10,20,50,70,80,220,240,300,320,380,...,487220,487700,488310,493370,496250,496920,501760,502550,512300,521570
user_id_encoded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# Se convierte la matriz a una matriz dispersa (sparse matrix)
matriz_2_sparse = sp.sparse.csr_matrix(matriz_2.values)

In [73]:
# Se instancia el coseno de similaridad 
cosine_sim = cosine_similarity(matriz_2_sparse, matriz_2_sparse)
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.56011203,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.56011203, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

Se hace la funcion para recomendar juegos a un usuario especifico

In [78]:
def recomendacion_usuario(user_id):
    n=5
    user_id = user_id.lower()

    # Verificar si el user_id está en el DataFrame
    if user_id not in training_1['user_id'].str.lower().unique():
        return {"Mensaje": f"El usuario con el ID {user_id} no está registrado en los datos."}
    
    # Encuentra el índice del usuario en el DataFrame
    user_index = training_1[training_1['user_id'].str.lower() == user_id]['user_id_encoded'].iloc[0]
    
    # Obtener juegos similares ordenados por similitud de coseno
    juegos_similares = list(enumerate(cosine_sim[user_index]))
    juegos_similares = sorted(juegos_similares, key=lambda x: x[1], reverse=True)
    
    # Obtener los juegos ya jugados por el usuario
    juegos_jugados = set(training_1[training_1['user_id'].str.lower() == user_id]['item_id'].tolist())
    
    # Filtrar juegos similares que ya han sido jugados y no están en la lista
    top_juegos_similares = []
    for idx, _ in juegos_similares:
        if len(top_juegos_similares) >= n:
            break
        game_id = training_1.iloc[idx]['item_id']
        if game_id not in juegos_jugados and game_id not in top_juegos_similares:
            top_juegos_similares.append(game_id)
    
    # Obtener los nombres de los juegos recomendados
    juegos_recomendados = []
    for game_id in top_juegos_similares:
        item_name = training[training['item_id'] == game_id]['item_name']
        if not item_name.empty:
            item_name = item_name.values[0]
            juegos_recomendados.append({'id del juego': game_id, 'Nombre': item_name})

    
    salida = {'Juegos recomendados a {}'.format(user_id): juegos_recomendados}
    return salida


Se prueba la funcion

In [80]:
recomendacion_usuario('RandomSounds')

{'Juegos recomendados a randomsounds': [{'id del juego': 221100,
   'Nombre': 'DayZ'},
  {'id del juego': 550, 'Nombre': 'Left 4 Dead 2'},
  {'id del juego': 4000, 'Nombre': "Garry's Mod"},
  {'id del juego': 383080, 'Nombre': 'Sakura Clicker'},
  {'id del juego': 248820, 'Nombre': 'Risk of Rain'}]}

Se guarda la informacion

In [76]:
training_1.to_csv('training_1.csv', index=False)
with gzip.open('cosine_sim.pkl.gz', 'wb') as f: pickle.dump(cosine_sim, f)
print(f'Se guardó el archivo')

Se guardó el archivo


In [77]:
cosine_sim.shape

(3000, 3000)