### SISTEMA DE RECOMENDACION

In [1]:
# Se cargan las librerias necesarias
import pandas as pd 
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import scipy as sp
import pickle
import gzip

In [2]:
# Cargar los datos
training = pd.read_parquet('training.parquet')
training

Unnamed: 0,item_id,user_id,Rating
0,1250,76561197970982479,5
1,251610,js41637,5
2,248820,evcentric,5
3,250320,doctr,5
4,211420,maplemage,3
...,...,...,...
57362,440,How51,5
57363,304930,76561198111410893,5
57364,265630,zaza147,5
57365,304050,lifeonhigh,5


Se normalizan los datos para mejorar los resultados

In [3]:
# Se normaliza la columna Rating para mejorar los resultados
scaler = MinMaxScaler()
training['Norm_Rating'] = scaler.fit_transform(training[['Rating']])

# Elimino la columna Rating
training.drop(columns=['Rating'], inplace=True)

Se pivotea la tabla para dejar los datos de la forma adecuada para que puedan ser tratados por la funcion cosine_similarity

In [4]:
# Se crea la matriz usuario-juego para poder usar cosine_similarity
matriz_1 = training.pivot_table(index=['user_id'], columns=['item_id'], values='Norm_Rating').fillna(0)
matriz_1

item_id,10,20,30,40,50,60,70,80,130,220,...,512540,512630,514520,516040,520550,521340,521430,521570,521990,527340
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--000--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--ace--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--ionex--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-2SV-vuLB-Kg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-Azsael-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zwanzigdrei,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zy0705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zynxgameth,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zyr0n1c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Se convierte la matriz a fortmato matriz dispersa (sparse matrix) para redfucir la memoria utilizada y mejorar la eficencia

In [5]:
recomendacion_sparse = sp.sparse.csr_matrix(matriz_1.values)

Se instancia cosine_similarity en los datos, teniendo en cuenta que para item es la traspuesta de la matriz que se tiene hasta el momento

In [6]:
item_cos = cosine_similarity(recomendacion_sparse.T)

Se organiza y estructura los resultados en un dataframe

In [7]:
df_item_cos = pd.DataFrame(item_cos, index = matriz_1.columns, columns = matriz_1.columns)

Se hace la funcion para el sistema de recomendacion para juegos similares

In [31]:
def recomendacion_juego(game: str):
    recomendacion = []
    count = 1
    if game not in df_item_cos.columns:
        return {"Mensaje": f"El juego con el ID {game} no está en el listado de juegos."}

    print('Juegos similares a {} :\n'.format(game))
    juegos_similares = df_item_cos.sort_values(by=game, ascending=False).index[1:6]
    
    for item in juegos_similares:
        recomendacion.append({'No.': count, 'Juego': str(item)})
        count += 1
    
    return {"Juegos similares a {}".format(game): recomendacion}

Prueba del codigo

In [38]:
recomendacion_juego(10)
# http://127.0.0.1:8000/recomendacion_juego/?game=10

Juegos similares a 10 :



{'Juegos similares a 10': [{'No.': 1, 'Juego': '254480'},
  {'No.': 2, 'Juego': '41050'},
  {'No.': 3, 'Juego': '71165'},
  {'No.': 4, 'Juego': '221430'},
  {'No.': 5, 'Juego': '360'}]}

In [10]:
df_item_cos.to_csv('df_item_cos.csv', index=True, header=True)
print(f'Se guardó el archivo')

Se guardó el archivo


In [11]:
df_item_cos.shape

(3650, 3650)

Ahora se hace la funcion para recomendar a un usuario juegos similares a los que juega

In [12]:
# Se hace una copia del dataframe training
training_2 = training.copy()
# Se filtran los valores para tener una matriz mas pequeña y se pueda lograr el deploy en render
# Se buscan 3000 usuarios al azar
np.random.seed(42)
user_ids_aleatorios = np.random.choice(training_2['user_id'].unique(), size=3000, replace=False)
training_2 = training_2[training_2['user_id'].isin(user_ids_aleatorios)]
training_2

Unnamed: 0,item_id,user_id,Norm_Rating
17,220200,MarbleShrine,1.0
29,331470,KanadeTheNeko,1.0
30,323370,Leaf_Light_Moscow,1.0
34,730,76561198061252210,1.0
35,113200,OfficialShroomsy,1.0
...,...,...,...
57305,232950,UnseenPrecision,1.0
57315,248820,76561198067892662,1.0
57324,208090,Pi0h,0.0
57337,4000,76561198082967159,1.0


In [13]:
# Se convierte  el user_id a valores numericos usando LabelEncoder
label_encoder = LabelEncoder()
training_2['user_id_encoded'] = label_encoder.fit_transform(training_2['user_id'])

# Se crea la matriz usuario-juego para poder usar cosine_similarity
matriz_2 = training_2.pivot_table(index='user_id_encoded', columns='item_id', values='Norm_Rating', fill_value=0)
matriz_2

item_id,10,20,30,70,80,220,240,300,340,380,...,496920,497180,498240,499540,503300,504490,509060,512470,521340,521990
user_id_encoded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Se convierte la matriz a una matriz dispersa (sparse matrix)
matriz_2_sparse = sp.sparse.csr_matrix(matriz_2.values)

In [15]:
# Se instancia el coseno de similaridad 
cosine_sim = cosine_similarity(matriz_2_sparse, matriz_2_sparse)
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

Se hace la funcion para recomendar juegos a un usuario especifico

In [48]:
def recomendacion_usuario(user_id):
    user_id = user_id.lower()
    n=5

    # Verificar si el user_id está en el DataFrame
    if user_id not in training_2['user_id'].str.lower().unique():
        return {"Mensaje": f"El usuario con el ID {user_id} no está registrado en los datos."}
    
    # Encuentra el índice del usuario en el DataFrame
    user_index = training_2[training_2['user_id'].str.lower() == user_id]['user_id_encoded'].iloc[0]
    
    # Obtener juegos similares ordenados por similitud de coseno
    juegos_similares = list(enumerate(cosine_sim[user_index]))
    juegos_similares = sorted(juegos_similares, key=lambda x: x[1], reverse=True)
    
    # Obtener los juegos ya jugados por el usuario
    juegos_jugados = set(training_2[training_2['user_id'].str.lower() == user_id]['item_id'].tolist())
    
    # Filtrar juegos similares que ya han sido jugados y no están en la lista
    top_juegos_similares = []
    for idx, _ in juegos_similares:
        if len(top_juegos_similares) >= n:
            break
        game_id = training_2.iloc[idx]['item_id']
        if game_id not in juegos_jugados and game_id not in top_juegos_similares:
            top_juegos_similares.append(game_id)
    # Crear el diccionario con el formato deseado
    juegos_recomendados = [{'No.': i+1, 'Juego': str(game_id)} for i, game_id in enumerate(top_juegos_similares)]
    salida = {'Juegos recomendados a {}'.format(user_id): juegos_recomendados}
    
    return salida

Se prueba la funcion

In [51]:
recomendacion_usuario('OfficialShroomsy')

{'Juegos recomendados a officialshroomsy': [{'No.': 1, 'Juego': '383870'},
  {'No.': 2, 'Juego': '57900'},
  {'No.': 3, 'Juego': '221100'},
  {'No.': 4, 'Juego': '12110'},
  {'No.': 5, 'Juego': '203140'}]}

Se guarda la informacion

In [18]:
training_2.to_csv('training_2.csv', index=False)
with gzip.open('cosine_sim.pkl.gz', 'wb') as f: pickle.dump(cosine_sim, f)
print(f'Se guardó el archivo')

Se guardó el archivo


In [19]:
cosine_sim.shape

(3000, 3000)