In [4]:
import pandas as pd
import numpy as np

import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
import operator

import pyarrow as pa
import pyarrow.parquet as pq

In [5]:
df = pd.read_csv('datos/recomendacion.csv')
df

Unnamed: 0,user_id,item_name,rating
0,76561197970982479,Killing Floor,3
1,76561197970982479,Zeno Clash,5
2,76561197970982479,Metro 2033,3
3,js41637,Barbie™ Dreamhouse Party™,3
4,js41637,Euro Truck Simulator 2,3
...,...,...,...
49181,wayfeng,Counter-Strike: Global Offensive,3
49182,76561198251004808,Enclave,5
49183,72947282842,Counter-Strike: Global Offensive,1
49184,ApxLGhost,Counter-Strike: Global Offensive,3


In [6]:
piv = df.pivot_table(index=['user_id'], columns=['item_name'], values='rating')
piv

item_name,0RBITALIS,"10,000,000",100% Orange Juice,1001 Spikes,12 Labours of Hercules,12 Labours of Hercules II: The Cretan Bull,123 Slaughter Me Street,140,16 Bit Arena,200% Mixed Juice!,...,ibb & obb,inMomentum,liteCam Game: 100 FPS Game Capture,oO,planetarian ~the reverie of a little planet~,resident evil 4 / biohazard 4,sZone-Online,the static speaks my name,theHunter,theHunter: Primal
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--000--,,,,,,,,,,,...,,,,,,,,,,
--ace--,,,,,,,,,,,...,,,,,,,,,,
--ionex--,,,,,,,,,,,...,,,,,,,,,,
-2SV-vuLB-Kg,,,,,,,,,,,...,,,,,,,,,,
-Beave-,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zv_odd,,,,,,,,,,,...,,,,,,,,,,
zvanik,,,,,,,,,,,...,,,,,,,,,,
zynxgameth,,,,,,,,,,,...,,,,,,,,,,
zyr0n1c,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Normalización del dataframe 'piv'
piv_norm = piv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
# Se borran las columnas que contienen solo cero o no tienen rating, se rellenan los vacios con 0 y se hace la transpuesta
piv_norm.fillna(0, inplace=True)
piv_norm = piv_norm.T
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]
piv_norm

user_id,-2SV-vuLB-Kg,-GM-Dragon,-PRoSlayeR-,-_PussyDestroyer_-,00000000000000000001227,00True,011111135489484797,022899,04061993,042153100,...,zixwot,zombi_anon,zomgieee,zoozles,zrustz16,zucchin1,zuilde,zuzuga2003,zv_odd,zyr0n1c
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0RBITALIS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
10000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
100% Orange Juice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1001 Spikes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
12 Labours of Hercules,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
resident evil 4 / biohazard 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
sZone-Online,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
the static speaks my name,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
theHunter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.466667,0.0,0.0


In [8]:
piv_sparse = sp.sparse.csr_matrix(piv_norm.values)
piv_sparse

<2640x6699 sparse matrix of type '<class 'numpy.float64'>'
	with 24325 stored elements in Compressed Sparse Row format>

In [9]:
item_similarity = cosine_similarity(piv_sparse)
user_similarity = cosine_similarity(piv_sparse.T)

In [10]:
#item similarity dataframe
item_sim_df = pd.DataFrame(item_similarity, index = piv_norm.index, columns = piv_norm.index)
#user similarity dataframe
user_sim_df = pd.DataFrame(user_similarity, index = piv_norm.columns, columns = piv_norm.columns)

In [11]:
def top_game(game):
    '''
    Muestra una lista de juegos similares a un juego dado.

    Args:
        game (str): El nombre del juego para el cual se desean encontrar juegos similares.

    Returns:
        None: Esta función imprime una lista de juegos 5 similares al dado.

    '''
    count = 1
    print('Similar games to {} include:\n'.format(game))
    for item in item_sim_df.sort_values(by = game, ascending = False).index[1:6]:
        print('No. {}: {}'.format(count, item))
        count +=1  

In [12]:
# ejemplo
top_game('Killing Floor')

Similar games to Killing Floor include:

No. 1: Metro 2033
No. 2: Deus Ex: Revision
No. 3: Unreal Gold
No. 4: Sam & Max 301: The Penal Zone
No. 5: Star Trek Online


In [13]:
# ejemplo
top_game('Barbie™ Dreamhouse Party™')

Similar games to Barbie™ Dreamhouse Party™ include:

No. 1: Resident Evil™: Operation Raccoon City
No. 2: Retro City Rampage™ DX
No. 3: Retro/Grade
No. 4: Return to Castle Wolfenstein
No. 5: Reus


In [14]:
def top_users(user):
    '''
    Muestra una lista de los usuarios más similares a un usuario dado y sus valores de similitud.

    Args:
        user (str): El nombre o identificador del usuario para el cual se desean encontrar usuarios similares.

    Returns:
        None: Esta función imprime la lista de usuarios similares y sus valores de similitud en la consola.

    '''
    # Verifica si el usuario está presente en las columnas de piv_norm (si no está, devuelve un mensaje)
    if user not in piv_norm.columns:
        return('No data available on user {}'.format(user))
    
    print('Most Similar Users:\n')
    # Ordena los usuarios por similitud descendente y toma los 5 usuarios más similares (excluyendo el propio 'user')
    sim_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:6]
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    # Combina los nombres de usuario y los valores de similitud en una lista de tuplas
    zipped = zip(sim_users, sim_values,)
    
    # Itera a través de las tuplas y muestra los usuarios similares y sus valores de similitu
    for user, sim in zipped:
        print('User #{0}, Similarity value: {1:.2f}'.format(user, sim)) 

In [15]:
# ejemplo
top_users('76561197970982479')

Most Similar Users:

User #pipekissXD, Similarity value: 0.58
User #SwinburnCyphenMisslemike, Similarity value: 0.37
User #76561198011493875, Similarity value: 0.35
User #lodesofemone, Similarity value: 0.35
User #oiyewnam8, Similarity value: 0.33


In [16]:
def similar_user_recs(user):
    '''
    Genera una lista de los juegos más recomendados para un usuario, basándose en las calificaciones de usuarios similares.

    Args:
        user (str): El nombre o identificador del usuario para el cual se desean generar recomendaciones.

    Returns:
        list: Una lista de los juegos más recomendados para el usuario basado en la calificación de usuarios similares.

    '''
    # Verifica si el usuario está presente en las columnas de piv_norm (si no está, devuelve un mensaje)
    if user not in piv_norm.columns:
        return('No data available on user {}'.format(user))
    
    # Obtiene los usuarios más similares al usuario dado
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    
    best = []  # Lista para almacenar los juegos mejor calificados por usuarios similares
    most_common = {}  # Diccionario para contar cuántas veces se recomienda cada juego
    
    # Para cada usuario similar, encuentra el juego mejor calificado y lo agrega a la lista 'best'
    for i in sim_users:
        max_score = piv_norm.loc[:, i].max()
        best.append(piv_norm[piv_norm.loc[:, i]==max_score].index.tolist())
            # Cuenta cuántas veces se recomienda cada juego
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    
    # Ordena los juegos por la frecuencia de recomendación en orden descendente
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    
    # Devuelve los 5 juegos más recomendados
    return sorted_list[:5] 

In [17]:
# ejemplo
similar_user_recs('76561197970982479')

[("Garry's Mod", 2),
 ('Terraria', 2),
 ('DayZ', 2),
 ('Zeno Clash', 1),
 ('Deus Ex: Human Revolution', 1)]

In [18]:
pq.write_table(pa.Table.from_pandas(piv_norm), 'datos/piv_norm.parquet')
pq.write_table(pa.Table.from_pandas(user_sim_df), 'datos/user_sim_df.parquet')
pq.write_table(pa.Table.from_pandas(item_sim_df), 'datos/item_sim_df.parquet')
print('Se guardaron correctamente')

Se guardaron correctamente
