In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to local .py files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

### Conjunto de Datos:  Anime

**Anime.csv**
- anime_id - myanimelist.net's unique id identifying an anime.
- name - full name of anime.
- genre - comma separated list of genres for this anime.
- type - movie, TV, OVA, etc.
- episodes - how many episodes in this show. (1 if movie).
- rating - average rating out of 10 for this anime.
- members - number of community members that are in this anime's
"group".

**Rating.csv**

- user_id - non identifiable randomly generated user id.
- anime_id - the anime that this user has rated.
- rating - rating out of 10 this user has assigned (-1 if the user watched it but didn't assign a rating).

__enlace__: https://www.kaggle.com/CooperUnion/anime-recommendations-database

In [27]:
from utils import read_data, get_dataframe, get_rating_matrix, create_dicts, save_dicts

In [23]:
# Load the full table and keep only its first 1,000,000 rows.
datos = get_dataframe().iloc[:1000000]

In [31]:
# Preview the first rows to check which columns were loaded.
datos.head()

Unnamed: 0,user_id,anime_id,rating_user,name,genre,type,episodes,rating,members
4933638,5073,22689,7,Terra Formars: Bugs 2-hen,"Action, Horror, Sci-Fi, Space",OVA,2,7.27,21616
5952151,52158,73,9,Full Metal Panic! The Second Raid,"Action, Mecha, Military",TV,13,8.06,155340
98631,72989,356,6,Fate/stay night,"Action, Fantasy, Magic, Romance, Supernatural",TV,24,7.58,374880
683016,26964,12293,8,Campione!: Matsurowanu Kamigami to Kamigoroshi...,"Comedy, Ecchi, Fantasy, Harem, Magic, Romance",TV,13,7.36,151928
3005772,53744,20541,8,Mikakunin de Shinkoukei,"Comedy, Romance, School, Slice of Life",TV,12,7.59,133385


#### Crea y guarda diccionarios id a nombre 

In [6]:
# Build the id -> name and name -> id lookup dictionaries from the data and
# hand them to save_dicts.
# NOTE(review): the lookup further down indexes id_to_anime_name with
# str(anime_id), so the ids are presumably stored as strings — confirm in utils.
id_to_anime_name,anime_name_to_id = create_dicts(datos)
save_dicts(id_to_anime_name, anime_name_to_id)

In [7]:
# Carga los diccionarios
#id_to_anime_name, anime_name_to_id = load_anime_dicts()

### Forma conjuntos U, I

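- **U**: usuarios por item — para cada anime, el conjunto de usuarios que lo calificaron.
- **I**: items por usuario — para cada usuario, el conjunto de animes que calificó.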


In [8]:
from collections import defaultdict
def get_sets(data):
    """
    Build the two lookup tables used by the item-to-item recommender.

    data: pandas.DataFrame
        One row per rating. Must contain the columns 'anime_id' and
        'user_id'; any other column is ignored.

    Return (users_per_item, items_per_user), both defaultdict(set):
        users_per_item[anime_id] -> set of user ids that rated that anime
        items_per_user[user_id]  -> set of anime ids rated by that user

    Raises KeyError if one of the two required columns is missing.
    """
    users_per_item = defaultdict(set)
    items_per_user = defaultdict(set)

    # Walk the two id columns in lockstep instead of calling data.iloc[j]
    # for every row: iloc materialises a whole Series per row, which is
    # very slow once the frame has hundreds of thousands of rows.
    for product_id, user_id in zip(data['anime_id'], data['user_id']):
        users_per_item[product_id].add(user_id)
        items_per_user[user_id].add(product_id)

    return users_per_item, items_per_user

In [9]:
# U maps each anime id to the set of users that rated it;
# I maps each user id to the set of anime ids that user rated.
U,I = get_sets(datos)


### Medida de Similaridad

In [10]:

def Jaccard(s1, s2):
    """
    Jaccard similarity between two sets: size of the intersection divided
    by the size of the union.

    s1: set
    s2: set

    Return a float between 0 and 1. When both sets are empty the union is
    empty too; 0.0 is returned in that case instead of raising
    ZeroDivisionError.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        # Both sets are empty, so there is no overlap to measure.
        return 0.0
    numer = len(s1.intersection(s2))
    return numer / denom


In [11]:
def mas_similares(item_id, n_items):
    """
    Recommend the items most similar to a given item.

    The similarity between two items is the Jaccard index of the sets of
    users that rated each of them. Only items that share at least one user
    with `item_id` are scored; any other item would have similarity 0.

    Reads the module-level tables U (users per item) and I (items per user)
    and the module-level function Jaccard.

    item_id: int or str
        identifier of the reference item; items similar to it are
        recommended.
    n_items: int
        number of items to recommend.

    Return a list of at most n_items tuples (similarity, item), sorted by
    decreasing similarity. The list is empty when item_id has no users in U.
    """
    # Users that rated the reference item. .get() is used so that an unknown
    # id does not insert an empty entry into U when U is a defaultdict.
    usuarios = U.get(item_id, set())

    # Candidate items: everything rated by at least one of those users.
    items = set()
    for u in usuarios:
        # Update in place; rebuilding the set with union() on every step
        # copies the whole accumulated set each time.
        items.update(I[u])

    # The reference item itself must not be recommended.
    items.discard(item_id)

    similares = [(Jaccard(usuarios, U[item]), item) for item in items]
    similares.sort(key=lambda d: d[0], reverse=True)

    return similares[:n_items]
        


### Recomendaciones

In [37]:
# Example anime ids that can be used as the query item:
# 20 Naruto
# 5114  Fullmetal Alchemist: Brotherhood
# 28977 Gintama
# 28851 Koe no Katachi
# 1 Cowboy Bebop
# 30276  One Punch Man
# 2001 Tengen Toppa Gurren Lagann
# Ask for the 25 items most similar to the chosen one and print one
# (similarity, anime_id) tuple per line.
item =5114 
rec = mas_similares(item_id = item  , n_items= 25 )
print("(similaridad, anime_id): ", *rec, sep= "\n")

(similaridad, anime_id): 
(0.04352078239608802, 121)
(0.04071283685888148, 11757)
(0.039293439077144915, 3588)
(0.03876197494473103, 16498)
(0.03769244381525394, 9919)
(0.03719577529465789, 1575)
(0.036672487696459755, 20)
(0.03653912443984833, 2167)
(0.03618421052631579, 2904)
(0.03463648834019204, 10620)
(0.034357316087308, 20507)
(0.03405185879412684, 226)
(0.0335545261049205, 1535)
(0.03345724907063197, 6547)
(0.03333333333333333, 15809)
(0.03271028037383177, 4224)
(0.03234802007808143, 6746)
(0.032063119019640755, 8074)
(0.03178082191780822, 2001)
(0.030906450943787595, 13601)
(0.030899393047636566, 9253)
(0.03077232502011263, 356)
(0.03073463268365817, 22319)
(0.029873200085966042, 6702)
(0.029562251279135872, 19815)


In [38]:
# Translate each recommended anime id into its title; the dictionary is
# indexed with the id converted to a string.
recomendaciones = [id_to_anime_name[str(anime_id)] for _sim, anime_id in rec]
recomendaciones

['Fullmetal Alchemist',
 'Sword Art Online',
 'Soul Eater',
 'Shingeki no Kyojin',
 'Ao no Exorcist',
 'Code Geass: Hangyaku no Lelouch',
 'Naruto',
 'Clannad',
 'Code Geass: Hangyaku no Lelouch R2',
 'Mirai Nikki (TV)',
 'Noragami',
 'Elfen Lied',
 'Death Note',
 'Angel Beats!',
 'Hataraku Maou-sama!',
 'Toradora!',
 'Durarara!!',
 'Highschool of the Dead',
 'Tengen Toppa Gurren Lagann',
 'Psycho-Pass',
 'Steins;Gate',
 'Fate/stay night',
 'Tokyo Ghoul',
 'Fairy Tail',
 'No Game No Life']