In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MaxAbsScaler

In [2]:
df_animes = pd.read_csv("animes.csv")
df_ratings = pd.read_csv("rating.csv")
df_usuarios = pd.read_csv("usuarios.csv")

In [3]:
df_animes.head()

Unnamed: 0,animeId,titulo,genero,tipo,episodios,rating,usuarios
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
df_usuarios.head()

Unnamed: 0,userId,username,name
0,1,AstaFan,Miquel Mateu
1,2,BuddieVital,Aleix Marques
2,3,Chirijadi,Jose Angel Cebrian
3,4,creticogi,Abdeslam Montesinos
4,5,kingelis,Gines Pallares


In [5]:
df_ratings.head()

Unnamed: 0,userId,animeId,rating
0,1,620,5
1,1,964,5
2,1,809,3
3,1,915,3
4,1,930,3


In [6]:
n_usuarios = df_ratings.userId.unique().shape[0]
n_animes = df_ratings.animeId.unique().shape[0]
print (str(n_usuarios) + ' usuarios')
print (str(n_animes) + ' animes')

100 usuarios
409 animes


In [7]:
df_matrix = pd.pivot_table(df_ratings, values='rating', index='userId', columns='animeId').fillna(0) 
df_matrix

animeId,12,14,15,16,18,21,22,23,24,25,...,1224,1232,1238,1251,1252,1255,1262,1280,1284,1287
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
ratings = df_matrix.values
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1]) 
sparsity *= 100
print('Densidad de los datos: {:4.2f}%'.format(sparsity))

Densidad de los datos: 3.58%


In [9]:
ratings_train, ratings_test = train_test_split(ratings, test_size = 0.2, random_state=1) 
print(ratings_train.shape)
print(ratings_test.shape)

(80, 409)
(20, 409)


In [10]:
sim_matrix = 1 - sklearn.metrics.pairwise.cosine_distances(ratings)
print(sim_matrix.shape)

(100, 100)


In [11]:
sim_matrix_train = sim_matrix[0:80,0:80]
sim_matrix_test = sim_matrix[80:100,80:100]
prediccion = sim_matrix_train.dot(ratings_train) / np.array([np.abs(sim_matrix_train).sum(axis=1)]).T

In [12]:
data_animes = pd.concat([df_animes["genero"].str.get_dummies(sep=","),
                           df_animes["tipo"].str.get_dummies(sep=","),df_animes[["rating"]],
                            df_animes[["usuarios"]],df_animes["episodios"]],axis=1)

data_animes.head()

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Yaoi,Movie,Music,ONA,OVA,Special,TV,rating,usuarios,episodios
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,9.37,200630,1
1,0,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,9.26,793665,64
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.25,114262,51
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.17,673572,24
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.16,151266,51


In [13]:
data_animes = MaxAbsScaler().fit_transform(data_animes)
data_animes

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        9.37000000e-01, 1.97876158e-01, 5.50055006e-04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        9.26000000e-01, 7.82771174e-01, 3.52035204e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        9.25000000e-01, 1.12693643e-01, 2.80528053e-02],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        4.88000000e-01, 2.15994011e-04, 2.20022002e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        4.98000000e-01, 1.72597954e-04, 5.50055006e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        5.46000000e-01, 1.40050911e-04, 5.50055006e-04]])

In [14]:
KNNanime = NearestNeighbors(n_neighbors=4, algorithm='ball_tree').fit(data_animes)
distances, indices = KNNanime.kneighbors(data_animes)

def nombres_indices(name):  
    return df_animes[df_animes["titulo"]==name].index.tolist()[0] 

def recomendados_por_anime(nombre): 
    found_id = nombres_indices(nombre)
    for id in indices[found_id][1:]:
            print(df_animes.loc[id]["titulo"])

distances, indices = KNNanime.kneighbors(data_animes)

In [30]:
usuario_ejemplo = 'BuddieVital'
data = df_usuarios[df_usuarios['username'] == usuario_ejemplo]
usuario_ver = data.iloc[0]['userId']-1
usuario0=prediccion.argsort()[usuario_ver]
for i, aRepo in enumerate(usuario0[-2:]):
    selRepo = df_animes[df_animes['animeId']==(aRepo+1)]
    print('Porcentaje:', prediccion[usuario_ver][aRepo])
    print(selRepo.iloc[0][1])
    recomendados_por_anime(selRepo.iloc[0][1])
    print("--------------")

Porcentaje: 1.2056138452553045
Escaflowne
Giniro no Kami no Agito
Eureka Seven: Pocket ga Niji de Ippai
1000-nen Joou: Queen Millennia
--------------
Porcentaje: 1.4619363376780585
Memories Off
I&#039;&#039;s
Nineteen 19
Refrain Blue
--------------
