In [22]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from scipy.special import erfc
from itertools import chain
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples
from tqdm import tqdm
#from tqdm import tqdm_notebook as tqdm # Раскоментируйте если прогресс бар будет странно работать

%matplotlib inline

# Зафиксируем случайность, чтобы у нас получались одинаковые результаты.
np.random.seed(seed=42)

colab = False  # True если используте google colab
if colab:
    from google.colab import drive

    drive.mount('/content/drive/')

if colab:
    anime = pd.read_csv('/content/drive/My Drive/Data/anime/anime.csv.zip', index_col='anime_id')
else:
    anime = pd.read_csv('./archive/anime.csv', index_col='anime_id')
anime.dropna(inplace=True)

if colab:
    ratings = pd.read_csv('/content/drive/My Drive/Data/anime/rating.csv.zip')
else:
    ratings = pd.read_csv('./archive/rating.csv')

ratings.dropna(inplace=True)
ratings['rating'][ratings['rating'] == -1] = np.nan


def chauvenet(array):
    #mask = np.zeros_like(array)
    mask = erfc(np.abs(array - np.mean(array)) / np.std(array)) < 1 / 2 / array.size
    return mask


count_reviews = pd.DataFrame(ratings.groupby('user_id')['anime_id'].count())
count_reviews.rename(columns={'anime_id': 'count_reviews'}, inplace=True)

outlier_users = count_reviews[chauvenet(count_reviews.values)]

bad_user_threshold = outlier_users.min()

ratings = pd.merge(ratings, count_reviews, on='user_id')
ratings = ratings.drop(ratings[ratings['count_reviews'] >= bad_user_threshold.values[0]].index)
ratings = ratings.drop(ratings[ratings['count_reviews'] < ratings['count_reviews'].median()].index)
ratings = ratings.drop(columns=['count_reviews'])

outlier_anime = anime[chauvenet(anime['rating'].values)]
anime = anime.drop(outlier_anime.index)
outlier_anime = anime[chauvenet(anime['members'].values)]
anime = anime.drop(outlier_anime.index)


def flatmap(f, items):
    return chain.from_iterable(map(f, items))  #map - f(для каждого items), chain.from_iterable - распаковывает итератор


def genre_splitter(genre_names):
    return genre_names.split(", ")


m_uniq = anime['genre'].unique()
genres = list(set(flatmap(genre_splitter, m_uniq)))

anime_genre = pd.DataFrame(data=np.zeros((anime.shape[0], len(genres))), columns=[*genres], index=anime.index)

for genre in genres:
    anime_genre[genre] = anime['genre'].str.contains(genre).astype(int)

ratings = pd.merge(ratings, anime_genre, on='anime_id', how='inner').sort_values(by=['user_id', 'anime_id'])

for genre in genres:
    ratings[genre] = ratings[genre] * ratings['rating']
    ratings.loc[ratings[genre] == 0, genre] = np.nan
ratings = ratings.groupby('user_id')[genres].mean()
ratings = ratings.fillna(-1)
scaler = MinMaxScaler()
ratings_scaled = scaler.fit_transform(ratings)
ratings = pd.DataFrame(ratings_scaled, index=ratings.index, columns=ratings.columns)

N = len(genres)

scores = []
range_n_clusters = range(2, N + 1)
inertia = []

for n_clusters in tqdm(range(1, N)):  #tqdm - для отображения прогресса выполнения for
    kmeans = KMeans(n_clusters=n_clusters).fit(ratings_scaled)
    inertia.append(np.sqrt(kmeans.inertia_))
plt.figure(figsize=(10, 7))
plt.plot(range(1, N), inertia, marker='s')
plt.xlabel('$k$')
plt.ylabel('$J(C_k)$')
n_clusters = int(input('input number of clusters: '))
#n_clusters = 10
cluster = KMeans(n_clusters=n_clusters)
cluster_labels = cluster.fit_predict(ratings_scaled)
ratings['cluster_labels'] = (cluster_labels)

while True:
    user = int(input('input user: '))
    if user in ratings.index:
        break
#user = 5

user_cluster = ratings.loc[user]['cluster_labels']

user_cluster_df = ratings[ratings['cluster_labels'] == user_cluster]

cluster_profile = user_cluster_df.drop(columns='cluster_labels').mean()
top_genres = cluster_profile.sort_values(ascending=False).head(5).index.tolist()

top_anime = anime_genre.sort_values(by=top_genres, ascending=False).head(20).index.tolist()

recomendation = anime.loc[top_anime]
print(top_genres)
recomendation.head(10)










  super()._check_params_vs_input(X, default_n_init=10)


['Drama', 'Shounen', 'Comedy', 'Action', 'Supernatural']


Unnamed: 0_level_0,name,genre,type,episodes,rating,members
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
110,Chuuka Ichiban!,"Action, Comedy, Drama, Shounen",TV,52,7.69,8586
132,GetBackers,"Action, Comedy, Drama, Mystery, Shounen, Super...",TV,49,7.73,73536
154,Shaman King,"Action, Adventure, Comedy, Drama, Shounen, Sup...",TV,64,7.83,169517
231,Asagiri no Miko,"Action, Comedy, Drama, Fantasy, Magic, School,...",TV,26,6.31,4721
573,Saber Marionette J,"Action, Adventure, Comedy, Drama, Harem, Marti...",TV,25,7.45,17561
949,Top wo Nerae! Gunbuster,"Action, Comedy, Drama, Mecha, Military, Sci-Fi...",OVA,6,7.97,51724
1132,Digimon Frontier,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,50,7.25,74929
1327,Aoki Densetsu Shoot!,"Action, Comedy, Drama, Romance, School, Shoune...",TV,58,7.53,6937
2013,Taiho Shichau zo The Movie,"Action, Comedy, Drama, Police, Shounen",Movie,1,7.42,4029
2408,Keroro Gunsou Movie 2: Shinkai no Princess de ...,"Action, Adventure, Comedy, Drama, Sci-Fi, Shounen",Movie,1,7.48,2941
