In [90]:
import pandas as pd
import numpy as np
from gensim import models
import re
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt

# Relative directories used throughout the notebook: DATA_PATH holds the CSV
# inputs and generated outputs, MODELS_PATH holds the pretrained GloVe vectors.
DATA_PATH = "../../data/"
MODELS_PATH = "../../models/"

In [91]:
# Load the full anime table and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index that was written to the CSV — confirm whether it should be kept.
anime_df = pd.read_csv(DATA_PATH + 'anime_full.csv')
anime_df.head()

Unnamed: 0,MAL_ID,Name,Score,Type,Episodes,Source,Duration,Rating,Popularity,Favorites,...,Super Power,Shoujo Ai,Kids,Police,Slice of Life,Yaoi,Josei,Cars,season,year
0,1.0,Cowboy Bebop,8.78,TV,26.0,Original,1440,18,39.0,61971.0,...,0,0,0,0,0,0,0,0,spring,1998
1,5.0,Cowboy Bebop:The Movie,8.39,Movie,1.0,Original,6900,18,518.0,1174.0,...,0,0,0,0,0,0,0,0,autumn,2001
2,6.0,Trigun,8.24,TV,26.0,Manga,1440,13,201.0,12944.0,...,0,0,0,0,0,0,0,0,spring,1998
3,7.0,Witch Hunter Robin,7.27,TV,26.0,Original,1500,13,1467.0,587.0,...,0,0,0,1,0,0,0,0,summer,2002
4,8.0,Beet the Vandel Buster,6.98,TV,52.0,Manga,1380,7,4369.0,18.0,...,0,0,0,0,0,0,0,0,autumn,2004


In [92]:
# (rows, columns) of the loaded table.
anime_df.shape

(10882, 54)

## Create full description and clustering

In [158]:
def create_full_discription(df):
    """Build one lower-cased, punctuation-free text description per row of ``df``.

    The one-hot genre columns (every column from 'Action' through 'Cars') are
    dropped and replaced by the textual 'Genres' column read from
    ``DATA_PATH + 'anime.csv'`` (left-joined on 'MAL_ID'); 'MAL_ID' is then
    dropped as well. Each remaining row is rendered as ``str(row.to_dict())``,
    lower-cased, and stripped of every character that is neither a word
    character nor whitespace, so both column names and values end up in the
    resulting text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'MAL_ID' and the run of genre columns from 'Action'
        to 'Cars'. The input frame is not modified.

    Returns
    -------
    list of str
        One description string per row, in row order.
    """
    # Slice the column axis directly rather than going through row label 0,
    # so frames whose index does not contain 0 (e.g. filtered frames) work too.
    genre_columns = df.loc[:, 'Action':'Cars'].columns
    df = df.drop(columns=genre_columns)

    # Only the join key and the textual genres are needed from anime.csv.
    genres = pd.read_csv(DATA_PATH + 'anime.csv', usecols=['MAL_ID', 'Genres'])
    df = pd.merge(df, genres, on='MAL_ID', how='left')
    df = df.drop(columns='MAL_ID')

    full_sypn = df.apply(
        lambda row: re.sub(r'[^\w\s]', '', str(row.to_dict()).lower()),
        axis=1,
    ).to_list()
    return full_sypn

In [154]:
def get_clusters(df, n_clusters=20, random_state=22):
    """Return a copy of ``df`` with a KMeans 'cluster' label per row.

    KMeans is fitted on the one-hot genre columns only (every column from
    'Action' through 'Cars'); the resulting labels are stored in a new
    'cluster' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the run of genre columns from 'Action' to 'Cars'.
        The input frame is not modified.
    n_clusters : int, default 20
        Number of clusters passed to ``KMeans``.
    random_state : int, default 22
        Seed passed to ``KMeans`` so the labelling is reproducible.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra 'cluster' column.
    """
    df = df.copy()
    # Slice the column axis directly rather than going through row label 0,
    # so frames whose index does not contain 0 still work.
    genre_columns = df.loc[:, 'Action':'Cars'].columns
    km = KMeans(n_clusters=n_clusters, random_state=random_state)
    km.fit(df[genre_columns])
    df['cluster'] = km.labels_
    return df

In [155]:
# Attach a KMeans label to every row; get_clusters returns a copy with an
# added 'cluster' column, which replaces anime_df here.
anime_df = get_clusters(anime_df)



In [156]:
# Preview the frame, now including the 'cluster' column.
anime_df.head()

Unnamed: 0,MAL_ID,Name,Score,Type,Episodes,Source,Duration,Rating,Popularity,Favorites,...,Shoujo Ai,Kids,Police,Slice of Life,Yaoi,Josei,Cars,season,year,cluster
0,1.0,Cowboy Bebop,8.78,TV,26.0,Original,1440,18,39.0,61971.0,...,0,0,0,0,0,0,0,spring,1998,3
1,5.0,Cowboy Bebop:The Movie,8.39,Movie,1.0,Original,6900,18,518.0,1174.0,...,0,0,0,0,0,0,0,autumn,2001,3
2,6.0,Trigun,8.24,TV,26.0,Manga,1440,13,201.0,12944.0,...,0,0,0,0,0,0,0,spring,1998,5
3,7.0,Witch Hunter Robin,7.27,TV,26.0,Original,1500,13,1467.0,587.0,...,0,0,1,0,0,0,0,summer,2002,14
4,8.0,Beet the Vandel Buster,6.98,TV,52.0,Manga,1380,7,4369.0,18.0,...,0,0,0,0,0,0,0,autumn,2004,16


In [157]:
# Save the clustered table without the MAL_ID column and without the
# DataFrame index.
anime_df.drop(columns='MAL_ID').to_csv(DATA_PATH + 'anime_cl.csv', index=False)

### Create embeddings

In [159]:
# Load the pretrained GloVe vectors: one token per row (used as the index)
# followed by its vector components. quoting=3 is csv.QUOTE_NONE, so quote
# characters that appear as tokens are read literally.
# keep_default_na=False is required because the vocabulary contains tokens
# such as "nan", "null" and "n/a"; with pandas' default NA handling they would
# all be parsed as NaN and collapse into a single NaN key.
glove_df = pd.read_csv(MODELS_PATH + 'glove.6B.300d.txt', sep=" ",
                       quoting=3, header=None, index_col=0,
                       keep_default_na=False)
# Map token -> 1-D vector. Zipping the index with the row array avoids
# transposing the whole frame and building one Series per token.
glove_model = dict(zip(glove_df.index, glove_df.to_numpy()))

In [160]:
# Build one normalised text string per anime row (the helper also reads
# anime.csv to bring in the textual 'Genres' column).
full_sypn = create_full_discription(anime_df)

In [161]:
def glove(sypn_list):
    """Embed each text as the sum of its words' GloVe vectors and save the result.

    Every string in ``sypn_list`` is split on whitespace; each word found in
    the module-level ``glove_model`` mapping contributes its 300-dimensional
    vector to that row's sum. Words missing from ``glove_model`` are skipped,
    so a text with no known words yields an all-zero row.

    The resulting ``(len(sypn_list), 300)`` matrix is written with
    ``np.savez`` to ``DATA_PATH + 'matrix'``; nothing is returned.

    Parameters
    ----------
    sypn_list : list of str
        Texts to embed, one matrix row per entry.
    """
    vector_matrix = np.empty((len(sypn_list), 300))

    for index, each_sentence in enumerate(sypn_list):
        sentence_vector = np.zeros((300,))
        for each_word in each_sentence.split():
            try:
                sentence_vector += glove_model[each_word]
            except KeyError:
                # Out-of-vocabulary word: skip it. Only KeyError is caught so
                # that real problems (bad vector shape, KeyboardInterrupt, ...)
                # are no longer silently swallowed.
                continue
        vector_matrix[index] = sentence_vector

    np.savez(DATA_PATH + 'matrix', vector_matrix)

In [162]:
# Compute the description embeddings and write them to DATA_PATH + 'matrix'
# (np.savez appends the .npz extension).
glove(full_sypn)