In [44]:
import pandas as pd
import numpy as np
import os
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
def load_movielens_ml100k(data_dir="../data/ml-100k"):
    path_data = os.path.join(data_dir, "u.data")
    cols = ["user_id", "item_id", "rating", "timestamp"]
    ratings = pd.read_csv(path_data, sep='\t', names=cols, engine='python')

    path_item = os.path.join(data_dir, "u.item")
    movie_cols = ["item_id", "title", "release_date", "video_release_date", "IMDb_url"] + \
                 ["genre_" + str(g) for g in range(19)]
    movies = pd.read_csv(path_item, sep='|', names=movie_cols, engine='python', encoding='latin-1')

    return ratings, movies

ratings_df, movies_df = load_movielens_ml100k("../data/ml-100k")

print("Рейтинги:")
ratings_df["timestamp"] = pd.to_datetime(ratings_df["timestamp"], unit='s')
display(ratings_df.head())
print("\nФильмы:")
display(movies_df.head())


Рейтинги:


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16



Фильмы:


Unnamed: 0,item_id,title,release_date,video_release_date,IMDb_url,genre_0,genre_1,genre_2,genre_3,genre_4,...,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [25]:
user_item_matrix = ratings_df.pivot(
    index='user_id', columns='item_id', values='rating'
).fillna(0)

In [26]:
user_item_sparse = csr_matrix(user_item_matrix.values)

In [29]:
user_item_matrix.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
item_similarity = cosine_similarity(user_item_sparse.T)
item_similarity.shape

(1682, 1682)

In [37]:
movies_df.head()

Unnamed: 0,item_id,title,release_date,video_release_date,IMDb_url,genre_0,genre_1,genre_2,genre_3,genre_4,...,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [40]:
toy_story_id = movies_df["item_id"][movies_df["title"].str.contains("Toy Story")].values[0]
print(toy_story_id)

1


In [41]:
toy_story_id -= 1 # датасет нумерует фильмы с 1


In [43]:
similarities = item_similarity[toy_story_id]
print(similarities)

[1.         0.40238218 0.33024479 ... 0.         0.04718307 0.04718307]


In [49]:
topN = 10
# разворот
# сортировка и выбор топ n
# скип 1, иначе 1.0
indices = np.argsort(similarities)[::-1][1:topN]

movies_df.iloc[indices][["title"]]

Unnamed: 0,title
49,Star Wars (1977)
180,Return of the Jedi (1983)
120,Independence Day (ID4) (1996)
116,"Rock, The (1996)"
404,Mission: Impossible (1996)
150,Willy Wonka and the Chocolate Factory (1971)
221,Star Trek: First Contact (1996)
99,Fargo (1996)
236,Jerry Maguire (1996)
