In [1]:
import pandas as pd
import numpy as np

# импортируем scipy для работы с разреженными матрицами
from scipy.sparse import csr_matrix

# импортируем алгоритм k-ближайших соседей
from sklearn.neighbors import NearestNeighbors

__Подготовка данных__

In [2]:
# Read the two input tables from CSV files in the working directory:
# the movie catalogue (movies.csv) and the individual user ratings (ratings.csv).
movies, ratings = (pd.read_csv(csv_path) for csv_path in ('./movies.csv', './ratings.csv'))

In [4]:
# Drop the `genres` column from the movies frame; only movieId and title are used below.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError for the already-removed column.
movies = movies.drop(columns=['genres'], errors='ignore')
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [7]:
# Drop the `timestamp` column from the ratings frame; userId, movieId and rating remain.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError for the already-removed column.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [8]:
# Reshape the long ratings table into a wide matrix with DataFrame.pivot:
# one row per movieId, one column per userId, each cell holding that user's rating
# (NaN where the user has not rated the movie).
user_item_matrix = ratings.pivot(
    index='movieId',
    columns='userId',
    values='rating',
)
user_item_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [9]:
# Replace the missing ratings (NaN) with zeros.
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# (rows, columns) of the matrix, i.e. (movies, users), before any filtering
user_item_matrix.shape

(9724, 610)

Уберем неактивных пользователей и фильмы с небольшим количеством оценок

In [12]:
# Number of ratings given by each user (Series indexed by userId).
user_votes = ratings.groupby('userId')['rating'].count()

# Number of ratings received by each movie (Series indexed by movieId).
movies_votes = ratings.groupby('movieId')['rating'].count()

In [13]:
# Build the selections used for filtering. Despite the *_mask names these are index
# labels, not boolean masks: users with more than 50 ratings and movies with more
# than 10 ratings.
user_mask = user_votes.loc[user_votes.gt(50)].index
movie_mask = movies_votes.loc[movies_votes.gt(10)].index

In [14]:
# Keep only the sufficiently rated movies (rows) and the active users (columns),
# selecting both axes by label in a single .loc call.
user_item_matrix = user_item_matrix.loc[movie_mask, user_mask]

In [15]:
# (movies, users) remaining after the rating-count filters were applied
user_item_matrix.shape

(2121, 378)

Преобразуем нашу разреженную матрицу (sparse matrix) в формат сжатого хранения строкой (compressed sparse row, CSR) с помощью функции
csr_matrix библиотеки SciPy

In [20]:
# Hand only the raw array of ratings (no index or column labels) to csr_matrix.
csr_data = csr_matrix(user_item_matrix.to_numpy())

# Peek at the stored non-zero entries in the top-left 2x5 corner.
print(csr_data[:2, :5])

  (0, 0)	4.0
  (0, 3)	4.5
  (1, 2)	4.0


In [21]:
# Clear the name of the columns axis and turn the movieId index into an ordinary
# column, so that rows get a plain 0..n-1 index. This makes it easy to look a movie
# up by its row position later on.
user_item_matrix = (
    user_item_matrix
    .rename_axis(None, axis='columns')
    .reset_index()
)
user_item_matrix.head()

Unnamed: 0,movieId,1,4,6,7,10,11,15,16,17,...,600,601,602,603,604,605,606,607,608,610
0,1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
1,2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
2,3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
4,6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0


__Обучение модели__

In [26]:
# Configure a brute-force nearest-neighbour search that uses cosine distance.
# n_neighbors=20 is the default number of neighbours for queries that do not
# specify their own value.
knn = NearestNeighbors(
    n_neighbors=20,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)

# Fit the model on the sparse movie-by-user rating matrix.
knn.fit(csr_data)

__Составление рекомендаций__

Зададим изначальные параметры поиска

In [27]:
# How many recommendations we want to get back.
recommendations: int = 10

# Text to look for in the title of the movie the recommendations are based on.
search_word: str = 'Matrix'

Найдем индекс фильма в матрице предпочтений

In [28]:
# Find the movies whose title contains the search term.
# regex=False treats the term as literal text, so characters such as "(" or "+" in a
# title are matched as-is instead of being interpreted as a regular expression.
# na=False makes rows with a missing title count as "no match" rather than producing
# NaN in the boolean mask, which cannot be used for row selection.
movie_search = movies[movies['title'].str.contains(search_word, regex=False, na=False)]
movie_search

Unnamed: 0,movieId,title
1939,2571,"Matrix, The (1999)"
4351,6365,"Matrix Reloaded, The (2003)"
4639,6934,"Matrix Revolutions, The (2003)"


In [29]:
# For simplicity use the first search hit. Fail with a clear message when the search
# found nothing (previously this surfaced as an opaque positional IndexError).
if movie_search.empty:
    raise IndexError('movie_search is empty: no title matched the search term')
target_movie_id = movie_search['movieId'].iloc[0]

# Translate the movieId from the movies frame into the row index of that movie in the
# preference matrix. The movie may be missing there if it was removed by the
# rating-count filter, so guard against an empty match as well.
matrix_rows = user_item_matrix.loc[user_item_matrix['movieId'] == target_movie_id].index
if matrix_rows.empty:
    raise IndexError(
        f'movieId {target_movie_id} is not present in user_item_matrix '
        '(it may have been filtered out for having too few ratings)'
    )

# From here on `movie_id` is the row index in the preference matrix, not the movieId.
movie_id = matrix_rows[0]
movie_id

901

In [30]:
# Query the fitted model with the rating vector of the selected movie to obtain the
# distances to, and row indices of, the most similar movies.
# One neighbour more than `recommendations` is requested; the query movie itself is
# removed from the result in a later step.
query_vector = csr_data[movie_id]
distances, indices = knn.kneighbors(query_vector, n_neighbors=recommendations + 1)

In [31]:
# Row indices of the returned neighbours; 2-D array with one row for the single query
indices

array([[ 901, 1002,  442,  454,  124,  735,  954, 1362, 1157, 1536,  978]],
      dtype=int64)

In [33]:
# Cosine distances to those neighbours, in the same order as `indices`
distances

array([[0.        , 0.22982441, 0.25401128, 0.27565617, 0.27760886,
        0.28691008, 0.29111012, 0.31393358, 0.31405926, 0.31548004,
        0.31748544]])

In [34]:
# kneighbors returns 2-D arrays with one row per query; there is a single query, so take
# row 0 and convert it to a plain list. Unlike squeeze(), this still yields a list when
# only one neighbour was requested (squeeze() would collapse the array to a scalar and
# the zip() below would fail).
indices_list = indices[0].tolist()
distances_list = distances[0].tolist()

# Pair every neighbour's row index with its distance: a list of (index, distance) tuples.
indices_distances = list(zip(indices_list, distances_list))

# Confirm that the elements are tuples.
print(type(indices_distances[0]))

# Look at the first five pairs.
print(indices_distances[:5])

<class 'tuple'>
[(901, 0.0), (1002, 0.22982440568634488), (442, 0.25401128310081567), (454, 0.27565616686043737), (124, 0.2776088577731709)]


In [36]:
# Remove the query movie itself by its row index rather than by its position in the
# sorted list, so the right entry is dropped even if another movie ties with it at
# distance 0.
neighbour_pairs = [pair for pair in indices_distances if pair[0] != movie_id]

# Sort by distance in descending order (furthest first, closest match last).
# NOTE(review): with reverse=True the most similar movie ends up at the bottom of the
# final table (rank 10) - confirm this ordering is intended.
indices_distances_sorted = sorted(neighbour_pairs, key=lambda pair: pair[1], reverse=True)

# If the query movie was not among the neighbours, one extra entry is left over;
# drop the furthest ones so that at most `recommendations` entries remain.
surplus = max(len(indices_distances_sorted) - recommendations, 0)
indices_distances_sorted = indices_distances_sorted[surplus:]
indices_distances_sorted

[(978, 0.31748544046311844),
 (1536, 0.3154800434449465),
 (1157, 0.31405925934381695),
 (1362, 0.31393358217709477),
 (954, 0.2911101181714415),
 (735, 0.2869100842838125),
 (124, 0.2776088577731709),
 (454, 0.27565616686043737),
 (442, 0.25401128310081567),
 (1002, 0.22982440568634488)]

In [37]:
# Resolve the neighbour row indices to movie titles.
# Each entry of the result is a dict holding the title and the distance to the query movie.
recom_list = []

for matrix_row, dist in indices_distances_sorted:

    # movieId stored at this row position of the preference matrix
    matrix_movie_id = user_item_matrix['movieId'].iloc[matrix_row]

    # Look the title up by label-based boolean selection. The previous version passed
    # index *labels* to iloc (which is positional) and therefore only worked while
    # `movies` kept its default 0..n-1 index; it also shadowed the builtin `id`.
    title = movies.loc[movies['movieId'] == matrix_movie_id, 'title'].iloc[0]

    # store each title/distance pair as a dict
    recom_list.append({'Title': title, 'Distance': dist})

In [38]:
# List of {'Title', 'Distance'} dicts, one per recommended movie
recom_list

[{'Title': 'American Beauty (1999)', 'Distance': 0.31748544046311844},
 {'Title': 'Lord of the Rings: The Return of the King, The (2003)',
  'Distance': 0.3154800434449465},
 {'Title': 'Gladiator (2000)', 'Distance': 0.31405925934381695},
 {'Title': 'Lord of the Rings: The Fellowship of the Ring, The (2001)',
  'Distance': 0.31393358217709477},
 {'Title': 'Sixth Sense, The (1999)', 'Distance': 0.2911101181714415},
 {'Title': 'Saving Private Ryan (1998)', 'Distance': 0.2869100842838125},
 {'Title': 'Star Wars: Episode IV - A New Hope (1977)',
  'Distance': 0.2776088577731709},
 {'Title': 'Star Wars: Episode VI - Return of the Jedi (1983)',
  'Distance': 0.27565616686043737},
 {'Title': 'Star Wars: Episode V - The Empire Strikes Back (1980)',
  'Distance': 0.25401128310081567},
 {'Title': 'Fight Club (1999)', 'Distance': 0.22982440568634488}]

In [40]:
# Turn the list of dicts into a table numbered from 1.
# The index length is derived from the list itself, so the cell also works when fewer
# neighbours than `recommendations` were found (a fixed-length index would make
# pandas raise a ValueError because the lengths do not match).
recom_df = pd.DataFrame(recom_list, index=range(1, len(recom_list) + 1))
recom_df

Unnamed: 0,Title,Distance
1,American Beauty (1999),0.317485
2,"Lord of the Rings: The Return of the King, The...",0.31548
3,Gladiator (2000),0.314059
4,"Lord of the Rings: The Fellowship of the Ring,...",0.313934
5,"Sixth Sense, The (1999)",0.29111
6,Saving Private Ryan (1998),0.28691
7,Star Wars: Episode IV - A New Hope (1977),0.277609
8,Star Wars: Episode VI - Return of the Jedi (1983),0.275656
9,Star Wars: Episode V - The Empire Strikes Back...,0.254011
10,Fight Club (1999),0.229824
