### Использование датасетов о рейтинге фильмов MovieLens Latest Datasets.


In [1]:
import pandas as pd
import numpy as np

# модуль sparse библиотеки scipy понадобится для работы с разреженными матрицами 
from scipy.sparse import csr_matrix

from sklearn.neighbors import NearestNeighbors

# Load the game catalogue and the user ratings from CSV files
# (paths are relative to the current working directory).
games = pd.read_csv('all_games.csv')
ratings = pd.read_csv('Games_ratings.csv')

In [2]:
# Preview the first five rows of the games table.
games.head(5) 

Unnamed: 0,name,gameId,platform,release_date
0,The Legend of Zelda: Ocarina of Time,1,Nintendo 64,"November 23, 1998"
1,Tony Hawk's Pro Skater 2,2,PlayStation,"September 20, 2000"
2,Grand Theft Auto IV,3,PlayStation 3,"April 29, 2008"
3,SoulCalibur,4,Dreamcast,"September 8, 1999"
4,Grand Theft Auto IV,5,Xbox 360,"April 29, 2008"


In [3]:
# Preview the first five rows of the ratings table.
ratings.head(5)

Unnamed: 0,userId,gameId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Remove the columns that are not used anywhere below:
# the release date of a game and the timestamp of a rating.
games.drop(columns=['release_date'], inplace=True)
ratings.drop(columns=['timestamp'], inplace=True)

### Создание матрицы предпочтений

In [5]:
# Build the preference matrix: one row per gameId, one column per userId,
# and each cell holds the rating that user gave to that game
# (NaN where the user did not rate the game).
user_item_matrix = ratings.pivot(
    index='gameId',
    columns='userId',
    values='rating',
)
user_item_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
gameId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [6]:
# Replace missing ratings with 0 so that every cell of the matrix is numeric.
user_item_matrix = user_item_matrix.fillna(0)
print(user_item_matrix.shape)
user_item_matrix.head()


(9724, 610)


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
gameId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


**Чистка матрицы**

Необходимо убрать неактивных пользователей и игры с небольшим количеством оценок. С одной стороны, такие пользователи не окажут существенного влияния на расстояния между играми, с другой — малому количеству оценок довольно сложно доверять.

In [7]:
# Count how many ratings each user has given ...
users_votes = ratings.groupby('userId')['rating'].count()

# ... and how many ratings each game has received.
game_votes = ratings.groupby('gameId')['rating'].count()

# Labels that pass the activity thresholds:
# users with more than 50 ratings and games with more than 10 ratings.
user_mask = users_votes.loc[users_votes.gt(50)].index
game_mask = game_votes.loc[game_votes.gt(10)].index

# Keep only the sufficiently rated games (rows) and the active users (columns).
user_item_matrix = user_item_matrix.loc[game_mask, user_mask]

print(user_item_matrix.shape)
user_item_matrix.head()

(2121, 378)


userId,1,4,6,7,10,11,15,16,17,18,...,600,601,602,603,604,605,606,607,608,610
gameId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,3.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0


### Преобразование разреженной матрицы
Преобразование данных матрицы в формат сжатого построчного хранения (CSR, Compressed Sparse Row).

In [8]:
# Convert the ratings to a compressed-sparse-row matrix. Only the raw values are
# passed, so the gameId / userId labels are not carried into csr_data; rows and
# columns are addressed by position from here on.
rating_values = user_item_matrix.to_numpy()
csr_data = csr_matrix(rating_values)
print(csr_data)

  (0, 0)	4.0
  (0, 3)	4.5
  (0, 6)	2.5
  (0, 8)	4.5
  (0, 9)	3.5
  (0, 10)	4.0
  (0, 12)	3.5
  (0, 16)	3.0
  (0, 19)	3.0
  (0, 20)	3.0
  (0, 25)	5.0
  (0, 28)	5.0
  (0, 29)	4.0
  (0, 31)	3.0
  (0, 34)	5.0
  (0, 38)	5.0
  (0, 39)	4.0
  (0, 40)	4.0
  (0, 41)	2.5
  (0, 43)	4.5
  (0, 46)	0.5
  (0, 47)	4.0
  (0, 50)	2.5
  (0, 53)	4.0
  (0, 55)	3.0
  :	:
  (2118, 205)	4.0
  (2118, 345)	1.5
  (2118, 357)	4.0
  (2118, 369)	4.5
  (2119, 37)	3.5
  (2119, 62)	3.0
  (2119, 98)	0.5
  (2119, 127)	4.5
  (2119, 156)	4.5
  (2119, 236)	0.5
  (2119, 256)	4.5
  (2119, 317)	2.0
  (2119, 345)	2.0
  (2119, 357)	5.0
  (2119, 365)	3.5
  (2120, 37)	4.0
  (2120, 62)	5.0
  (2120, 146)	2.5
  (2120, 155)	4.5
  (2120, 156)	5.0
  (2120, 186)	5.0
  (2120, 205)	4.0
  (2120, 236)	3.0
  (2120, 317)	3.5
  (2120, 357)	4.0


In [9]:
# Clear the name of the column axis, then move gameId out of the index into an
# ordinary column. The new 0..n-1 row index follows the same row order that was
# used to build csr_data, so it can be used to address rows of csr_data.
unnamed_columns = user_item_matrix.rename_axis(None, axis=1)
user_item_matrix = unnamed_columns.reset_index()
user_item_matrix.head()

Unnamed: 0,gameId,1,4,6,7,10,11,15,16,17,...,600,601,602,603,604,605,606,607,608,610
0,1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
1,2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
2,3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
4,6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0


### Создание модели машинного обучения 

На основе метода k-ближайших соседей

In [10]:
# Nearest-neighbour search over the rows of the rating matrix:
#   metric='cosine'   - compare rating vectors by cosine distance
#   algorithm='brute' - exhaustive comparison against every row
#   n_neighbors=20    - default number of neighbours returned by a query
#   n_jobs=-1         - run the search in parallel on all available processors
knn = NearestNeighbors(
    metric='cosine',
    algorithm='brute',
    n_neighbors=20,
    n_jobs=-1,
)

# Fit the model on the sparse matrix (one sample per game row).
knn.fit(csr_data)

### Получение рекомендаций

In [11]:
# Number of recommendations to produce.
recommendations = 10
# Text to look for in the game titles.
search_word = 'Zelda'

In [12]:
# Select the games whose title contains the search text.
# regex=False matches the text literally, so characters such as "(" or "+" in a
# search string are not interpreted as a regular-expression pattern.
# na=False treats a missing title as "no match"; without it the mask would
# contain NaN and could not be used for boolean indexing.
title_matches = games['name'].str.contains(search_word, regex=False, na=False)
game_search = games[title_matches]
game_search

Unnamed: 0,name,gameId,platform
0,The Legend of Zelda: Ocarina of Time,1,Nintendo 64
13,The Legend of Zelda: Breath of the Wild,14,Switch
26,The Legend of Zelda: Breath of the Wild,27,Wii U
36,The Legend of Zelda: Twilight Princess,37,GameCube
40,The Legend of Zelda: The Wind Waker,41,GameCube
49,The Legend of Zelda Collector's Edition,50,GameCube
53,The Legend of Zelda: A Link to the Past,54,Game Boy Advance
54,The Legend of Zelda: Majora's Mask,55,Nintendo 64
56,The Legend of Zelda: Twilight Princess,57,Wii
79,The Legend of Zelda: Ocarina of Time 3D,80,3DS


In [13]:
# Use the first search hit as the game to recommend for.
if game_search.empty:
    raise ValueError('No game title contains {!r}'.format(search_word))
searched_game_id = game_search['gameId'].iloc[0]

# Translate that gameId into the row index of the preference matrix.
# The game may be absent there because rarely rated games were filtered out.
matrix_rows = user_item_matrix.loc[user_item_matrix['gameId'] == searched_game_id].index
if matrix_rows.empty:
    raise ValueError(
        'Game {!r} is not present in the preference matrix'.format(searched_game_id)
    )

# From here on game_id is the row index in the matrix, not the gameId.
game_id = matrix_rows[0]
game_id

0

In [14]:
# Show the stored (non-zero) ratings in the matrix row of the selected game.
print(csr_data[game_id])

  (0, 0)	4.0
  (0, 3)	4.5
  (0, 6)	2.5
  (0, 8)	4.5
  (0, 9)	3.5
  (0, 10)	4.0
  (0, 12)	3.5
  (0, 16)	3.0
  (0, 19)	3.0
  (0, 20)	3.0
  (0, 25)	5.0
  (0, 28)	5.0
  (0, 29)	4.0
  (0, 31)	3.0
  (0, 34)	5.0
  (0, 38)	5.0
  (0, 39)	4.0
  (0, 40)	4.0
  (0, 41)	2.5
  (0, 43)	4.5
  (0, 46)	0.5
  (0, 47)	4.0
  (0, 50)	2.5
  (0, 53)	4.0
  (0, 55)	3.0
  :	:
  (0, 338)	5.0
  (0, 339)	3.0
  (0, 340)	4.0
  (0, 341)	4.5
  (0, 345)	3.5
  (0, 346)	4.0
  (0, 348)	4.0
  (0, 349)	5.0
  (0, 351)	4.0
  (0, 352)	3.0
  (0, 355)	5.0
  (0, 358)	5.0
  (0, 360)	4.0
  (0, 365)	4.0
  (0, 366)	4.0
  (0, 367)	3.0
  (0, 368)	2.5
  (0, 369)	4.0
  (0, 371)	4.0
  (0, 372)	3.0
  (0, 373)	4.0
  (0, 374)	2.5
  (0, 375)	4.0
  (0, 376)	2.5
  (0, 377)	5.0


In [15]:
# Find the nearest neighbours of the selected game.
# One neighbour more than the number of recommendations is requested, because
# the query row is itself part of the fitted data and is removed later on.
# distances holds the distances to the neighbours, indices their row numbers.
neighbour_count = recommendations + 1
query_row = csr_data[game_id]
distances, indices = knn.kneighbors(query_row, n_neighbors=neighbour_count)


In [16]:
# Flatten the result arrays into plain Python lists.
# ravel() always yields a 1-D array, whereas squeeze() collapses a
# single-neighbour result to a 0-d array whose tolist() is a bare scalar
# that zip() cannot iterate over.
indices_list = indices.ravel().tolist()
distances_list = distances.ravel().tolist()

# Pair every neighbour row index with its distance.
indices_distances = list(zip(indices_list, distances_list))

# Each element is a (row index, distance) tuple.
print(type(indices_distances[0]))

# Look at the first three pairs.
print(indices_distances[:3])

<class 'tuple'>
[(0, 0.0), (217, 0.33488418488394356), (169, 0.3565415477121445)]


In [17]:
# Order the neighbours from the closest to the farthest.
indices_distances_sorted = sorted(indices_distances, key=lambda pair: pair[1])

# Remove the searched game itself. It is matched by its row index instead of
# being assumed to come first: another game with an identical rating vector
# also has distance 0 and may be listed before it. The result is then capped
# at the requested number of recommendations.
indices_distances_sorted = [
    pair for pair in indices_distances_sorted if pair[0] != game_id
][:recommendations]
indices_distances_sorted

[(217, 0.33488418488394356),
 (169, 0.3565415477121445),
 (1048, 0.37163656407393986),
 (124, 0.3807886031957386),
 (1292, 0.3814720344599547),
 (67, 0.38463266223098214),
 (454, 0.39066271577950284),
 (141, 0.3982930777974919),
 (173, 0.3985780912457597),
 (500, 0.39949482401724845)]

In [18]:
# Map every neighbour row index to a game title.
recom_list = []

# Read gameId from its own column so it keeps its integer dtype
# (taking a whole row first would upcast it to float).
matrix_game_ids = user_item_matrix['gameId']

for matrix_row, dist in indices_distances_sorted:

    # gameId stored at this row position of the preference matrix
    matrix_game_id = matrix_game_ids.iloc[matrix_row]

    # Titles of the catalogue rows with that gameId. A boolean mask with .loc is
    # used because the index of `games` holds labels, which are not guaranteed
    # to equal the positions that .iloc expects.
    matching_titles = games.loc[games['gameId'] == matrix_game_id, 'name']
    title = matching_titles.values[0]

    # Store the pair as a dictionary; the list of dictionaries is later
    # converted into a data frame.
    recom_list.append({'Title': title, 'Distance': dist})

In [19]:
# Show the first five recommendations.
recom_list[:5]

[{'Title': 'Okami', 'Distance': 0.33488418488394356},
 {'Title': "Tony Hawk's Pro Skater 3", 'Distance': 0.3565415477121445},
 {'Title': 'Far Cry 5', 'Distance': 0.37163656407393986},
 {'Title': 'PaRappa the Rapper', 'Distance': 0.3807886031957386},
 {'Title': 'FIFA Soccer 06', 'Distance': 0.3814720344599547}]

In [20]:
# Convert the recommendations into a data frame ranked from 1.
# The index length follows the actual number of recommendations, so a shorter
# list (fewer neighbours than requested) does not raise a length mismatch, and
# the columns are fixed so an empty list still yields the expected layout.
recom_df = pd.DataFrame(
    recom_list,
    columns=['Title', 'Distance'],
    index=range(1, len(recom_list) + 1),
)
recom_df

Unnamed: 0,Title,Distance
1,Okami,0.334884
2,Tony Hawk's Pro Skater 3,0.356542
3,Far Cry 5,0.371637
4,PaRappa the Rapper,0.380789
5,FIFA Soccer 06,0.381472
6,Okami,0.384633
7,Hex: Shards of Fate,0.390663
8,The Witcher 3: Wild Hunt,0.398293
9,Neverwinter Nights,0.398578
10,Football Manager 2013,0.399495
