Есть таблица истории просмотров фильмов в онлайн-кинотеатре.
Два поля: Пользователь, Фильм
Множество строк: тысячи пользователей и фильмов.

Нужно сделать SQL запрос или скрипт на Python,
который бы реализовал логику алгоритма рекомендаций фильмов к просмотру, которые пользователи еще не смотрели.
Логику рекомендаций предлагайте на свое усмотрение.
На выходе нужна таблица: Пользователь, Рекомендованный фильм, Рейтинг рекомендации


Поскольку в задаче нет дополнительной информации о фильмах и пользователях, которую можно было бы анализировать (описание фильмов, жанры, год выпуска, возраст зрителя и т. д.), будем реализовывать алгоритм, основанный на коллаборативной фильтрации (**Collaborative filtering**).

Суть состоит в том, чтобы вычислить степень попарной "похожести" фильмов друг на друга. **Чем больше пользователей, посмотревших хотя бы один из двух фильмов, посмотрело оба фильма**, тем больше фильмы похожи. Таким образом, для N фильмов получаем матрицу NxN, которую используем для формирования рекомендаций для пользователя по его истории просмотров.

In [357]:
import numpy as np

In [358]:
import pandas as pd

In [359]:
# Colab-only setup: mount Google Drive so that files under /content/drive
# (the ratings CSV is read from there) become accessible.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [360]:
# Use the MovieLens "latest small" dataset for testing:
# https://grouplens.org/datasets/movielens/
# Only the movieId, userId and rating columns are kept, in that order.
data = pd.read_csv('/content/drive/MyDrive/ratings.csv')[['movieId', 'userId', 'rating']]

In [361]:
data

Unnamed: 0,movieId,userId,rating
0,1,1,4.0
1,3,1,4.0
2,6,1,4.0
3,47,1,5.0
4,50,1,5.0
...,...,...,...
100831,166534,610,4.0
100832,168248,610,5.0
100833,168250,610,5.0
100834,168252,610,5.0


In [362]:
# For performance, keep only the first 1000 distinct movie ids (in order of
# first appearance) and drop every row that refers to any other movie.
mIds = data['movieId'].unique()[:1000]
data = data[data['movieId'].isin(mIds)]

In [363]:
data

Unnamed: 0,movieId,userId,rating
0,1,1,4.0
1,3,1,4.0
2,6,1,4.0
3,47,1,5.0
4,50,1,5.0
...,...,...,...
100672,109374,610,4.5
100673,109487,610,3.5
100732,119145,610,4.5
100737,122882,610,5.0


In [364]:
# Split the data into train (70 %) and test (30 %) parts.
from sklearn.model_selection import train_test_split
# A fixed seed keeps the split, and therefore the reported metric, reproducible
# between runs.
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

For each movie, find the set of users who watched it and the size of that set;
then compute the pairwise intersections of these sets in a loop.

In [365]:
#for each movie find users who watched it and count them
def get_reviewers(data):
    """Collect, for every movie, the set of users who watched it.

    Args:
        data: DataFrame with at least the columns ``movieId`` and ``userId``;
            one row per view.

    Returns:
        A tuple ``(df, ukeys)``:

        * ``df`` has one row per distinct movie, ordered by movie id, with
          the columns ``movieId``, ``usersSet`` (set of user ids) and
          ``userCount`` (number of *distinct* users, i.e. ``len(usersSet)``).
        * ``ukeys`` is the sorted array of distinct movie ids.
    """
    pairs = data[['movieId', 'userId']].sort_values('movieId')
    movie_ids = pairs['movieId'].to_numpy()
    user_ids = pairs['userId'].to_numpy()

    # first_idx[j] is the position where the j-th distinct movie starts
    # in the sorted array, so it doubles as the list of split points.
    ukeys, first_idx = np.unique(movie_ids, return_index=True)

    # np.split on an empty array would still return one (empty) chunk, which
    # does not match the zero movie ids, so handle empty input explicitly.
    groups = np.split(user_ids, first_idx[1:]) if len(ukeys) else []
    users_sets = [set(group.tolist()) for group in groups]

    df = pd.DataFrame({
        'movieId': ukeys,
        'usersSet': users_sets,
        # Count distinct users: repeated views by the same user must not
        # inflate the set size used in the Jaccard denominator.
        'userCount': [len(users) for users in users_sets],
    })
    return df, ukeys

In [366]:
# movieViewers: one row per movie present in the training split;
# k: the sorted distinct movie ids of the training split.
movieViewers, k = get_reviewers(train_data)

In [367]:
# Create an empty (all-NaN) float frame with one positional row and column per
# training movie; it only provides the shape and the integer labels that
# jaccard_index is applied over.
similarityMatrix = pd.DataFrame(index=list(range(len(k))), columns=list(range(len(k))), dtype=float)

In [368]:
similarityMatrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,989,990
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
986,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
987,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
988,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
989,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [369]:
#to compute similarity jaccard index is used https://en.wikipedia.org/wiki/Jaccard_index
def jaccard_index(x, movie_viewers):
    """Jaccard similarity between movie ``x.name`` and every earlier movie.

    Intended to be used with ``DataFrame.apply`` over the columns of a frame
    whose row and column labels are the positions ``0..n-1``.

    Args:
        x: Column being filled. Only ``x.name`` (position of the movie the
            column belongs to) and ``x.index`` (positions of the other
            movies, expected in ascending order) are used.
        movie_viewers: Frame whose row ``i`` describes movie ``i``; column 1
            holds the set of users and column 2 the size of that set.

    Returns:
        A float array of length ``len(x.index)``. Entry ``i`` is
        ``|A & B| / |A | B|`` for ``i < x.name`` and ``0.0`` otherwise, so
        only the part above the diagonal is computed.
    """
    # Read the row array once: DataFrame.values may rebuild the whole array
    # on every access for a frame with mixed column types, and the original
    # loop accessed it for each pair of movies.
    rows = movie_viewers.values
    res = np.zeros(shape=(len(x.index),))

    target = rows[x.name]
    b_set = target[1]
    b_count = target[2]
    if b_count == 0:
        return res

    for i in x.index:
        if i >= x.name:
            # only the part above the diagonal is computed; the caller
            # mirrors it afterwards
            break
        other = rows[i]
        a_set = other[1]
        a_count = other[2]
        if a_count == 0:
            continue  # res[i] is already 0.0
        intersection = len(a_set & b_set)
        res[i] = intersection / (a_count + b_count - intersection)

    return res

In [370]:
#!pip install swifter

In [371]:
#import swifter

In [372]:
# Compute the similarity matrix column by column; jaccard_index only fills
# the entries above the diagonal of each column and leaves the rest at 0.
res2 = similarityMatrix.apply(jaccard_index, args=(movieViewers,))

In [373]:
# Mirror the computed upper triangle into the lower one by adding the
# transpose, then set every movie's similarity with itself to 1.
res2 = res2.values.T + res2.values
np.fill_diagonal(res2, 1.0)

In [374]:
# Re-label rows and columns with the real movie ids instead of positions.
similarityMatrix = pd.DataFrame(res2, index=k, columns=k)

In [375]:
similarityMatrix

Unnamed: 0,1,2,3,4,5,6,7,8,10,11,13,15,16,17,19,21,22,24,25,26,27,31,32,34,36,39,41,43,44,45,46,47,50,52,54,58,60,61,62,65,...,87222,88163,88810,89774,91104,91529,91658,92259,94070,95167,95449,95510,95543,96079,97024,97938,98203,99114,103335,103339,104374,105211,106489,106696,106782,107141,109374,109487,109853,112006,112552,113275,113394,114060,115713,119145,122882,136020,137595,140110
1,1.000000,0.176166,0.115607,0.006289,0.103448,0.143590,0.142857,0.012658,0.150485,0.127778,0.025806,0.012422,0.085106,0.157303,0.157068,0.134715,0.047059,0.074074,0.088542,0.025000,0.006369,0.058480,0.246637,0.221649,0.104396,0.162162,0.025000,0.019355,0.087209,0.058824,0.025157,0.197581,0.180000,0.068323,0.012987,0.070175,0.096386,0.012658,0.147541,0.104938,...,0.037267,0.060976,0.055901,0.012579,0.012579,0.093264,0.045714,0.017241,0.006369,0.060241,0.012903,0.066667,0.012658,0.069364,0.000000,0.065476,0.000000,0.080645,0.037500,0.012739,0.025000,0.000000,0.017857,0.053892,0.050000,0.012658,0.094118,0.095745,0.000000,0.006452,0.035088,0.000000,0.000000,0.006410,0.050314,0.041667,0.074286,0.024242,0.000000,0.012346
2,0.176166,1.000000,0.142857,0.025974,0.144330,0.118110,0.057143,0.039474,0.155556,0.119266,0.040000,0.037975,0.078947,0.126126,0.186441,0.159664,0.089888,0.056818,0.075630,0.012195,0.026667,0.063830,0.158824,0.130435,0.100917,0.145299,0.024691,0.013158,0.081633,0.031250,0.051282,0.200000,0.169399,0.022472,0.013514,0.062500,0.109890,0.025974,0.121739,0.088889,...,0.023810,0.094118,0.047059,0.000000,0.012658,0.140351,0.073684,0.032258,0.026667,0.043956,0.013333,0.091954,0.012821,0.118280,0.000000,0.065217,0.000000,0.100917,0.036585,0.040000,0.024691,0.013699,0.058824,0.043956,0.069307,0.025974,0.039604,0.096491,0.000000,0.000000,0.066667,0.000000,0.000000,0.027027,0.048780,0.032967,0.091837,0.073171,0.000000,0.037500
3,0.115607,0.142857,1.000000,0.022727,0.203125,0.161290,0.149254,0.046512,0.140187,0.086420,0.047619,0.021277,0.141026,0.070588,0.127660,0.142857,0.125000,0.053571,0.093023,0.088889,0.023810,0.118644,0.086667,0.070175,0.162162,0.111111,0.088889,0.048780,0.125000,0.065574,0.043478,0.123457,0.090909,0.036364,0.025000,0.046154,0.135593,0.071429,0.104651,0.207547,...,0.000000,0.053571,0.000000,0.000000,0.000000,0.021277,0.030303,0.000000,0.023810,0.033898,0.050000,0.033898,0.000000,0.060606,0.000000,0.015873,0.000000,0.048780,0.020000,0.023256,0.020833,0.025641,0.018182,0.000000,0.027778,0.000000,0.014286,0.011111,0.000000,0.000000,0.016393,0.000000,0.000000,0.024390,0.019608,0.034483,0.013889,0.018868,0.000000,0.000000
4,0.006289,0.025974,0.022727,1.000000,0.073171,0.027397,0.100000,0.090909,0.022989,0.018519,0.100000,0.000000,0.037037,0.074074,0.042857,0.044118,0.000000,0.040000,0.070175,0.142857,0.000000,0.031250,0.007752,0.022989,0.019231,0.030769,0.066667,0.000000,0.000000,0.066667,0.250000,0.013605,0.027972,0.090909,0.000000,0.000000,0.030303,0.090909,0.016393,0.033333,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0.103448,0.144330,0.203125,0.073171,1.000000,0.150538,0.225806,0.047619,0.070796,0.129870,0.048780,0.000000,0.100000,0.153846,0.105263,0.131868,0.050847,0.137255,0.081395,0.090909,0.050000,0.160714,0.094595,0.141509,0.118421,0.125000,0.090909,0.050000,0.092308,0.084746,0.093023,0.096970,0.071856,0.098039,0.025641,0.015152,0.081967,0.100000,0.132530,0.125000,...,0.040816,0.074074,0.018868,0.022727,0.046512,0.055556,0.046875,0.016667,0.050000,0.071429,0.025000,0.016949,0.023256,0.078125,0.000000,0.032787,0.025000,0.089744,0.041667,0.048780,0.021277,0.026316,0.018519,0.016949,0.028169,0.000000,0.044776,0.034483,0.000000,0.000000,0.016667,0.000000,0.000000,0.025000,0.020000,0.053571,0.043478,0.039216,0.000000,0.021277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119145,0.041667,0.032967,0.034483,0.000000,0.053571,0.022727,0.017241,0.038462,0.050505,0.029412,0.000000,0.000000,0.014286,0.028169,0.023256,0.023810,0.000000,0.051282,0.000000,0.033333,0.041667,0.043478,0.028369,0.061224,0.014925,0.037975,0.000000,0.000000,0.058824,0.021739,0.000000,0.025000,0.045161,0.000000,0.045455,0.000000,0.000000,0.000000,0.013158,0.045455,...,0.062500,0.138889,0.057143,0.037037,0.076923,0.130435,0.086957,0.023256,0.041667,0.102564,0.043478,0.131579,0.125000,0.209302,0.047619,0.150000,0.043478,0.152542,0.100000,0.000000,0.107143,0.000000,0.085714,0.102564,0.191489,0.038462,0.104167,0.140625,0.047619,0.045455,0.157895,0.047619,0.047619,0.000000,0.172414,1.000000,0.222222,0.090909,0.047619,0.068966
122882,0.074286,0.091837,0.013889,0.000000,0.043478,0.040404,0.014085,0.000000,0.063636,0.012195,0.000000,0.023810,0.050000,0.000000,0.097826,0.020619,0.000000,0.018868,0.022989,0.023256,0.000000,0.051724,0.039474,0.083333,0.025316,0.021505,0.023256,0.027027,0.080645,0.016949,0.000000,0.066265,0.093750,0.019608,0.000000,0.000000,0.033333,0.025641,0.022727,0.035088,...,0.119048,0.148936,0.086957,0.108108,0.025000,0.213333,0.125000,0.075472,0.000000,0.120000,0.027778,0.217391,0.081081,0.226415,0.000000,0.229167,0.000000,0.265625,0.121951,0.114286,0.073171,0.000000,0.085106,0.191489,0.277778,0.081081,0.157895,0.246377,0.000000,0.028571,0.163265,0.000000,0.000000,0.000000,0.146341,0.222222,1.000000,0.166667,0.000000,0.128205
136020,0.024242,0.073171,0.018868,0.000000,0.039216,0.063291,0.000000,0.000000,0.042553,0.000000,0.000000,0.000000,0.048387,0.000000,0.025000,0.025641,0.000000,0.000000,0.014493,0.000000,0.000000,0.024390,0.037313,0.020833,0.000000,0.000000,0.000000,0.000000,0.021277,0.000000,0.000000,0.053333,0.054054,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.025641,...,0.120000,0.093750,0.068966,0.100000,0.047619,0.142857,0.100000,0.085714,0.117647,0.156250,0.000000,0.233333,0.105263,0.150000,0.066667,0.250000,0.058824,0.148148,0.173913,0.250000,0.190476,0.066667,0.230769,0.121212,0.136364,0.050000,0.146341,0.116667,0.066667,0.133333,0.085714,0.066667,0.066667,0.058824,0.037037,0.090909,0.166667,1.000000,0.066667,0.190476
137595,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.076923,0.050000,0.062500,0.000000,0.142857,0.017544,0.000000,0.000000,0.250000,0.000000,0.000000,0.045455,0.166667,0.000000,1.000000,0.040000,0.333333,0.000000,0.083333,0.000000,0.100000,0.000000,0.058824,0.045455,0.028571,0.000000,0.000000,0.019231,1.000000,0.500000,0.000000,1.000000,1.000000,0.000000,0.000000,0.047619,0.000000,0.066667,1.000000,0.100000


In [376]:
# Re-index to the full movie list (mIds). Movies that are absent from the
# training split get all-zero rows and columns, including a zero similarity
# with themselves.
similarityMatrix = pd.DataFrame(similarityMatrix, index=mIds, columns=mIds).fillna(0.0)

In [377]:
similarityMatrix.isnull().values.any()

False

In [378]:
similarityMatrix

Unnamed: 0,1,3,6,47,50,70,101,110,151,157,163,216,223,231,235,260,296,316,333,349,356,362,367,423,441,457,480,500,527,543,552,553,590,592,593,596,608,648,661,673,...,107141,109374,109853,112006,113275,113394,119145,129428,136020,137595,140110,44,376,511,529,1100,1358,1370,1385,1438,1518,1586,1604,1608,1616,1687,1693,1721,1840,1882,1918,2002,2027,1357,1405,1876,2072,2100,2421,2485
1,1.000000,0.115607,0.143590,0.197581,0.180000,0.101695,0.056604,0.226415,0.094118,0.044872,0.115385,0.100000,0.171717,0.187500,0.109890,0.245211,0.234323,0.162791,0.079545,0.113744,0.246795,0.071429,0.198113,0.032258,0.082353,0.224576,0.255814,0.235294,0.184906,0.071006,0.093923,0.108696,0.200873,0.188525,0.214035,0.117647,0.247788,0.284314,0.134146,0.181818,...,0.012658,0.094118,0.0,0.006452,0.0,0.0,0.041667,0.0,0.024242,0.0,0.012346,0.087209,0.086705,0.006494,0.071429,0.050314,0.084337,0.128492,0.061350,0.055901,0.012500,0.037267,0.000000,0.087719,0.049383,0.037736,0.050000,0.222222,0.032051,0.093750,0.074074,0.107784,0.012658,0.042945,0.086420,0.094118,0.043210,0.109195,0.073620,0.060976
3,0.115607,1.000000,0.161290,0.123457,0.090909,0.095890,0.104167,0.071429,0.109375,0.090909,0.157895,0.142857,0.125000,0.168142,0.115385,0.088083,0.074689,0.125000,0.171875,0.153846,0.096000,0.120690,0.094488,0.022727,0.131148,0.129870,0.082902,0.151261,0.081522,0.178571,0.220588,0.202703,0.118881,0.121795,0.069444,0.136364,0.098684,0.139535,0.126984,0.081081,...,0.000000,0.014286,0.0,0.000000,0.0,0.0,0.034483,0.0,0.018868,0.0,0.000000,0.125000,0.123077,0.025641,0.101695,0.106383,0.083333,0.060976,0.054545,0.078431,0.021739,0.040000,0.000000,0.109375,0.057692,0.086957,0.039216,0.041958,0.045455,0.090909,0.113208,0.093750,0.071429,0.018519,0.173077,0.126984,0.125000,0.200000,0.111111,0.035088
6,0.143590,0.161290,1.000000,0.145946,0.206897,0.122449,0.064103,0.159420,0.122222,0.068493,0.191919,0.096774,0.148438,0.117241,0.135922,0.159420,0.151394,0.145833,0.071429,0.162791,0.097473,0.067416,0.126667,0.027397,0.100000,0.179191,0.149038,0.105960,0.122549,0.078652,0.097087,0.155340,0.111111,0.171429,0.160000,0.060606,0.172619,0.187919,0.052083,0.089109,...,0.000000,0.020202,0.0,0.000000,0.0,0.0,0.022727,0.0,0.063291,0.0,0.025974,0.073684,0.131868,0.000000,0.079545,0.078947,0.055556,0.158416,0.073171,0.049383,0.054795,0.037975,0.014493,0.109890,0.103896,0.052632,0.012195,0.118750,0.041096,0.022727,0.047059,0.052632,0.027397,0.024096,0.034091,0.052083,0.024390,0.069307,0.071429,0.034884
47,0.197581,0.123457,0.145946,1.000000,0.234783,0.115152,0.039735,0.302905,0.080247,0.034014,0.177914,0.093168,0.188172,0.242105,0.150602,0.261044,0.324818,0.171569,0.084848,0.185185,0.312500,0.062893,0.252577,0.006757,0.074534,0.323810,0.288066,0.229592,0.257261,0.075949,0.147239,0.148810,0.227907,0.273973,0.259398,0.104938,0.260465,0.206731,0.067073,0.121951,...,0.020548,0.067073,0.0,0.000000,0.0,0.0,0.025000,0.0,0.053333,0.0,0.020000,0.121019,0.072727,0.006993,0.076433,0.026316,0.049689,0.123529,0.038462,0.025806,0.006667,0.026316,0.000000,0.107595,0.046053,0.019868,0.039735,0.150000,0.020408,0.071895,0.072368,0.080745,0.013605,0.046053,0.071429,0.093750,0.060403,0.103030,0.058065,0.031646
50,0.180000,0.090909,0.206897,0.234783,1.000000,0.137500,0.061644,0.204633,0.094937,0.041667,0.104651,0.094340,0.164894,0.141463,0.138554,0.200000,0.293907,0.144928,0.079268,0.144330,0.253333,0.063694,0.205000,0.013793,0.089172,0.215859,0.229249,0.160194,0.270042,0.090909,0.107784,0.143713,0.190909,0.231111,0.242537,0.066265,0.251163,0.202899,0.061350,0.109756,...,0.013793,0.048485,0.0,0.000000,0.0,0.0,0.045161,0.0,0.054054,0.0,0.013423,0.054545,0.093750,0.000000,0.077419,0.047619,0.084416,0.152439,0.073826,0.039735,0.027586,0.062069,0.000000,0.123377,0.075342,0.048276,0.047297,0.156682,0.027778,0.087248,0.052288,0.068323,0.020833,0.053691,0.072368,0.108974,0.046980,0.090909,0.065789,0.038710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,0.094118,0.126984,0.052083,0.093750,0.108974,0.216667,0.069767,0.085561,0.066667,0.078947,0.094595,0.101695,0.134021,0.126126,0.038961,0.091398,0.081545,0.094017,0.046154,0.056075,0.068000,0.017544,0.090909,0.055556,0.068966,0.098684,0.097826,0.120690,0.043478,0.072727,0.151515,0.037975,0.085106,0.127517,0.071770,0.062500,0.111111,0.138211,0.032258,0.140625,...,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.081633,0.0,0.044444,0.0,0.000000,0.140351,0.100000,0.000000,0.035714,0.184211,0.094340,0.142857,0.133333,0.116279,0.081081,0.046512,0.000000,0.230769,0.116279,0.131579,0.045455,0.092308,0.083333,0.232558,0.130435,0.145455,0.027027,0.090909,0.080000,1.000000,0.093023,0.145161,0.104167,0.061224
2072,0.043210,0.125000,0.024390,0.060403,0.046980,0.037037,0.115385,0.027624,0.068182,0.200000,0.049180,0.116279,0.081395,0.048544,0.086207,0.027624,0.039823,0.027778,0.108696,0.078652,0.037344,0.000000,0.055046,0.050000,0.071429,0.048951,0.039326,0.066038,0.035503,0.135135,0.092593,0.048387,0.046154,0.034247,0.029851,0.062500,0.075188,0.060345,0.044444,0.076923,...,0.000000,0.044444,0.0,0.000000,0.0,0.0,0.028571,0.0,0.034483,0.0,0.000000,0.066667,0.065217,0.066667,0.025000,0.076923,0.108108,0.086207,0.096774,0.068966,0.000000,0.120000,0.000000,0.093023,0.107143,0.181818,0.115385,0.041667,0.157895,0.090909,0.093750,0.095238,0.105263,0.148148,0.121212,0.093023,1.000000,0.125000,0.161290,0.093750
2100,0.109195,0.200000,0.069307,0.103030,0.090909,0.111111,0.127660,0.060606,0.044118,0.066667,0.113924,0.058824,0.135922,0.090909,0.160000,0.093750,0.074689,0.080000,0.102941,0.071429,0.096000,0.048387,0.103175,0.022727,0.131148,0.108280,0.088542,0.113821,0.058511,0.100000,0.106667,0.085366,0.088435,0.100629,0.074419,0.153846,0.113333,0.113636,0.075758,0.095890,...,0.000000,0.014286,0.0,0.000000,0.0,0.0,0.052632,0.0,0.038462,0.0,0.000000,0.074627,0.073529,0.025641,0.048387,0.130435,0.083333,0.087500,0.074074,0.037736,0.021739,0.061224,0.000000,0.109375,0.057692,0.111111,0.039216,0.111940,0.045455,0.111111,0.180000,0.147541,0.046512,0.057692,0.089286,0.145161,0.125000,1.000000,0.200000,0.092593
2421,0.073620,0.111111,0.071429,0.058065,0.065789,0.087719,0.206897,0.049180,0.060000,0.111111,0.060606,0.080000,0.087912,0.027027,0.078125,0.060773,0.052402,0.026316,0.055556,0.096774,0.053498,0.021739,0.043103,0.038462,0.062500,0.083333,0.061111,0.072072,0.052326,0.090909,0.120690,0.075758,0.092308,0.068027,0.059701,0.140000,0.064286,0.066116,0.039216,0.087719,...,0.000000,0.039216,0.0,0.000000,0.0,0.0,0.050000,0.0,0.028571,0.0,0.000000,0.038462,0.078431,0.047619,0.044444,0.172414,0.093023,0.112903,0.025641,0.027778,0.035714,0.096774,0.000000,0.081633,0.121212,0.103448,0.129032,0.065041,0.076923,0.050000,0.205882,0.130435,0.038462,0.088235,0.048780,0.104167,0.161290,0.200000,1.000000,0.078947


In [379]:
def predict(views: np.ndarray, similarity: np.ndarray):
    """Score every movie for a user from the movies the user has watched.

    The score of movie ``j`` is the sum of similarities between ``j`` and the
    watched movies, divided by the sum of row ``j`` of ``similarity``.

    Args:
        views: Vector of length N with 1.0 for watched movies and 0.0
            otherwise.
        similarity: N x N movie-to-movie similarity matrix. NOTE(review):
            the numerator sums over columns while the normaliser sums over
            rows, so the result is only a proper weighted average for a
            symmetric matrix.

    Returns:
        Array of N recommendation ratings. Movies whose similarity row sums
        to zero get a rating of 0.0.
    """
    scores = views.dot(similarity)
    totals = similarity.sum(axis=1)
    # Divide only where the normaliser is non-zero. This avoids the 0/0
    # RuntimeWarning and stops x/0 from turning into a huge finite rating
    # through nan_to_num.
    pred = np.divide(
        scores,
        totals,
        out=np.zeros_like(scores, dtype=float),
        where=totals != 0,
    )
    return np.nan_to_num(pred, nan=0.0)

EVALUATION


In [380]:
# Evaluate on the held-out views: for every test user, score all movies the
# user has not watched in the training split, then check how many of the
# top-N recommendations (N = number of the user's test views) were watched.
test_users = test_data['userId'].unique()
total_watched = 0
right_preds = 0
user_frames = []
# loop-invariant, so extract the raw matrix once
similarity_values = similarityMatrix.values
for user_id in test_users:
    # movies the user watched in the test split
    test_user_watched = test_data.loc[test_data['userId'] == user_id, 'movieId'].values
    # movies the user already watched (training split)
    train_user_watched = train_data.loc[train_data['userId'] == user_id, 'movieId'].values
    # binary view vector aligned with mIds: 1.0 = watched, 0.0 = not watched
    user_watched = np.isin(mIds, train_user_watched).astype(float)
    # recommendation rating for every movie in mIds
    pred = pd.DataFrame(predict(user_watched, similarity_values), index=mIds, columns=['rating'])
    # never recommend what the user has already seen
    pred = pred[~pred.index.isin(train_user_watched)]

    # per-user slice of the output table: userId, movieId, rating
    tmp = pd.DataFrame({'movieId': pred.index, 'rating': pred['rating'].to_numpy()})
    tmp.insert(0, 'userId', user_id)
    user_frames.append(tmp)

    # take the top-N recommendations, N = number of the user's test views
    pred = pred.sort_values('rating', ascending=False).head(len(test_user_watched))
    total_watched += len(test_user_watched)
    right_preds += pred[pred.index.isin(test_user_watched)].shape[0]

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; build the result with a single concatenation instead.
if user_frames:
    result_df = pd.concat(user_frames, ignore_index=True)
else:
    result_df = pd.DataFrame(columns=['userId', 'movieId', 'rating'])

# share of test views that appear among the recommendations
print(right_preds / total_watched if total_watched else 0.0)
print(result_df)

  


0.36906906906906906
       userId movieId    rating
0         599     151  0.496229
1         599     316  0.592860
2         599     362  0.481788
3         599     423  0.413867
4         599     527  0.576236
...       ...     ...       ...
575939    499    1876  0.006340
575940    499    2072  0.010178
575941    499    2100  0.007067
575942    499    2421  0.007260
575943    499    2485  0.006068

[575944 rows x 3 columns]


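Таким образом, модель предсказала около 37% фильмов, просмотренных пользователями в тестовом наборе (для каждого пользователя бралось столько лучших рекомендаций, сколько фильмов он посмотрел в тестовом наборе).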

Недостатком подхода является необходимость досчитывать матрицу при добавлении новых фильмов и невозможность рекомендовать фильмы пользователям без истории просмотров.

Аналогичную модель можно построить на поиске похожих пользователей по истории просмотров. 

