In [1]:
!pip install surprise



In [2]:
!pip install matplotlib_venn



In [3]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from scipy import stats
from IPython.display import Image
from IPython.display import Image
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.datasets import load_iris, load_boston
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from surprise import SVD, Dataset, Reader
from surprise.model_selection import PredefinedKFold
from collections import defaultdict
from surprise.accuracy import rmse
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Apply seaborn's "ticks" style to the plots that follow.
sns.set(style="ticks")

In [4]:
# Load the ratings table from a CSV file (path relative to the working directory)
# and preview its first rows.
data = pd.read_csv('ratings_Beauty.csv')
data.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200
2,A1Z513UWSAAO0F,558925278,5.0,1404691200
3,A1WMRR494NWEWV,733001998,4.0,1382572800
4,A3IAAVS479H7M7,737104473,1.0,1274227200


In [5]:
# Dimensions of the loaded ratings table as (rows, columns).
data.shape

(2023070, 4)

In [6]:
def check_null_values(data):
    """Summarise the dtype and number of missing values of every column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    list of tuple
        One ``(column_label, (dtype_as_str, null_count))`` entry per column,
        in column order.
    """
    column_dtypes = data.dtypes
    # isnull().sum() gives the per-column count of missing values.
    null_counts = data.isnull().sum()
    return [
        (column, (str(dtype), n_missing))
        for column, dtype, n_missing in zip(data.columns, column_dtypes, null_counts)
    ]

In [7]:
# Report the dtype and missing-value count of every column of the ratings table.
check_null_values(data)

[('UserId', ('object', 0)),
 ('ProductId', ('object', 0)),
 ('Rating', ('float64', 0)),
 ('Timestamp', ('int64', 0))]

# Случай cold start (новый пользователь)

In [8]:
# Cold-start fallback: rank products by how many ratings they have received.
# NOTE: the 'Rating' column of these frames holds a count of ratings, not a score.
popular_products = data.groupby('ProductId')['Rating'].count().to_frame()
most_popular = popular_products.sort_values(by='Rating', ascending=False)
most_popular.head(10)

Unnamed: 0_level_0,Rating
ProductId,Unnamed: 1_level_1
B001MA0QY2,7533
B0009V1YR8,2869
B0043OYFKU,2477
B0000YUXI0,2143
B003V265QW,2088
B000ZMBSPE,2041
B003BQ6QXK,1918
B004OHQR1Q,1885
B00121UVU0,1838
B000FS05VG,1589


Рекомендации пользователю строятся на основе истории покупок других пользователей,
а также схожести рейтингов продуктов, купленных рассматриваемым пользователем.
Матрица полезности (utility matrix) содержит все известные сведения о предпочтениях (рейтингах) пользователей, представленные в виде матрицы «пользователь × продукт». Она разрежена: ни один пользователь не покупал все товары из списка, поэтому большинство значений неизвестно.

In [9]:
# Work on a subsample: the first 20000 rows of the ratings table (not a random sample).

amazon_ratings = data.head(20000)
# To run on the full dataset instead: amazon_ratings = data

In [10]:
# Build the user x product utility matrix; user/product pairs with no rating are filled with 0.
ratings_utility_matrix = amazon_ratings.pivot_table(
    index='UserId',
    columns='ProductId',
    values='Rating',
    fill_value=0,
)
ratings_utility_matrix

ProductId,0205616461,0558925278,0733001998,0737104473,0762451459,1304139212,1304139220,130414089X,130414643X,1304146537,...,B00005BJ8X,B00005BJ91,B00005BL1R,B00005BLRH,B00005BWXD,B00005CDRP,B00005CDRY,B00005CDS5,B00005CDS7,B00005CDS9
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00205921JHJK5X9LNP42,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A00473363TJ8YSZ3YAGG9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A01437583CZ7V02UKZQ5S,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A020135981U0UNEAE4JV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A024581134CV80ZBLIZTZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZZHJZP4GQPPZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZZNK89PXD006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZZOFVMQC0BJG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AZZQXL8VDCFTV,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Transpose so that each row is a product and each column is a user.
X = ratings_utility_matrix.transpose()
X.head()

UserId,A00205921JHJK5X9LNP42,A00473363TJ8YSZ3YAGG9,A01437583CZ7V02UKZQ5S,A020135981U0UNEAE4JV,A024581134CV80ZBLIZTZ,A03056581JJIOL5FSKJY7,A03099101ZRK4K607JVHH,A03454732N8VEYJAMGTTH,A0505229A7NSH3FRXRR4,A05492663T95KW63BR75K,...,AZYR51QTD6FAY,AZZ5ENX41WOYZ,AZZ8A0TEQOD7J,AZZDA9BRMPP1B,AZZHB6U54UDYW,AZZHJZP4GQPPZ,AZZNK89PXD006,AZZOFVMQC0BJG,AZZQXL8VDCFTV,AZZTJQ7CQZUD8
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
205616461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
558925278,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
733001998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
737104473,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
762451459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Shape of the transposed matrix: (products, users).
X.shape

(1398, 19101)

In [13]:
# Dimensionality reduction: project each product's rating vector onto 10 latent components.
# The estimator is bound to `svd_model` rather than `SVD` so that the `SVD` class imported
# from `surprise` is not overwritten. `random_state` is fixed because the default randomized
# solver otherwise produces slightly different results on every run.
svd_model = TruncatedSVD(n_components=10, random_state=42)
decomposed_matrix = svd_model.fit_transform(X)
decomposed_matrix.shape

(1398, 10)

In [14]:
# Pearson correlation between every pair of rows of the reduced matrix, i.e. between
# every pair of products in the latent space; the result is square (products x products).
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

(1398, 1398)

In [15]:
# Show the product-by-product correlation matrix (NumPy abbreviates large arrays).
correlation_matrix

array([[ 1.        , -0.8473574 ,  0.95679943, ..., -0.08731621,
        -0.67970379, -0.56262846],
       [-0.8473574 ,  1.        , -0.78127561, ...,  0.44497768,
         0.36150203,  0.04704686],
       [ 0.95679943, -0.78127561,  1.        , ...,  0.02307221,
        -0.5514199 , -0.5588869 ],
       ...,
       [-0.08731621,  0.44497768,  0.02307221, ...,  1.        ,
        -0.21924917, -0.42964057],
       [-0.67970379,  0.36150203, -0.5514199 , ..., -0.21924917,
         1.        ,  0.70406185],
       [-0.56262846,  0.04704686, -0.5588869 , ..., -0.42964057,
         0.70406185,  1.        ]])

In [16]:
# Product IDs that label the rows of X.
X.index

Index(['0205616461', '0558925278', '0733001998', '0737104473', '0762451459',
       '1304139212', '1304139220', '130414089X', '130414643X', '1304146537',
       ...
       'B00005BJ8X', 'B00005BJ91', 'B00005BL1R', 'B00005BLRH', 'B00005BWXD',
       'B00005CDRP', 'B00005CDRY', 'B00005CDS5', 'B00005CDS7', 'B00005CDS9'],
      dtype='object', name='ProductId', length=1398)

In [17]:
# Seed product: an item the customer has already bought, used to look up similar products.
i = "6117036094"

# Positional row of the seed product in X (and therefore in correlation_matrix).
product_names = list(X.index)
product_ID = product_names.index(i)
product_ID

99

In [18]:
# Correlations between the seed product and every product (one row of the matrix).
correlation_product_ID = correlation_matrix[product_ID]
correlation_product_ID.shape

(1398,)

In [19]:
# Products whose correlation with the seed product `i` exceeds 0.90.
# The item already bought by the customer is filtered out with a comparison instead of
# list.remove(), so the cell does not raise ValueError when the seed itself fails the
# threshold (e.g. its correlation row is NaN).
Recommend = [product for product in X.index[correlation_product_ID > 0.90] if product != i]

# First nine candidates, in index order (not ranked by correlation).
Recommend[0:9]

['4057363823',
 '8901110814',
 '9788071074',
 '9788071856',
 '9788072488',
 '9788074405',
 '9788081053',
 '9790776810',
 '9790777884']