In [15]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares

“Требуется построить модель рекомендаций на основе скрытых факторов (implicit) на основе dataset’а https://grouplens.org/datasets/hetrec-2011/ (Delicious Bookmarks)”

In [46]:
data = pd.read_table('hetrec2011-delicious-2k/user_taggedbookmarks-timestamps.dat', sep='\\n', delimiter='\\t')

  """Entry point for launching an IPython kernel.


In [47]:
data.head()

Unnamed: 0,userID,bookmarkID,tagID,timestamp
0,8,1,1,1289255362000
1,8,2,1,1289255159000
2,8,7,1,1289238901000
3,8,7,6,1289238901000
4,8,7,7,1289238901000


In [48]:
# посчитаем количество тэгов на одну закладку и одного пользователя
data = data.groupby(['userID', 'bookmarkID'], as_index=False)['tagID'].count()

In [49]:
data.rename(columns={'tagID': 'num_of_tags'}, inplace=True)

In [50]:
data.head()

Unnamed: 0,userID,bookmarkID,num_of_tags
0,8,1,1
1,8,2,1
2,8,7,3
3,8,8,3
4,8,9,2


Преобразуем id пользователей и закладок, так чтобы они представляли непрерывную последовательность целых чисел, начиная с нуля.

In [51]:
user_id_lookup = dict(zip(range(data['userID'].nunique()), data['userID'].unique()))
bookmark_id_lookup = dict(zip(range(data['bookmarkID'].nunique()), data['bookmarkID'].unique()))

In [52]:
data['userID'] = data['userID'].map({v: k for k, v in user_id_lookup.items()})
data['bookmarkID'] = data['bookmarkID'].map({v: k for k, v in bookmark_id_lookup.items()})

In [53]:
data.head()

Unnamed: 0,userID,bookmarkID,num_of_tags
0,0,0,1
1,0,1,1
2,0,2,3
3,0,3,3
4,0,4,2


In [54]:
url_id = pd.read_table('hetrec2011-delicious-2k/bookmarks.dat', sep='\\n', delimiter='\\t')

  """Entry point for launching an IPython kernel.


In [55]:
url_id.head(2)

Unnamed: 0,id,md5,title,url,md5Principal,urlPrincipal
0,1,ab4954b633ddaf5b5bba6e9b71aa6b70,IFLA - The official website of the Internation...,http://www.ifla.org/,7f431306c428457bc4e12b15634484f,www.ifla.org
1,2,2221e9cd106d269dd34682666f576fa3,gcdp-e.pdf (application/pdf Object),http://archive.ifla.org/VII/s14/nd1/gcdp-e.pdf,1ef8cfcfe968101fa9b4e301847503d4,archive.ifla.org


In [56]:
url_id.drop(['md5', 'md5Principal'], axis=1, inplace=True)

In [57]:
url_id.head(2)

Unnamed: 0,id,title,url,urlPrincipal
0,1,IFLA - The official website of the Internation...,http://www.ifla.org/,www.ifla.org
1,2,gcdp-e.pdf (application/pdf Object),http://archive.ifla.org/VII/s14/nd1/gcdp-e.pdf,archive.ifla.org


In [59]:
data.describe()

Unnamed: 0,userID,bookmarkID,num_of_tags
count,104799.0,104799.0,104799.0
mean,895.579939,27116.365547,4.175546
std,548.079983,20262.253733,3.200233
min,0.0,0.0,1.0
25%,397.0,9684.5,2.0
50%,893.0,20935.0,3.0
75%,1375.0,43907.5,5.0
max,1866.0,69222.0,70.0


In [64]:
ids = list(np.sort(data['userID'].unique()))
urls = list(np.sort(data['bookmarkID'].unique()))

In [68]:
rows = data['userID']
cols = data['bookmarkID']
bookmarked = data['num_of_tags']

In [69]:
cols.max(), rows.max(), cols.min(), rows.min()

(69222, 1866, 0, 0)

In [70]:
len(urls), len(ids)

(69223, 1867)

In [71]:
len(rows), len(cols), len(bookmarked)

(104799, 104799, 104799)

In [72]:
data_sparse = sparse.csr_matrix((bookmarked, (cols, rows)), shape=(len(urls), len(ids)))

In [73]:
model = AlternatingLeastSquares(factors=100)
model.fit(data_sparse)

100%|██████████| 15.0/15 [00:09<00:00,  1.42it/s]


In [93]:
userid = 100
print(f'рекомендации для пользователя {user_id_lookup[userid]}')

рекомендации для пользователя 3957


In [84]:
user_items = data_sparse.transpose()
recommendations = model.recommend(userid, user_items)
# заменим id закладок на исходные
recommendations = [(bookmark_id_lookup[rec[0]], rec[1]) for rec in recommendations]

In [85]:
recommendations

[(9132, 0.6205618),
 (4276, 0.58708686),
 (4044, 0.5611164),
 (9100, 0.5353863),
 (4342, 0.53132313),
 (9104, 0.48688334),
 (9111, 0.48678592),
 (2618, 0.48070496),
 (9127, 0.4797324),
 (9092, 0.47254503)]

In [87]:
url_id[url_id['id'].isin([rec[0] for rec in recommendations])]

Unnamed: 0,id,title,url,urlPrincipal
1903,2618,Wordpress Snippets,http://wp-snippets.com/,wp-snippets.com
2831,4044,UI Guidelines for mobile and tablet web app de...,http://www.mobilexweb.com/blog/ui-guidelines-m...,www.mobilexweb.com
2975,4276,Flipboard for iPad,http://www.flipboard.com/,www.flipboard.com
3023,4342,"Theme Finder, a WPCandy Project",http://themefinder.wpcandy.com/,themefinder.wpcandy.com
6514,9092,"Spazio web, database, traffico, servizi cloud ...",http://www.webmasterpoint.org/news/spazio-web-...,www.webmasterpoint.org
6520,9100,Theme Layouts | ThemeGarden.com,http://www.themegarden.com/theme-layouts/,www.themegarden.com
6524,9104,Data Visualization: Modern Approaches | Graphi...,http://www.smashingmagazine.com/2007/08/02/dat...,www.smashingmagazine.com
6528,9111,Fracture | The greatest way from your digital ...,http://www.fractureme.com/,www.fractureme.com
6539,9127,Is Google Making Us Stupid? - Magazine - The A...,http://www.theatlantic.com/magazine/archive/20...,www.theatlantic.com
6542,9132,Tweet Nest,http://pongsocket.com/tweetnest/,pongsocket.com


In [95]:
itemid = 9104
print(f'закладки схожие с {bookmark_id_lookup[itemid]}')

закладки схожие с 12959


In [90]:
related = model.similar_items(itemid)
related = [(bookmark_id_lookup[rel[0]], rel[1]) for rel in related]

In [91]:
related

[(12959, 0.2245222),
 (12983, 0.22428977),
 (13000, 0.22428945),
 (12989, 0.22428896),
 (12997, 0.22428535),
 (12971, 0.22428079),
 (12963, 0.2242804),
 (12975, 0.22427502),
 (12961, 0.22425865),
 (12966, 0.22425257)]

In [92]:
url_id[url_id['id'].isin([rel[0] for rel in related])]

Unnamed: 0,id,title,url,urlPrincipal
9123,12959,Django | The Web framework for perfectionists ...,http://www.djangoproject.com/,www.djangoproject.com
9124,12961,Assostartup – Associazione Italiana Profession...,http://www.assostartup.it/,www.assostartup.it
9125,12963,7 A/B Testing Resources for Startups and Solo ...,http://mashable.com/2010/11/04/a-b-testing-res...,mashable.com
9126,12966,Viral Farm,http://www.viralfarm.it/,www.viralfarm.it
9130,12971,PadPyrus - Bring your paper to Mobile,http://www.padpyrus.com/index.html,www.padpyrus.com
9134,12975,Passpack Password Manager Home | Free Online P...,http://www.passpack.com/en/home/,www.passpack.com
9137,12983,How to Create a Banner Ad Revenue Estimate - B...,http://www.bigpictureweb.com/blog/2009/6/2/how...,www.bigpictureweb.com
9139,12989,Web 3.0: che cos’è?,http://www.webmasterpoint.org/news/web-30-che-...,www.webmasterpoint.org
9142,12997,Pazienti.org | Storie in rete che cambiano la ...,http://www.pazienti.org/,www.pazienti.org
9144,13000,Ibrii,http://www.ibrii.com/,www.ibrii.com
