In [1]:
# common stuff
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
import warnings
warnings.simplefilter('ignore')
%pylab inline
%config InlineBackend.figure_format = 'png' 
from pylab import rcParams
rcParams['figure.figsize'] = 8,5
import numpy as np
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [2]:
# First, let's install mrec:
# git clone https://github.com/inpefess/mrec.git
# sudo -H python3 setup.py install --install-scripts sbin

In [3]:
# Now, download MovieLens 100K:
# wget http://www.grouplens.org/system/files/ml-100k.zip
# unzip ml-100k.zip

In [4]:
# Prepare the data
! rm -rf splits1
! mrec_prepare --dataset ml-100k/u.data --outdir splits1 --rating_thresh 4 --test_size 0.5 --binarize

[2017-11-25 17:06:31,820] INFO: sorting input data...
[2017-11-25 17:06:31,937] INFO: creating split 0: /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.0 /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.test.0
[2017-11-25 17:06:32,288] INFO: creating split 1: /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.1 /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.test.1
[2017-11-25 17:06:32,644] INFO: creating split 2: /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.2 /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.test.2
[2017-11-25 17:06:32,997] INFO: creating split 3: /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.3 /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.test.3
[2017-11-25 17:06:33,350] INFO: creating split 4: /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.4 /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.test.4
[2017-11-25 17:06:33,

In [5]:
# Now, let's run the cluster:
# ipcluster start -n4 --daemonize

In [6]:
# Let's train popularity model
! rm -rf models1
! mrec_train -n4 --input_format tsv --train "splits1/u.data.train.*" --outdir models1 --model=popularity

[2017-11-25 17:06:34,760] INFO: processing /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.2...
[2017-11-25 17:06:35,287] INFO: processing /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.1...
[2017-11-25 17:06:35,843] INFO: processing /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.0...
[2017-11-25 17:06:36,378] INFO: processing /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.3...
[2017-11-25 17:06:36,911] INFO: processing /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.4...
[2017-11-25 17:06:37,449] INFO: done


In [7]:
# Let's make some recommendations
! rm -rf recs1
! mrec_predict --input_format tsv --test_input_format tsv --train "splits1/u.data.train.*" --modeldir models1 --outdir recs1

[2017-11-25 17:06:38,491] INFO: processing /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.2...
[2017-11-25 17:06:38,491] INFO: creating recs directory /home/stas/Projects/1-netology/Netology/lab1/recs1/u.data.train.2-recs...
[2017-11-25 17:06:38,494] INFO: checking for existing output recs...
[2017-11-25 17:06:38,495] INFO: creating tasks...
[2017-11-25 17:06:38,495] INFO: loading dataset to get size...
[2017-11-25 17:06:38,825] INFO: loading model to get size...
[2017-11-25 17:06:38,826] INFO: created 1 tasks, 943 users per task
[2017-11-25 17:06:38,826] INFO: running in parallel across ipython engines...
[2017-11-25 17:06:39,595] INFO: checking output files...
[2017-11-25 17:06:39,595] INFO: SUCCESS: all tasks completed
[2017-11-25 17:06:39,595] INFO: concatenating 1 partial output files...
[2017-11-25 17:06:39,598] INFO: removing partial output files...
[2017-11-25 17:06:39,598] INFO: done
[2017-11-25 17:06:39,599] INFO: processing /home/stas/Projects/1-netology/N

In [8]:
# popularity model makes same recommendations for all users, let's list some of them:
from mrec import load_sparse_matrix, load_recommender
train = load_sparse_matrix('tsv','splits1/u.data.train.0')
model = load_recommender('models1/u.data.train.0.model.npz')
# Let's take user 1 and user 100 for example
recs1 = model.recommend_items(train,1,max_items=10)
recs2 = model.recommend_items(train,100,max_items=10)
# The lists are a little different, that's because recommender
# doesn't recommend something that user already watched (from mrec/popularity.py:
# known_items = set(dataset[u].indices)
# recs = []
# for i,c in self.pop_items:
#    if i not in known_items:
# ...)
display(recs1)
display(recs2)

[(99, 264),
 (180, 247),
 (287, 230),
 (120, 205),
 (173, 203),
 (97, 198),
 (221, 195),
 (6, 194),
 (55, 191),
 (116, 185)]

[(257, 270),
 (99, 264),
 (180, 247),
 (293, 241),
 (285, 232),
 (126, 232),
 (287, 230),
 (299, 207),
 (120, 205),
 (173, 203)]

In [9]:
# The lists are: (item to watch, total number of likes, i.e. scores above threshold, 4 in our case)
# Let's output movie names:
items = pd.read_csv('ml-100k/u.item', sep = '|', \
    encoding = 'latin1', names=['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'], \
    usecols=range(5))
display(pd.concat([items[items['movie_id'] == i[0]] for i in recs1]))
display(pd.concat([items[items['movie_id'] == i[0]] for i in recs2]))

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
98,99,Snow White and the Seven Dwarfs (1937),01-Jan-1937,,http://us.imdb.com/M/title-exact?Snow%20White%...
179,180,Apocalypse Now (1979),01-Jan-1979,,http://us.imdb.com/M/title-exact?Apocalypse%20...
286,287,Marvin's Room (1996),18-Dec-1996,,http://us.imdb.com/M/title-exact?Marvin's%20Ro...
119,120,Striptease (1996),28-Jun-1996,,http://us.imdb.com/M/title-exact?Striptease%20...
172,173,"Princess Bride, The (1987)",01-Jan-1987,,http://us.imdb.com/M/title-exact?Princess%20Br...
96,97,Dances with Wolves (1990),01-Jan-1990,,http://us.imdb.com/M/title-exact?Dances%20with...
220,221,Breaking the Waves (1996),15-Nov-1996,,http://us.imdb.com/M/title-exact?Breaking%20th...
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...
54,55,"Professional, The (1994)",01-Jan-1994,,http://us.imdb.com/Title?L%E9on+(1994)
115,116,Cold Comfort Farm (1995),23-Apr-1996,,http://us.imdb.com/M/title-exact?Cold%20Comfor...


Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
256,257,Men in Black (1997),04-Jul-1997,,http://us.imdb.com/M/title-exact?Men+in+Black+...
98,99,Snow White and the Seven Dwarfs (1937),01-Jan-1937,,http://us.imdb.com/M/title-exact?Snow%20White%...
179,180,Apocalypse Now (1979),01-Jan-1979,,http://us.imdb.com/M/title-exact?Apocalypse%20...
292,293,Donnie Brasco (1997),28-Feb-1997,,http://us.imdb.com/M/title-exact?Donnie%20Bras...
284,285,Secrets & Lies (1996),04-Oct-1996,,http://us.imdb.com/M/title-exact?Secrets%20&%2...
125,126,"Spitfire Grill, The (1996)",06-Sep-1996,,http://us.imdb.com/M/title-exact?Spitfire%20Gr...
286,287,Marvin's Room (1996),18-Dec-1996,,http://us.imdb.com/M/title-exact?Marvin's%20Ro...
298,299,Hoodlum (1997),22-Aug-1997,,http://us.imdb.com/M/title-exact?Hoodlum+(1997)
119,120,Striptease (1996),28-Jun-1996,,http://us.imdb.com/M/title-exact?Striptease%20...
172,173,"Princess Bride, The (1987)",01-Jan-1987,,http://us.imdb.com/M/title-exact?Princess%20Br...


In [10]:
# Now, let's use kNN:
! rm -rf models2
! mrec_train -n4 --input_format tsv --train "splits1/u.data.train.*" --outdir models2 --model=knn

[2017-11-25 17:06:45,704] INFO: processing /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.2...
[2017-11-25 17:06:45,704] INFO: finding number of items...
[2017-11-25 17:06:46,039] INFO: 943 users and 1681 items
[2017-11-25 17:06:46,040] INFO: creating sims directory /home/stas/Projects/1-netology/Netology/lab1/models2/u.data.train.2-sims...
[2017-11-25 17:06:46,043] INFO: checking for existing output sims...
[2017-11-25 17:06:46,044] INFO: creating tasks...
[2017-11-25 17:06:46,044] INFO: running 4 tasks in parallel across ipython engines...
[2017-11-25 17:06:47,480] INFO: checking output files...
[2017-11-25 17:06:47,481] INFO: SUCCESS: all tasks completed
[2017-11-25 17:06:47,481] INFO: concatenating 4 partial output files...
[2017-11-25 17:06:47,486] INFO: removing partial output files...
[2017-11-25 17:06:47,487] INFO: loading 1681 items in CosineKNNRecommender model from /home/stas/Projects/1-netology/Netology/lab1/models2/u.data.train.2.sims.tsv
[2017-11-25 17:

In [11]:
# And recommendations:
! rm -rf recs2
! mrec_predict --input_format tsv --test_input_format tsv --train "splits1/u.data.train.*" --modeldir models2 --outdir recs2

[2017-11-25 17:07:00,673] INFO: processing /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.2...
[2017-11-25 17:07:00,674] INFO: creating recs directory /home/stas/Projects/1-netology/Netology/lab1/recs2/u.data.train.2-recs...
[2017-11-25 17:07:00,676] INFO: checking for existing output recs...
[2017-11-25 17:07:00,677] INFO: creating tasks...
[2017-11-25 17:07:00,677] INFO: loading dataset to get size...
[2017-11-25 17:07:01,004] INFO: loading model to get size...
[2017-11-25 17:07:01,038] INFO: created 1 tasks, 41928 users per task
[2017-11-25 17:07:01,038] INFO: running in parallel across ipython engines...
[2017-11-25 17:07:02,642] INFO: checking output files...
[2017-11-25 17:07:02,642] INFO: SUCCESS: all tasks completed
[2017-11-25 17:07:02,642] INFO: concatenating 1 partial output files...
[2017-11-25 17:07:02,646] INFO: removing partial output files...
[2017-11-25 17:07:02,646] INFO: done
[2017-11-25 17:07:02,647] INFO: processing /home/stas/Projects/1-netology

In [12]:
# That's definitely better than with ItemPop:
#mrr            0.5192 +/- 0.0047
#prec@5         0.2620 +/- 0.0017
#prec@10        0.2476 +/- 0.0007
#prec@15        0.2293 +/- 0.0013
#prec@20        0.2120 +/- 0.0009

# Let's output recommendations for users 1 and 100
train = load_sparse_matrix('tsv','splits1/u.data.train.0')
model = load_recommender('models2/u.data.train.0.model.npz')
recs1 = model.recommend_items(train,1,max_items=10)
recs2 = model.recommend_items(train,100,max_items=10)
display(pd.concat([items[items['movie_id'] == i[0]] for i in recs1]))
display(pd.concat([items[items['movie_id'] == i[0]] for i in recs2]))

# As we can see, personalized recommendations are very different

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
98,99,Snow White and the Seven Dwarfs (1937),01-Jan-1937,,http://us.imdb.com/M/title-exact?Snow%20White%...
313,314,3 Ninjas: High Noon At Mega Mountain (1998),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-11...
273,274,Sabrina (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Sabrina%20(1995)
311,312,Midnight in the Garden of Good and Evil (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?Midnight+in+t...
122,123,"Frighteners, The (1996)",19-Jul-1996,,"http://us.imdb.com/M/title-exact?Frighteners,%..."
473,474,Dr. Strangelove or: How I Learned to Stop Worr...,01-Jan-1963,,http://us.imdb.com/M/title-exact?Dr.%20Strange...
13,14,"Postino, Il (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Postino,%20Il..."
235,236,Citizen Ruth (1996),13-Dec-1996,,http://us.imdb.com/M/title-exact?Citizen%20Rut...
266,267,unknown,,,
286,287,Marvin's Room (1996),18-Dec-1996,,http://us.imdb.com/M/title-exact?Marvin's%20Ro...


Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
115,116,Cold Comfort Farm (1995),23-Apr-1996,,http://us.imdb.com/M/title-exact?Cold%20Comfor...
119,120,Striptease (1996),28-Jun-1996,,http://us.imdb.com/M/title-exact?Striptease%20...
13,14,"Postino, Il (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Postino,%20Il..."
123,124,Lone Star (1996),21-Jun-1996,,http://us.imdb.com/M/title-exact?Lone%20Star%2...
109,110,Operation Dumbo Drop (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Operation%20D...
93,94,Home Alone (1990),01-Jan-1990,,http://us.imdb.com/M/title-exact?Home%20Alone%...
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...
98,99,Snow White and the Seven Dwarfs (1937),01-Jan-1937,,http://us.imdb.com/M/title-exact?Snow%20White%...
286,287,Marvin's Room (1996),18-Dec-1996,,http://us.imdb.com/M/title-exact?Marvin's%20Ro...
256,257,Men in Black (1997),04-Jul-1997,,http://us.imdb.com/M/title-exact?Men+in+Black+...


In [13]:
# At this point it's clear how to use mrec and what it does under the hood, but let's do
# one more thing and use SLIM.


# First, find regularization constants:
! mrec_tune -d 'splits1/u.data.train.0' --input_format tsv --l1_min 0.001 --l1_max 1.0 --l2_min 0.0001 --l2_max 1 --max_sims 200 --min_sims 1 --max_sparse 0.3

[2017-11-25 17:07:12,490] INFO: preparing tasks for a grid search of these values:
[2017-11-25 17:07:12,490] INFO: {'l2_reg': [0.0001, 0.001, 0.01, 0.1, 1], 'l1_reg': [0.001, 0.01, 0.1, 1]}
[2017-11-25 17:07:12,503] INFO: running 20 tasks in parallel...
best parameter setting: {'l2_reg': 0.1, 'l1_reg': 0.001}
mean # positive similarity weights per item = 78.5
proportion of items with fewer than 1 positive similarity weights = 0.17
mean # negative similarity weights per item = 29.7


In [17]:
# Now, train the model
! rm -rf models3
! mrec_train -n4 --input_format tsv --train "splits1/u.data.train.*" --outdir models3 --model=slim \
    --l1_reg=0.001 --l2_reg=0.1

[2017-11-25 17:08:47,491] INFO: processing /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.2...
[2017-11-25 17:08:47,492] INFO: finding number of items...
[2017-11-25 17:08:47,834] INFO: 943 users and 1681 items
[2017-11-25 17:08:47,834] INFO: creating sims directory /home/stas/Projects/1-netology/Netology/lab1/models3/u.data.train.2-sims...
[2017-11-25 17:08:47,840] INFO: checking for existing output sims...
[2017-11-25 17:08:47,840] INFO: creating tasks...
[2017-11-25 17:08:47,840] INFO: running 4 tasks in parallel across ipython engines...
[2017-11-25 17:08:50,619] INFO: checking output files...
[2017-11-25 17:08:50,619] INFO: SUCCESS: all tasks completed
[2017-11-25 17:08:50,619] INFO: concatenating 4 partial output files...
[2017-11-25 17:08:50,625] INFO: removing partial output files...
[2017-11-25 17:08:50,626] INFO: loading 1681 items in SLIM model from /home/stas/Projects/1-netology/Netology/lab1/models3/u.data.train.2.sims.tsv
[2017-11-25 17:08:51,251] INFO:

In [18]:
# Predict
! rm -rf recs3
! mrec_predict --input_format tsv --test_input_format tsv --train "splits1/u.data.train.*" --modeldir models3 --outdir recs3

[2017-11-25 17:09:11,808] INFO: processing /home/stas/Projects/1-netology/Netology/lab1/splits1/u.data.train.2...
[2017-11-25 17:09:11,808] INFO: creating recs directory /home/stas/Projects/1-netology/Netology/lab1/recs3/u.data.train.2-recs...
[2017-11-25 17:09:11,812] INFO: checking for existing output recs...
[2017-11-25 17:09:11,813] INFO: creating tasks...
[2017-11-25 17:09:11,813] INFO: loading dataset to get size...
[2017-11-25 17:09:12,162] INFO: loading model to get size...
[2017-11-25 17:09:12,229] INFO: created 1 tasks, 58734 users per task
[2017-11-25 17:09:12,229] INFO: running in parallel across ipython engines...
[2017-11-25 17:09:13,715] INFO: checking output files...
[2017-11-25 17:09:13,716] INFO: SUCCESS: all tasks completed
[2017-11-25 17:09:13,716] INFO: concatenating 1 partial output files...
[2017-11-25 17:09:13,720] INFO: removing partial output files...
[2017-11-25 17:09:13,720] INFO: done
[2017-11-25 17:09:13,721] INFO: processing /home/stas/Projects/1-netology

In [19]:
# We got event better score than with kNN:
#mrr            0.6303 +/- 0.0029
#prec@5         0.4102 +/- 0.0015
#prec@10        0.3582 +/- 0.0009
#prec@15        0.3266 +/- 0.0008
#prec@20        0.3028 +/- 0.0008

train = load_sparse_matrix('tsv','splits1/u.data.train.0')
model = load_recommender('models3/u.data.train.0.model.npz')
recs1 = model.recommend_items(train,1,max_items=10)
recs2 = model.recommend_items(train,100,max_items=10)
display(pd.concat([items[items['movie_id'] == i[0]] for i in recs1]))
display(pd.concat([items[items['movie_id'] == i[0]] for i in recs2]))

# Note that these predictions are pretty close to that of kNNs, that's expected

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
98,99,Snow White and the Seven Dwarfs (1937),01-Jan-1937,,http://us.imdb.com/M/title-exact?Snow%20White%...
273,274,Sabrina (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Sabrina%20(1995)
311,312,Midnight in the Garden of Good and Evil (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?Midnight+in+t...
313,314,3 Ninjas: High Noon At Mega Mountain (1998),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-11...
179,180,Apocalypse Now (1979),01-Jan-1979,,http://us.imdb.com/M/title-exact?Apocalypse%20...
122,123,"Frighteners, The (1996)",19-Jul-1996,,"http://us.imdb.com/M/title-exact?Frighteners,%..."
235,236,Citizen Ruth (1996),13-Dec-1996,,http://us.imdb.com/M/title-exact?Citizen%20Rut...
473,474,Dr. Strangelove or: How I Learned to Stop Worr...,01-Jan-1963,,http://us.imdb.com/M/title-exact?Dr.%20Strange...
115,116,Cold Comfort Farm (1995),23-Apr-1996,,http://us.imdb.com/M/title-exact?Cold%20Comfor...
266,267,unknown,,,


Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
115,116,Cold Comfort Farm (1995),23-Apr-1996,,http://us.imdb.com/M/title-exact?Cold%20Comfor...
98,99,Snow White and the Seven Dwarfs (1937),01-Jan-1937,,http://us.imdb.com/M/title-exact?Snow%20White%...
179,180,Apocalypse Now (1979),01-Jan-1979,,http://us.imdb.com/M/title-exact?Apocalypse%20...
119,120,Striptease (1996),28-Jun-1996,,http://us.imdb.com/M/title-exact?Striptease%20...
172,173,"Princess Bride, The (1987)",01-Jan-1987,,http://us.imdb.com/M/title-exact?Princess%20Br...
256,257,Men in Black (1997),04-Jul-1997,,http://us.imdb.com/M/title-exact?Men+in+Black+...
13,14,"Postino, Il (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Postino,%20Il..."
123,124,Lone Star (1996),21-Jun-1996,,http://us.imdb.com/M/title-exact?Lone%20Star%2...
109,110,Operation Dumbo Drop (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Operation%20D...
286,287,Marvin's Room (1996),18-Dec-1996,,http://us.imdb.com/M/title-exact?Marvin's%20Ro...
