# Training a recommender system on a large dataset
Train a recommender system using a large fast.ai curated dataset



In [1]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.collab import *

In [2]:
# set up the notebook for fast.ai
fastbook.setup_book()

# Ingest the dataset
- define the path object
- define a dataframe to contain the dataset

In [3]:
# ingest the curated recommender system dataset ML_SAMPLE
path = untar_data(URLs.ML_100k)

In [4]:
# examine the directory structure
path.ls()

(#23) [Path('/storage/data/ml-100k/ua.base'),Path('/storage/data/ml-100k/README'),Path('/storage/data/ml-100k/u4.test'),Path('/storage/data/ml-100k/u.genre'),Path('/storage/data/ml-100k/u.item'),Path('/storage/data/ml-100k/u2.test'),Path('/storage/data/ml-100k/u.user'),Path('/storage/data/ml-100k/u5.test'),Path('/storage/data/ml-100k/u.occupation'),Path('/storage/data/ml-100k/u5.base')...]

In [5]:
# dataset structure
'''
├── README
├── allbut.pl
├── mku.sh
├── u.data
├── u.genre
├── u.info
├── u.item
├── u.occupation
├── u.user
├── u1.base
├── u1.test
├── u2.base
├── u2.test
├── u3.base
├── u3.test
├── u4.base
├── u4.test
├── u5.base
├── u5.test
├── ua.base
├── ua.test
├── ub.base
└── ub.test
'''

'\n├── README\n├── allbut.pl\n├── mku.sh\n├── u.data\n├── u.genre\n├── u.info\n├── u.item\n├── u.occupation\n├── u.user\n├── u1.base\n├── u1.test\n├── u2.base\n├── u2.test\n├── u3.base\n├── u3.test\n├── u4.base\n├── u4.test\n├── u5.base\n├── u5.test\n├── ua.base\n├── ua.test\n├── ub.base\n└── ub.test\n'

In [6]:
# ingest the u.data file into a Pandas dataframe
# the data is this file is tab separated
df_data = pd.read_csv(path/'u.data', delimiter = '\t',header = None,names = ['userId','movieId','rating','timestamp'])

In [7]:
# ingest the u.item file into a dataframe
# the data in this file is vertical bar ('|') separated
df_item = pd.read_csv(path/'u.item', delimiter = '|',header = None,encoding = "ISO-8859-1")

# Examine the dataset

In [8]:
# examine the first few records in the df_data dataframe
df_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [9]:
df_data.shape

(100000, 4)

In [10]:
# examine the first few records in the df_item dataframe
df_item.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%20(1995),0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(1995),0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995),0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [11]:
df_item.shape

(1682, 24)

# Combine the dataframes
- combine the two dataframes, df_data and df_item, to get a single dataframe that contains the user ID, movie ID, and movie title

In [12]:
# slice off just the first two columns of df_item and label columns
df_item = df_item.iloc[:,0:2]
df_item.columns = ['movieId','movieName']
df_item.head()

Unnamed: 0,movieId,movieName
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [13]:
# merge the dataframes
df = pd.merge(df_data,df_item,on=['movieId'],how='left')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,movieName
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [14]:
# get the number of records in the dataset
df.shape

(100000, 5)

In [15]:
# get the count of unique values in each column of the dataset
df.nunique()

userId         943
movieId       1682
rating           5
timestamp    49282
movieName     1664
dtype: int64

In [16]:
# count the number of missing values in each column of the dataset
df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
movieName    0
dtype: int64

In [17]:
df['rating'].nunique()

5

In [18]:
# defined a CollabDataLoaders object
dls=CollabDataLoaders.from_df(df,item_name='movieName',bs= 64)

In [19]:
dls.show_batch()

Unnamed: 0,userId,movieName,rating
0,752,In & Out (1997),4
1,810,Money Talks (1997),3
2,308,Mimic (1997),2
3,354,"Grand Day Out, A (1992)",3
4,326,Once Upon a Time in the West (1969),2
5,379,Back to the Future (1985),5
6,385,Vertigo (1958),5
7,13,Army of Darkness (1993),1
8,244,Dave (1993),1
9,325,"Abyss, The (1989)",1


# Define and train the model

In [20]:
# define the model
learn=collab_learner(dls,y_range= [ 1 , 5 ] )

In [21]:
# train the model
learn.fit_one_cycle( 5 )

epoch,train_loss,valid_loss,time
0,1.25232,1.224364,00:13
1,0.905251,0.914894,00:13
2,0.846929,0.868641,00:13
3,0.807095,0.853929,00:13
4,0.805621,0.851817,00:13


In [22]:
# get a selection of movies with high bias
# this cell is adapted from Howard & Gugger
movie_bias = learn.model.i_bias.weight.squeeze()
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes['movieName'][i] for i in idxs]
 

['Titanic (1997)',
 'L.A. Confidential (1997)',
 "Schindler's List (1993)",
 'Shawshank Redemption, The (1994)',
 'Good Will Hunting (1997)']

In [23]:
# get a selection of movies with low bias
# this cell is adapted from Howard & Gugger
idxs = movie_bias.argsort(descending=False)[:5]
[dls.classes['movieName'][i] for i in idxs]

['Leave It to Beaver (1997)',
 'Beautician and the Beast, The (1997)',
 'Lawnmower Man 2: Beyond Cyberspace (1996)',
 "McHale's Navy (1997)",
 'Children of the Corn: The Gathering (1996)']

In [26]:
# get the subset of ratings for one movie
df_one_movie = df[df.movieName=='Showgirls (1995)']
df_one_movie.head()

Unnamed: 0,userId,movieId,rating,timestamp,movieName
979,233,375,4,876374419,Showgirls (1995)
3969,201,375,3,884287140,Showgirls (1995)
6158,343,375,2,876406978,Showgirls (1995)
11568,291,375,1,874868791,Showgirls (1995)
31108,346,375,1,875266176,Showgirls (1995)


In [25]:
# get the average rating for this movie
df_one_movie['rating'].mean()

1.9565217391304348

# Exercise the trained model
- define a dataframe containing test data
- apply the trained model to the dataframe

In [43]:
# define the test dataframe
scoring_columns = ['userId','movieId','movieName']
test_df = pd.DataFrame(columns=scoring_columns)
test_df.at[0,'userId'] = 607
test_df.at[0,'movieId'] = 242
test_df.at[0,'movieName'] = 'Kolya (1996)'
test_df.at[1,'userId'] = 607
test_df.at[1,'movieId'] = 302
test_df.at[1,'movieName'] = 'L.A. Confidential (1997)'
test_df.at[2,'userId'] = 607
test_df.at[2,'movieId'] = 375
test_df.at[2,'movieName'] = 'Showgirls (1995)'
test_df.head()

Unnamed: 0,userId,movieId,movieName
0,607,242,Kolya (1996)
1,607,302,L.A. Confidential (1997)
2,607,375,Showgirls (1995)


In [42]:
# exercise the recommender system on the test dataframe
dl = learn.dls.test_dl(test_df)
learn.get_preds(dl=dl)

(tensor([4.2557, 4.3637, 2.2996]), None)

In [18]:
learn.summary()

epoch,train_loss,valid_loss,time
0,,00:00,


EmbeddingDotBias (Input shape: ['64 x 2'])
Layer (type)         Output Shape         Param #    Trainable 
Embedding            64 x 50              5,050      True      
________________________________________________________________
Embedding            64 x 50              5,050      True      
________________________________________________________________
Embedding            64 x 1               101        True      
________________________________________________________________
Embedding            64 x 1               101        True      
________________________________________________________________

Total params: 10,302
Total trainable params: 10,302
Total non-trainable params: 0

Optimizer used: <function Adam at 0x7ff50d5b48b0>
Loss function: FlattenedLoss of MSELoss()

Model unfrozen

Callbacks:
  - TrainEvalCallback
  - Recorder
  - ProgressCallback