# Training a recommender system
Train a recommender system using a fast.ai curated dataset

The example shown here is adapted from the paper by Howard and Gugger https://arxiv.org/pdf/2002.04688.pdf

In [1]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.collab import *

In [2]:
# set up the notebook for fast.ai
fastbook.setup_book()

# Ingest the dataset
- define the path object
- define a dataframe to contain the dataset

In [3]:
# ingest the curated recommender system dataset ML_SAMPLE
path = untar_data(URLs.ML_SAMPLE)

In [4]:
# examine the directory structure
path.ls()

(#1) [Path('/root/.fastai/data/movie_lens_sample/ratings.csv')]

In [5]:
# ingest the dataset into a Pandas dataframe
df = pd.read_csv(path/'ratings.csv')

# Examine the dataset

In [6]:
# examine the first few records in the dataframe
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,73,1097,4.0,1255504951
1,561,924,3.5,1172695223
2,157,260,3.5,1291598691
3,358,1210,5.0,957481884
4,130,316,2.0,1138999234


In [7]:
# get the number of records in the dataset
df.shape

(6031, 4)

In [8]:
# get the count of unique values in each column of the dataset
df.nunique()

userId        100
movieId       100
rating         10
timestamp    5609
dtype: int64

In [9]:
# count the number of missing values in each column of the dataset
df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [10]:
df['rating'].nunique()

10

In [11]:
# defined a CollabDataLoaders object
dls=CollabDataLoaders.from_df(df,bs= 64)

In [12]:
dls.show_batch()

Unnamed: 0,userId,movieId,rating
0,388,1923,5.0
1,607,1200,4.0
2,346,4993,4.0
3,607,153,3.0
4,150,1210,4.5
5,460,2716,5.0
6,481,1704,3.5
7,134,32,4.5
8,23,1136,4.0
9,105,2716,4.0


# Define and train the model

In [13]:
# define the model
learn=collab_learner(dls,y_range= [ 0 , 5.0 ] )

In [14]:
# train the model
learn.fit_one_cycle( 5 )

epoch,train_loss,valid_loss,time
0,2.609334,2.528086,00:00
1,2.315725,2.012042,00:00
2,1.701957,1.337556,00:00
3,1.26564,1.117061,00:00
4,1.097266,1.089355,00:00


# Exercise the trained model
- define a dataframe containing test data
- apply the trained model to the dataframe

In [15]:
#scoring_columns = ['userId','movieId','timestamp']
scoring_columns = ['userId','movieId']
test_df = pd.DataFrame(columns=scoring_columns)
test_df.at[0,'userId'] = 388
test_df.at[0,'movieId'] = 153
test_df.at[1,'userId'] = 607
test_df.at[1,'movieId'] = 1210
test_df.head()

Unnamed: 0,userId,movieId
0,388,153
1,607,1210


In [16]:
dl = learn.dls.test_dl(test_df)
learn.get_preds(dl=dl)

(tensor([2.4751, 3.6097]), None)

In [17]:
learn.summary()

EmbeddingDotBias (Input shape: 64)
Layer (type)         Output Shape         Param #    Trainable 
                     64 x 50             
Embedding                                 5050       True      
Embedding                                 5050       True      
____________________________________________________________________________
                     64 x 1              
Embedding                                 101        True      
Embedding                                 101        True      
____________________________________________________________________________

Total params: 10,302
Total trainable params: 10,302
Total non-trainable params: 0

Optimizer used: <function Adam at 0x7f19ae4ef160>
Loss function: FlattenedLoss of MSELoss()

Model unfrozen

Callbacks:
  - TrainEvalCallback
  - Recorder
  - ProgressCallback

In [18]:
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(101, 50)
  (i_weight): Embedding(101, 50)
  (u_bias): Embedding(101, 1)
  (i_bias): Embedding(101, 1)
)