# Training a recommender system
Train a recommender system using a fast.ai curated dataset

The example shown here is adapted from the paper by Howard and Gugger https://arxiv.org/pdf/2002.04688.pdf

In [58]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.collab import *

In [59]:
# set up the notebook for fast.ai
fastbook.setup_book()

# Ingest the dataset
- define the path object
- define a dataframe to contain the dataset

In [60]:
# ingest the curated recommender system dataset ML_SAMPLE
path = untar_data(URLs.ML_SAMPLE)

In [61]:
# examine the directory structure
path.ls()

(#1) [Path('/storage/data/movie_lens_sample/ratings.csv')]

In [62]:
# ingest the dataset into a Pandas dataframe
df = pd.read_csv(path/'ratings.csv')

# Examine the dataset

In [63]:
# examine the first few records in the dataframe
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,73,1097,4.0,1255504951
1,561,924,3.5,1172695223
2,157,260,3.5,1291598691
3,358,1210,5.0,957481884
4,130,316,2.0,1138999234


In [64]:
# get the number of records in the dataset
df.shape

(6031, 4)

In [65]:
# get the count of unique values in each column of the dataset
df.nunique()

userId        100
movieId       100
rating         10
timestamp    5609
dtype: int64

In [66]:
# count the number of missing values in each column of the dataset
df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [67]:
df['rating'].nunique()

10

In [68]:
# defined a CollabDataLoaders object
dls=CollabDataLoaders.from_df(df,bs= 64)

In [69]:
dls.show_batch()

Unnamed: 0,userId,movieId,rating
0,268,3578,5.0
1,648,344,3.0
2,423,344,1.5
3,88,296,3.5
4,475,1580,3.0
5,119,924,4.0
6,292,1265,3.5
7,598,4886,4.0
8,78,150,3.5
9,294,1198,4.5


# Define and train the model

In [70]:
# define the model
learn=collab_learner(dls,y_range= [ 0 , 5.0 ] )

In [71]:
# train the model
learn.fit_one_cycle( 5 )

epoch,train_loss,valid_loss,time
0,2.568535,2.594522,00:00
1,2.278521,2.051913,00:00
2,1.66185,1.358955,00:00
3,1.235316,1.137351,00:00
4,1.060143,1.107881,00:00


# Exercise the trained model
- define a dataframe containing test data
- apply the trained model to the dataframe

In [72]:
#scoring_columns = ['userId','movieId','timestamp']
scoring_columns = ['userId','movieId']
test_df = pd.DataFrame(columns=scoring_columns)
test_df.at[0,'userId'] = 388
test_df.at[0,'movieId'] = 153
test_df.at[1,'userId'] = 607
test_df.at[1,'movieId'] = 1210
test_df.head()

Unnamed: 0,userId,movieId
0,388,153
1,607,1210


In [73]:
dl = learn.dls.test_dl(test_df)
learn.get_preds(dl=dl)

(tensor([2.4156, 3.6090]), None)