# Training a recommender system on a large dataset
Train a recommender system using a large fast.ai curated dataset



In [1]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.collab import *

In [2]:
# set up the notebook for fast.ai
fastbook.setup_book()

# Ingest the dataset
- define the path object
- define a dataframe to contain the dataset

In [3]:
# ingest the curated recommender system dataset ML_SAMPLE
path = untar_data(URLs.ML_100k)

In [4]:
# examine the directory structure
path.ls()

(#23) [Path('/storage/data/ml-100k/ua.base'),Path('/storage/data/ml-100k/README'),Path('/storage/data/ml-100k/u4.test'),Path('/storage/data/ml-100k/u.genre'),Path('/storage/data/ml-100k/u.item'),Path('/storage/data/ml-100k/u2.test'),Path('/storage/data/ml-100k/u.user'),Path('/storage/data/ml-100k/u5.test'),Path('/storage/data/ml-100k/u.occupation'),Path('/storage/data/ml-100k/u5.base')...]

In [None]:
# dataset structure
'''
├── README
├── allbut.pl
├── mku.sh
├── u.data
├── u.genre
├── u.info
├── u.item
├── u.occupation
├── u.user
├── u1.base
├── u1.test
├── u2.base
├── u2.test
├── u3.base
├── u3.test
├── u4.base
├── u4.test
├── u5.base
├── u5.test
├── ua.base
├── ua.test
├── ub.base
└── ub.test
'''

In [21]:
# ingest the u.data file into a Pandas dataframe
# the data is this file is tab separated
df_data = pd.read_csv(path/'u.data', delimiter = '\t',header = None,names = ['userId','movieId','rating','timestamp'])

In [42]:
# ingest the u.item file into a dataframe
# the data in this file is vertical bar ('|') separated
df_item = pd.read_csv(path/'u.item', delimiter = '|',header = None,encoding = "ISO-8859-1")

# Examine the dataset

In [35]:
# examine the first few records in the df_data dataframe
df_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [43]:
# examine the first few records in the df_item dataframe
df_item.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%20(1995),0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(1995),0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995),0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [45]:
#df_item = df_item[df_item.columns[0,1]]
df_item = df_item.iloc[:,0:2]
df_item.columns = ['movieId','movieName']
df_item.head()

Unnamed: 0,movieId,movieName
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [8]:
# get the number of records in the dataset
df.shape

(6031, 4)

In [9]:
# get the count of unique values in each column of the dataset
df.nunique()

userId        100
movieId       100
rating         10
timestamp    5609
dtype: int64

In [10]:
# count the number of missing values in each column of the dataset
df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [11]:
df['rating'].nunique()

10

In [12]:
# defined a CollabDataLoaders object
dls=CollabDataLoaders.from_df(df,bs= 64)

In [13]:
dls.show_batch()

Unnamed: 0,userId,movieId,rating
0,388,1923,5.0
1,607,1200,4.0
2,346,4993,4.0
3,607,153,3.0
4,150,1210,4.5
5,460,2716,5.0
6,481,1704,3.5
7,134,32,4.5
8,23,1136,4.0
9,105,2716,4.0


# Define and train the model

In [14]:
# define the model
learn=collab_learner(dls,y_range= [ 0 , 5.0 ] )

In [15]:
# train the model
learn.fit_one_cycle( 5 )

epoch,train_loss,valid_loss,time
0,2.609334,2.528086,00:00
1,2.315725,2.012042,00:00
2,1.701957,1.337556,00:00
3,1.26564,1.117061,00:00
4,1.097266,1.089355,00:00


# Exercise the trained model
- define a dataframe containing test data
- apply the trained model to the dataframe

In [16]:
#scoring_columns = ['userId','movieId','timestamp']
scoring_columns = ['userId','movieId']
test_df = pd.DataFrame(columns=scoring_columns)
test_df.at[0,'userId'] = 388
test_df.at[0,'movieId'] = 153
test_df.at[1,'userId'] = 607
test_df.at[1,'movieId'] = 1210
test_df.head()

Unnamed: 0,userId,movieId
0,388,153
1,607,1210


In [17]:
dl = learn.dls.test_dl(test_df)
learn.get_preds(dl=dl)

(tensor([2.4751, 3.6097]), None)

In [18]:
learn.summary()

epoch,train_loss,valid_loss,time
0,,00:00,


EmbeddingDotBias (Input shape: ['64 x 2'])
Layer (type)         Output Shape         Param #    Trainable 
Embedding            64 x 50              5,050      True      
________________________________________________________________
Embedding            64 x 50              5,050      True      
________________________________________________________________
Embedding            64 x 1               101        True      
________________________________________________________________
Embedding            64 x 1               101        True      
________________________________________________________________

Total params: 10,302
Total trainable params: 10,302
Total non-trainable params: 0

Optimizer used: <function Adam at 0x7ff50d5b48b0>
Loss function: FlattenedLoss of MSELoss()

Model unfrozen

Callbacks:
  - TrainEvalCallback
  - Recorder
  - ProgressCallback