# Training a recommender system on a standalone dataset
Train a recommender system with fastai using a standalone dataset

- This notebook ingests the Amazon product reviews dataset (electronics ratings) from Kaggle: https://www.kaggle.com/saurav9786/amazon-product-reviews



In [1]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.collab import *

In [2]:
# set up the notebook for fast.ai by running fastbook's notebook setup routine
fastbook.setup_book()

In [3]:
# run identifier: used as the suffix of the saved model's file name (recomm_<modifier>)
modifier = 'apr13'

# Ingest the dataset
- define the path object
- define a dataframe to contain the dataset

In [4]:
# ingest the standalone dataset: build the path object for the dataset directory
# this step assumes you have completed the steps in "Getting Ready"
# in section "Training a recommender system on a standalone dataset" of Chapter 5
path = URLs.path('amazon_reviews')

In [5]:
# examine the directory structure: list the contents of the dataset directory
# (ratings_Electronics.csv is the file read into a dataframe below)
path.ls()

(#1) [Path('/storage/archive/amazon_reviews/ratings_Electronics.csv')]

In [6]:
# load the ratings CSV into a Pandas dataframe
# the file has no header row, so the column names are supplied at load time, using the
# names described in https://www.kaggle.com/saurav9786/amazon-product-reviews
df = pd.read_csv(
    path/'ratings_Electronics.csv',
    header=None,
    names=['userID','productID','rating','timestamp'],
)

# Examine the dataset

In [7]:
# preview the first five records of the dataframe
df.head(n=5)

Unnamed: 0,userID,productID,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [8]:
# get the dimensions of the dataset as (number of records, number of columns)
df.shape

(7824482, 4)

In [9]:
# get the count of unique values in each column of the dataset
# (for userID and productID this is the number of distinct users and products with ratings)
df.nunique()

userID       4201696
productID     476002
rating             5
timestamp       5489
dtype: int64

In [10]:
# tally the missing values per column: flag each missing cell, then sum the flags by column
df.isna().sum()

userID       0
productID    0
rating       0
timestamp    0
dtype: int64

In [11]:
# confirm the number of distinct values that appear in the rating column
df['rating'].nunique()

5

In [12]:
%%time
# defined a CollabDataLoaders object
dls=CollabDataLoaders.from_df(df,bs= 64)

CPU times: user 36.7 s, sys: 3.86 s, total: 40.6 s
Wall time: 39.2 s


In [13]:
# display a sample batch from the dataloaders to check the user, product and rating columns
dls.show_batch()

Unnamed: 0,userID,productID,rating
0,APSVFXSVU0P6C,B008ABOJKS,4.0
1,A2C2TQICKW8W8,B00BQ4SBSM,2.0
2,A1I2HYPP41PIAF,B000BKJZ9Q,2.0
3,A6XLG77BC9R8R,B003A4H4VQ,5.0
4,A2NYK9KWFMJV4Y,B002JDVBYU,5.0
5,A1H9OR8UASFIR6,B000BMAQAQ,4.0
6,A1E7USO8M79M7A,B0018JV6X2,1.0
7,A1N6RWK9XBXG3T,B007B31IYQ,5.0
8,A5NBOXDPQ75RJ,B006202R44,5.0
9,A29ZTEO6EKSRDV,B004S4XNKI,3.0


# Define and train the model

In [14]:
%%time
# define the model
learn=collab_learner(dls,y_range= [ 0 , 5.0 ] )

CPU times: user 8.66 s, sys: 671 ms, total: 9.33 s
Wall time: 5.65 s


In [15]:
%%time
# train the model
learn.fit_one_cycle( 1 )

epoch,train_loss,valid_loss,time
0,2.829242,2.826828,3:36:08


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



CPU times: user 2h 27min 1s, sys: 1h 9min 51s, total: 3h 36min 53s
Wall time: 3h 36min 8s


# Exercise the trained model
- define a dataframe containing test data
- apply the trained model to the dataframe

In [16]:
# set values for test dataframe
# the frame is built in one step from a list of records rather than by growing an
# empty frame one cell at a time with .at, which is slower and harder to read
scoring_columns = ['userID','productID']
test_records = [
    {'userID': 'A2NYK9KWFMJV4Y', 'productID': 'B008ABOJKS'},
    {'userID': 'A29ZTEO6EKSRDV', 'productID': 'B006202R44'},
]
# columns= pins the column order to scoring_columns
test_df = pd.DataFrame(test_records, columns=scoring_columns)
test_df.head()

Unnamed: 0,userID,productID
0,A2NYK9KWFMJV4Y,B008ABOJKS
1,A29ZTEO6EKSRDV,B006202R44


In [17]:
# build a test dataloader for the test dataframe from the learner's dataloaders
dl = learn.dls.test_dl(test_df)
# apply the trained model to the test rows; the result is a tuple whose first element
# holds the predicted ratings (the second element is None in the recorded output;
# test_df carries no rating column)
learn.get_preds(dl=dl)

(tensor([4.4364, 2.5531]), None)

In [18]:
# show the model's layers and parameter counts, plus the optimizer, loss function and callbacks
learn.summary()

epoch,train_loss,valid_loss,time
0,,00:00,


EmbeddingDotBias (Input shape: ['64 x 2'])
Layer (type)         Output Shape         Param #    Trainable 
Embedding            64 x 50              210,084,850 True      
________________________________________________________________
Embedding            64 x 50              23,800,150 True      
________________________________________________________________
Embedding            64 x 1               4,201,697  True      
________________________________________________________________
Embedding            64 x 1               476,003    True      
________________________________________________________________

Total params: 238,562,700
Total trainable params: 238,562,700
Total non-trainable params: 0

Optimizer used: <function Adam at 0x7feeb33e0820>
Loss function: FlattenedLoss of MSELoss()

Model unfrozen

Callbacks:
  - TrainEvalCallback
  - Recorder
  - ProgressCallback

In [19]:
# save the model - first save the current path so it can be restored after the model is saved
keep_path = learn.path

In [20]:
# check the learner's current path
learn.path

Path('.')

In [21]:
# temporarily point the learner at the directory where the model should be saved
# (presumably chosen because it is writeable - confirm)
# NOTE(review): this absolute path is specific to this environment - adjust it for yours
learn.path = Path('/notebooks/temp')

In [22]:
# check the name of the subdirectory, under learn.path, where saved models are written
learn.model_dir

'models'

In [23]:
# save the trained model under a name that carries the run modifier
learn.save(f'recomm_{modifier}')

Path('/notebooks/temp/models/recomm_apr13.pth')

In [24]:
# restore the learner's original path now that the model has been saved
learn.path = keep_path