# Deep Learning Based Recommender Systems

**imports**

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

np.random.seed(123)

**reading and storing the data from a csv file into a data frame**

In [2]:
ratings = pd.read_csv(r"C:\Users\Anshul M\movie-ai\Deep-Learning-Based-Recommender-System\Data\rating.csv",
                      parse_dates = ['timestamp'])
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


**As we can see above, there is a lot of data and for the purpose of keeping it manageable, we used 25% of the ratings**

In [3]:
rand_userIds = np.random.choice(ratings['userId'].unique(), 
                                size=int(len(ratings['userId'].unique())*0.25), 
                                replace=False)

# mapping the new ratings to the older IDs
ratings = ratings.loc[ratings['userId'].isin(rand_userIds)]

print(len(ratings))
ratings

5009284


Unnamed: 0,userId,movieId,rating,timestamp
236,3,1,4.0,1999-12-11 13:36:47
237,3,24,3.0,1999-12-14 12:54:08
238,3,32,4.0,1999-12-11 13:14:07
239,3,50,5.0,1999-12-11 13:13:38
240,3,160,3.0,1999-12-14 12:54:08
...,...,...,...,...
19999803,138491,4128,4.0,2009-03-04 01:37:27
19999804,138491,6874,4.0,2009-07-09 23:48:57
19999805,138491,8961,2.5,2009-07-09 23:49:07
19999806,138491,33794,2.5,2009-07-09 23:48:54


**using the train-test split strategy**

In [4]:
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'] \
                                .rank(method='first', ascending=False)

ratings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'] \


Unnamed: 0,userId,movieId,rating,timestamp,rank_latest
236,3,1,4.0,1999-12-11 13:36:47,81.0
237,3,24,3.0,1999-12-14 12:54:08,10.0
238,3,32,4.0,1999-12-11 13:14:07,140.0
239,3,50,5.0,1999-12-11 13:13:38,143.0
240,3,160,3.0,1999-12-14 12:54:08,11.0
...,...,...,...,...,...
19999803,138491,4128,4.0,2009-03-04 01:37:27,13.0
19999804,138491,6874,4.0,2009-07-09 23:48:57,2.0
19999805,138491,8961,2.5,2009-07-09 23:49:07,1.0
19999806,138491,33794,2.5,2009-07-09 23:48:54,3.0


**Since we randomly chose certain ratings, we must re-adjust the index in order to make traversals easier**

In [6]:
ratings = ratings.reset_index(drop=True)
ratings

Unnamed: 0,userId,movieId,rating,timestamp,rank_latest
0,3,1,4.0,1999-12-11 13:36:47,81.0
1,3,24,3.0,1999-12-14 12:54:08,10.0
2,3,32,4.0,1999-12-11 13:14:07,140.0
3,3,50,5.0,1999-12-11 13:13:38,143.0
4,3,160,3.0,1999-12-14 12:54:08,11.0
...,...,...,...,...,...
5009279,138491,4128,4.0,2009-03-04 01:37:27,13.0
5009280,138491,6874,4.0,2009-07-09 23:48:57,2.0
5009281,138491,8961,2.5,2009-07-09 23:49:07,1.0
5009282,138491,33794,2.5,2009-07-09 23:48:54,3.0


**For getting the adjusted ratings, for each rating we can find the average rating of each user and subtract it by the orginal rating of each movie.
In doing this, we get positive and negative values for each rating
**

In [7]:
new_ratings = []

def avgId(id):
    uid = ratings[ratings.userId == id]
    return sum(uid.rating) / len(uid.rating)

for i in tqdm(range(0, 5009283)):
    new_rating = ratings['rating'][i] - avgId(ratings['userId'][i])
    new_ratings.append(new_rating)

new_ratings

  0%|          | 0/5009283 [00:00<?, ?it/s]

[-0.12299465240641716,
 -1.1229946524064172,
 -0.12299465240641716,
 0.8770053475935828,
 -1.1229946524064172,
 -2.122994652406417,
 0.8770053475935828,
 -1.1229946524064172,
 0.8770053475935828,
 0.8770053475935828,
 0.8770053475935828,
 0.8770053475935828,
 0.8770053475935828,
 -1.1229946524064172,
 -1.1229946524064172,
 -1.1229946524064172,
 0.8770053475935828,
 0.8770053475935828,
 0.8770053475935828,
 -2.122994652406417,
 0.8770053475935828,
 -0.12299465240641716,
 0.8770053475935828,
 -0.12299465240641716,
 -1.1229946524064172,
 -1.1229946524064172,
 -0.12299465240641716,
 0.8770053475935828,
 0.8770053475935828,
 -1.1229946524064172,
 -0.12299465240641716,
 0.8770053475935828,
 0.8770053475935828,
 -1.1229946524064172,
 -1.1229946524064172,
 0.8770053475935828,
 0.8770053475935828,
 -0.12299465240641716,
 -0.12299465240641716,
 0.8770053475935828,
 0.8770053475935828,
 -0.12299465240641716,
 0.8770053475935828,
 -0.12299465240641716,
 -0.12299465240641716,
 -1.1229946524064172,


**Adding the adjusted ratings to the original data frame**

In [21]:
ratings['Calculated Rating'] = new_ratings

In [22]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp,rank_latest,Calculated Rating
0,3,1,4.0,1999-12-11 13:36:47,81.0,-0.122995
1,3,24,3.0,1999-12-14 12:54:08,10.0,-1.122995
2,3,32,4.0,1999-12-11 13:14:07,140.0,-0.122995
3,3,50,5.0,1999-12-11 13:13:38,143.0,0.877005
4,3,160,3.0,1999-12-14 12:54:08,11.0,-1.122995
...,...,...,...,...,...,...
5009279,138491,4128,4.0,2009-03-04 01:37:27,13.0,1.318182
5009280,138491,6874,4.0,2009-07-09 23:48:57,2.0,1.318182
5009281,138491,8961,2.5,2009-07-09 23:49:07,1.0,-0.181818
5009282,138491,33794,2.5,2009-07-09 23:48:54,3.0,-0.181818
