In [2]:
import boto3
s3 = boto3.resource('s3')

In [5]:
s3.Bucket('recommendationsystemshubham').download_file('Nearest Neighbors Rec Engine/BX-Users.csv','datasets/BX-Users.csv')

In [6]:
s3.Bucket('recommendationsystemshubham').download_file('Nearest Neighbors Rec Engine/BX-Book-Ratings.csv','datasets/BX-Book-Ratings.csv') 

In [7]:
s3.Bucket('recommendationsystemshubham').download_file('Nearest Neighbors Rec Engine/BX-Books.csv','datasets/BX-Books.csv')

In [13]:
import pandas as pd
dataFile='datasets/BX-Book-Ratings.csv'
data=pd.read_csv(dataFile,sep=";",header=0,encoding = "ISO-8859-1", error_bad_lines=False,names=["user","isbn","rating"])

In [14]:
data.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [18]:
bookFile='datasets/BX-Books.csv'
books=pd.read_csv(bookFile,sep=";",header=0,error_bad_lines=False, encoding = "ISO-8859-1",usecols=[0,1,2],index_col=0,names=['isbn',"title","author"])

In [19]:
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [20]:
def bookMeta(isbn):
    title = books.at[isbn,"title"]
    author = books.at[isbn,"author"]
    return title, author
bookMeta("0671027360")

('Angels &amp; Demons', 'Dan Brown')

In [21]:
data = data[data["isbn"].isin(books.index)]

In [22]:
def faveBooks(user,N):
    userRatings = data[data["user"]==user]
    sortedRatings = pd.DataFrame.sort_values(userRatings,['rating'],ascending=[0])[:N] 
    sortedRatings["title"] = sortedRatings["isbn"].apply(bookMeta)
    return sortedRatings

In [23]:
faveBooks(204622,5)

Unnamed: 0,user,isbn,rating,title
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


In [24]:
data.shape

(1031175, 3)

In [25]:
usersPerISBN = data.isbn.value_counts()
usersPerISBN.head(10)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0142001740     615
067976402X     614
0671027360     586
0446672211     585
Name: isbn, dtype: int64

In [26]:
usersPerISBN.shape

(270170,)

In [27]:
ISBNsPerUser = data.user.value_counts()

In [28]:
ISBNsPerUser.shape

(92107,)

In [29]:
data = data[data["isbn"].isin(usersPerISBN[usersPerISBN>10].index)]

In [30]:
data = data[data["user"].isin(ISBNsPerUser[ISBNsPerUser>10].index)]

In [31]:
from scipy.sparse import coo_matrix
data['user'] = data['user'].astype("category")
data['isbn'] = data['isbn'].astype("category")

R = coo_matrix((data['rating'].astype(float),
                       (data['user'].cat.codes.copy(),
                        data['isbn'].cat.codes.copy())))

In [36]:
R.shape

(10706, 15451)

In [37]:
len(R.data)

405709

In [38]:
R.row[0]

10633

In [39]:
R.col[0]

3053

In [40]:
M,N = R.shape
K = 3

In [63]:
import numpy as np
U = np.random.rand(M,K)
P = np.random.rand(K,N)

In [64]:
R.data

array([0., 5., 0., ..., 0., 7., 0.])

In [65]:
data.loc[:,'rating'].head()

31    0
33    5
34    0
89    0
97    6
Name: rating, dtype: int64

In [66]:
R.row

array([10633, 10633, 10633, ..., 10632, 10632, 10632], dtype=int32)

In [67]:
data.loc[:,'user'].head()

31    276762
33    276762
34    276762
89    276798
97    276798
Name: user, dtype: category
Categories (10706, int64): [8, 99, 242, 243, ..., 278637, 278771, 278843, 278851]

In [68]:
R.col

array([ 3053,  4025,  7873, ...,  7167, 12395, 13146], dtype=int32)

In [69]:
data.loc[:,'isbn'].head()

31    034544003X
33    0380711524
34    0451167317
89    3423084049
97    3548603203
Name: isbn, dtype: category
Categories (15451, object): [0002005018, 0002251760, 0002259834, 0002558122, ..., 950491036X, 9681500830, 9681500954, 9871138016]

## This was quite tricky and it really took a lot of my time to understand.

#### After Reading coo_matrix doc I found that R.data[0] stores the first non null rating in matrix of users* books and therefore R.row[0]  and R.col[0] gives index of first non null rating for users and books respectively.

###### For example suppose A[m][n] is a sparse matrix now R.data[i] will contain value of ith non null element, R.row[i] will give index in range(m) and R.col[i] will give index in range(n)

Therefore, if A[j][k] contains ith non null entry then R.data[i] = A[j][k], R.row[i] = j and R.col[i] = k. :). phew.

In [71]:
from numpy.linalg import norm

def error(R,U,P,lamda=0.02):
    ratings = R.data
    rows = R.row
    cols = R.col
    e = 0 
    for ui in range(len(ratings)):
        rui=ratings[ui]
        u = rows[ui]
        i = cols[ui]
        if rui>0:
            e= e + pow(rui-np.dot(U[u,:],P[:,i]),2)+\
                lamda*(pow(norm(U[u,:]),2)+pow(norm(P[:,i]),2))
    return e

In [72]:
error(R,U,P)

7150288.344809474

In [73]:
rmse = np.sqrt(error(R,U,P)/len(R.data))

In [74]:
rmse

4.198116215214838

In [83]:
def SGD(R, K, lamda=0.02,steps=10, gamma=0.001):
    
    M,N = R.shape
    U = np.random.rand(M,K)
    P = np.random.rand(K,N)
    
    rmse = np.sqrt(error(R,U,P,lamda)/len(R.data))
    print("Initial RMSE: "+str(rmse))
    
    for step in range(steps):
        for ui in range(len(R.data)):
            rui=R.data[ui]
            u = R.row[ui]
            i = R.col[ui]
            if rui>0:
                eui=rui-np.dot(U[u,:],P[:,i])
                U[u,:]=U[u,:]+gamma*2*(eui*P[:,i]-lamda*U[u,:])
                P[:,i]=P[:,i]+gamma*2*(eui*U[u,:]-lamda*P[:,i])
        rmse = np.sqrt(error(R,U,P,lamda)/len(R.data))
        if rmse<0.5:
            break
    print("Final RMSE: "+str(rmse))
    return U,P

In [84]:
(U,P)=SGD(R,K=4,gamma=0.0007,lamda=0.01, steps=100)

Initial RMSE: 4.050678601008977
Final RMSE: 0.7439664118844737
