In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import torch
from torch import nn
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

# On a GPU run, also report the name of device 0.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))

Using device: cuda
NVIDIA GeForce RTX 3070


In [4]:
# Load the business table from CSV (relative path, resolved against the working directory).
df_b = pd.read_csv('yelp_dataset/business.csv')

In [5]:
# Copy the row index into an `id` column; the lookup dict built below maps business_id -> this id.
df_b['id'] = df_b.index

In [6]:
# Discard the address, coordinate, open-status and hours columns; keep everything else.
df_b = df_b.drop(
    columns=[
        'address', 'city', 'state', 'postal_code',
        'latitude', 'longitude', 'is_open', 'hours',
    ]
)

In [7]:
# Lookup table: `business_id` value -> row `id`.
# Built straight from the two columns instead of looping over `iterrows()`,
# which constructs a Series for every row and is far slower on large frames.
# If a business_id appears more than once, the last row wins (same as the loop did).
bid_to_id = dict(zip(df_b['business_id'], df_b['id']))

In [8]:
# Load the users table from CSV (relative path, resolved against the working directory).
df_users = pd.read_csv('yelp_dataset/users.csv')

In [9]:
# Copy the row index into an `id` column; the lookup dict built below maps user_id -> this id.
df_users['id'] = df_users.index

In [10]:
# Lookup table: `user_id` value -> row `id`.
# Built straight from the two columns instead of looping over `iterrows()`,
# which constructs a Series for every row and is far slower on large frames.
# If a user_id appears more than once, the last row wins (same as the loop did).
uid_to_id = dict(zip(df_users['user_id'], df_users['id']))

In [11]:
# Remove the signup-date, elite and friends columns from the users table.
df_users = df_users.drop(columns=['yelping_since', 'elite', 'friends'])

In [12]:
# Number of rows in the users table.
len(df_users)

68587

In [13]:
def one_hot(idx, length):
    """Build a zero tensor of size `length` and set position(s) `idx` to 1.

    Args:
        idx: Index passed directly to tensor item assignment.
        length: Size passed to `torch.zeros`.

    Returns:
        The resulting tensor (default floating-point dtype).
    """
    encoded = torch.zeros(length)
    encoded[idx] = 1
    return encoded

In [14]:
# Quick check of the helper: index 1 in a length-10 vector.
one_hot(1,10)

tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [15]:
# Load the reviews table from CSV (relative path, resolved against the working directory).
df_reviews = pd.read_csv('yelp_dataset/reviews.csv')

In [16]:
# Preview the first rows of the raw reviews table.
df_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,score,negative,neutral,positive,compound
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,"{'neg': 0.0, 'neu': 0.893, 'pos': 0.107, 'comp...",0.0,0.893,0.107,0.8597
1,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,"{'neg': 0.0, 'neu': 0.66, 'pos': 0.34, 'compou...",0.0,0.66,0.34,0.9588
2,Xs8Z8lmKkosqW5mw_sVAoA,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5.0,0,0,0,My absolute favorite cafe in the city. Their b...,2014-11-12 15:30:27,"{'neg': 0.025, 'neu': 0.738, 'pos': 0.237, 'co...",0.025,0.738,0.237,0.9679
3,G_5UczbCBJriUAbxz3J7Tw,clWLI5OZP2ad25ugMVI8gg,x4XdNhp0Xn8lOivzc77J-g,5.0,0,0,0,Best thai food in the area. Everything was au...,2013-08-15 15:27:51,"{'neg': 0.0, 'neu': 0.586, 'pos': 0.414, 'comp...",0.0,0.586,0.414,0.891
4,DyrAIuKl60j_X8Yrrv-kpg,mNsVyC9tQVYtzLOCbh2Piw,MWmXGQ98KbRo3vsS5nZhMA,5.0,1,0,0,I recently had dinner here with my wife over t...,2014-10-27 02:47:28,"{'neg': 0.026, 'neu': 0.753, 'pos': 0.221, 'co...",0.026,0.753,0.221,0.9646


In [17]:
# Replace the raw id values in the reviews with the ids from the lookup dicts.
# `Series.map` silently produces NaN for any key missing from the dict, which
# also happens if this cell is re-run on already-mapped ids. Validate the
# mapped columns first so `df_reviews` is never overwritten with NaNs.
mapped_user_ids = df_reviews['user_id'].map(uid_to_id)
mapped_business_ids = df_reviews['business_id'].map(bid_to_id)

n_bad_users = int(mapped_user_ids.isna().sum())
n_bad_businesses = int(mapped_business_ids.isna().sum())
if n_bad_users or n_bad_businesses:
    raise ValueError(
        f'Could not map {n_bad_users} user_id and {n_bad_businesses} business_id '
        'values to integer ids (unknown ids, or this cell was run twice).'
    )

df_reviews['user_id'] = mapped_user_ids
df_reviews['business_id'] = mapped_business_ids

In [18]:
# Drop the review text, metadata and per-class sentiment columns,
# leaving every column not listed here.
df_ncf = df_reviews.drop(
    columns=[
        'review_id', 'useful', 'funny', 'cool', 'text', 'date',
        'score', 'negative', 'neutral', 'positive',
    ]
)

In [19]:
# Preview the reduced table after the column drop.
df_ncf.head()

Unnamed: 0,user_id,business_id,stars,compound
0,19125,98,3.0,0.8597
1,23563,334,5.0,0.9588
2,5260,259,5.0,0.9679
3,3177,332,5.0,0.891
4,20118,458,5.0,0.9646


In [20]:
# Row counts of the users and business tables.
num_users = len(df_users)
num_businesses = len(df_b)

In [21]:
# Work on an independent (deep) copy so `df_ncf` itself is left unchanged.
df_ncf2 = df_ncf.copy(deep=True)

In [22]:
# Blend the star rating with the `compound` sentiment value into one score:
# shift/scale `compound` by (x + 1) * 2.5, add the stars, then divide by 10.
# NOTE(review): presumably stars are 1-5 and compound is in [-1, 1], which would
# put the score in [0.1, 1.0] -- confirm against the dataset.
df_ncf2['score'] = (df_ncf2['stars'] + 2.5 * (df_ncf2['compound'] + 1)) / 10

In [23]:
# The two inputs are now folded into `score`, so remove them.
df_ncf2 = df_ncf2.drop(columns=['stars', 'compound'])

In [24]:
# Shuffle all rows (frac=1 samples the whole frame without replacement).
# A fixed seed makes the row order identical on every Restart & Run All;
# without it each run produced a different ordering.
df_ncf2 = df_ncf2.sample(frac=1, random_state=42)
df_ncf2.head()

Unnamed: 0,user_id,business_id,score
394620,58508,6511,0.3693
601924,5802,2261,0.856975
338353,64323,5749,0.963875
411498,26263,6472,0.177725
268281,32215,412,0.889275


In [25]:
# Sparse user x business matrix in COO format: row = user_id, column = business_id,
# value = score.
# The shape is passed explicitly. When it is omitted, SciPy infers it from the
# largest row/column index present, so users or businesses at the end of the id
# range that have no reviews would silently shrink the matrix.
user_item_matrix = sp.coo_matrix(
    (df_ncf2["score"], (df_ncf2["user_id"], df_ncf2["business_id"])),
    shape=(num_users, num_businesses),
)

In [26]:
# (rows, columns) of the interaction table; the row count is the number of stored reviews.
df_ncf2.shape

(626103, 3)

In [27]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
user_item_matrix 

<68587x7314 sparse matrix of type '<class 'numpy.float64'>'
	with 626103 stored elements in COOrdinate format>

In [28]:
from scipy.sparse import coo_matrix

# Sparse user x business matrix in COO format: row = user_id, column = business_id,
# value = score.
# The shape is passed explicitly. When it is omitted, SciPy infers it from the
# largest row/column index present, so users or businesses at the end of the id
# range that have no reviews would silently shrink the matrix.
user_item_matrix = coo_matrix(
    (df_ncf2["score"], (df_ncf2["user_id"], df_ncf2["business_id"])),
    shape=(num_users, num_businesses),
)