In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import torch
from torch import nn


In [2]:
df_b = pd.read_csv('yelp_dataset/business.csv')

In [3]:
df_b['id'] = df_b.index

In [4]:
df_b = df_b.drop(['address','city','state','postal_code','latitude','longitude','is_open','hours'],axis=1)

In [5]:
bid_to_id = {}
for i,r in df_b.iterrows():
    bid_to_id[r['business_id']] = r['id']

In [6]:
df_users = pd.read_csv('yelp_dataset/users.csv')

In [7]:
df_users['id'] = df_users.index

In [8]:
uid_to_id = {}
for i,r in df_users.iterrows():
    uid_to_id[r['user_id']] = r['id']

In [9]:
df_users = df_users.drop(['yelping_since','elite','friends'],axis=1)

In [10]:
len(df_users)

68587

In [11]:
def one_hot(idx, length):
   a = torch.zeros(length)
   a[idx] = 1
   return a

In [12]:
one_hot(1,10)

tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])

In [13]:
df_reviews = pd.read_csv('yelp_dataset/reviews.csv')

In [14]:
df_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,score,negative,neutral,positive,compound
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,"{'neg': 0.0, 'neu': 0.893, 'pos': 0.107, 'comp...",0.0,0.893,0.107,0.8597
1,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,"{'neg': 0.0, 'neu': 0.66, 'pos': 0.34, 'compou...",0.0,0.66,0.34,0.9588
2,Xs8Z8lmKkosqW5mw_sVAoA,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5.0,0,0,0,My absolute favorite cafe in the city. Their b...,2014-11-12 15:30:27,"{'neg': 0.025, 'neu': 0.738, 'pos': 0.237, 'co...",0.025,0.738,0.237,0.9679
3,G_5UczbCBJriUAbxz3J7Tw,clWLI5OZP2ad25ugMVI8gg,x4XdNhp0Xn8lOivzc77J-g,5.0,0,0,0,Best thai food in the area. Everything was au...,2013-08-15 15:27:51,"{'neg': 0.0, 'neu': 0.586, 'pos': 0.414, 'comp...",0.0,0.586,0.414,0.891
4,DyrAIuKl60j_X8Yrrv-kpg,mNsVyC9tQVYtzLOCbh2Piw,MWmXGQ98KbRo3vsS5nZhMA,5.0,1,0,0,I recently had dinner here with my wife over t...,2014-10-27 02:47:28,"{'neg': 0.026, 'neu': 0.753, 'pos': 0.221, 'co...",0.026,0.753,0.221,0.9646


In [15]:
df_reviews['user_id'] = df_reviews['user_id'].map(uid_to_id)
df_reviews['business_id'] = df_reviews['business_id'].map(bid_to_id)

In [16]:
df_ncf = df_reviews.drop(['review_id','useful','funny','cool','text','date','score','negative','neutral','positive'],axis=1)

In [17]:
df_ncf.head()

Unnamed: 0,user_id,business_id,stars,compound
0,19125,98,3.0,0.8597
1,23563,334,5.0,0.9588
2,5260,259,5.0,0.9679
3,3177,332,5.0,0.891
4,20118,458,5.0,0.9646


In [18]:
num_users, num_businesses = len(df_users), len(df_b)

In [19]:
df_ncf2 = df_ncf.copy()

In [20]:
df_ncf2['score'] = df_ncf2['stars'] + df_ncf2['compound']

In [21]:
df_ncf2 = df_ncf2.drop(['stars','compound'],axis=1)

In [22]:
df_ncf2 = df_ncf2.sample(frac=1)

In [94]:
class NCF(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        
        self.embed = nn.Embedding(self.input_size, self.embed_size)
        self.fc1 = nn.Linear(2 * self.embed_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.hidden_size)
        self.fc3 = nn.Linear(self.hidden_size, 1)
        self.relu = nn.ReLU()
    def forward(self, x):
        user_embed = self.embed(x[0])
        item_embed = self.embed(x[1])
        out = self.fc1(torch.concat((user_embed,item_embed)))
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out


In [95]:
input_size = num_users + num_businesses
embed_size = 32
hidden_size = 128
model = NCF(input_size=input_size, embed_size=embed_size, hidden_size=hidden_size)

In [96]:
print(model)

NCF(
  (embed): Embedding(75901, 32)
  (fc1): Linear(in_features=64, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=1, bias=True)
  (relu): ReLU()
)


In [68]:
x, y = torch.tensor(df_ncf2.iloc[:,:-1].values), torch.tensor(df_ncf2.iloc[:,-1].values)

In [44]:
split = 0.8
split_idx = int(len(x) * split)
train_x, test_x, train_y, test_y = x[:split_idx], x[split_idx:], y[:split_idx], y[split_idx:]
data = (train_x, test_x, train_y, test_y)


In [107]:
def train(model, data, epochs, optimizer, criterion):
    train_x, test_x, train_y, test_y = data
    
    for epoch in range(epochs):
        train_loss = 0
        test_loss = 0
        for i in range(len(train_x)):
            x,y = train_x[i], train_y[i].to(torch.float32)
            model.zero_grad()
            pred_y = model(x)
            loss = criterion(y, pred_y)
            train_loss += loss
            loss.backward()
            optimizer.step()
            if i % 100000 == 0 and i != 0:
                b = train_loss / i
                print(f'train_loss ({i}/{len(train_x)}): {b}')
            

        for i in range(len(test_x)):
            x,y = test_x[i], test_y[i]

            with torch.no_grad():
                pred_y = model(x)
                loss = criterion(y, pred_y)
                test_loss += loss
            
        train_loss /= len(train_x)
        test_loss /= len(test_x)

        print(f'Epoch {epoch+1}:\t Train Loss:{train_loss}\t Test Loss:{test_loss}')



In [109]:
learning_rate = 1e-3
reg_rate = 1e-6
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate,weight_decay=reg_rate)
criterion = nn.MSELoss()
epochs = 4


In [110]:
train(model, data, epochs, optimizer, criterion)

train_loss (1000/500882): 2.543792486190796
train_loss (2000/500882): 2.687988758087158
train_loss (3000/500882): 2.673556327819824
train_loss (4000/500882): 2.714850425720215
train_loss (5000/500882): 2.7638819217681885
train_loss (6000/500882): 2.7824065685272217
train_loss (7000/500882): 2.7613420486450195
train_loss (8000/500882): 2.760427236557007
train_loss (9000/500882): 2.780895948410034
train_loss (10000/500882): 2.7654783725738525
train_loss (11000/500882): 2.7550716400146484
train_loss (12000/500882): 2.7615208625793457
train_loss (13000/500882): 2.7493019104003906
train_loss (14000/500882): 2.7289795875549316
train_loss (15000/500882): 2.718489170074463
train_loss (16000/500882): 2.718994140625
train_loss (17000/500882): 2.710357189178467
train_loss (18000/500882): 2.6980860233306885
train_loss (19000/500882): 2.686115026473999
train_loss (20000/500882): 2.674257755279541
train_loss (21000/500882): 2.6662936210632324
train_loss (22000/500882): 2.6636886596679688
train_loss 

In [None]:
torch.save(model.state_dict(), 'ncf_params.pth')

g