<a href="https://colab.research.google.com/github/TaoM1992/gh-exercise/blob/master/hw1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Homework 1
#### Shiqi Tao

In [48]:
import pandas as pd

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm
import torch.nn.functional as F

In [49]:
train = pd.read_csv('book_ratings_train_v2.csv')
test = pd.read_csv('book_ratings_test_v2.csv')

In [50]:
train.shape, test.shape

((682295, 20), (29877, 20))

#### 1. First create a dictionary for users and books, then do the following.

In [51]:
user = train.user_id.unique()
book = train.isbn.unique()

user_dic = dict()
for i,u in enumerate(user):
    user_dic[u] = i 

book_dic = dict()
for i,b in enumerate(book):
    book_dic[b] = i 

In [52]:
num_users, num_books = len(user), len(book_dic)

In [53]:
for id in test.user_id.unique():
  if id not in user:
    test = test.drop(test[test.user_id==id].index)

In [54]:
for b in test.isbn.unique():
  if b not in book:
    test = test.drop(test[test.isbn==b].index)

- Create a DataSet class that outputs the user and book indices in a single tensor (in preparation for input into an nn.Embedding layer) and another tensor with the book rating.

In [55]:
class RatingDataset(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        user_idx = user_dic[row['user_id']]
        book_idx = book_dic[row['isbn']]
        
        x1 = torch.tensor(user_idx)
        x2 = torch.tensor(book_idx)
        
        y = torch.tensor(row['rating']).float()
        y_bin = torch.tensor(row['rating']>4).float()
        
        return x1, x2, y, y_bin

- Create a DataLoader for the training data and the test data.

In [56]:
rating_train_ds = RatingDataset(train)
rating_train_dl = DataLoader(rating_train_ds, batch_size=1000, shuffle=True)

In [57]:
rating_test_ds = RatingDataset(test)
rating_test_dl = DataLoader(rating_test_ds, batch_size=1000, shuffle=True)

#### 2. Create three different classes of models using nn.Module. You will need a second DataSet class for the third model.

- A model which predicts the rating a user will give to a book using Matrix Factorization (similar to what you did before in Dr. Interian’s course)

In [58]:
class MF_rating_v1(nn.Module):
    def __init__(self, num_users, num_books, emb_size=50):
        super(MF_rating_v1, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.book_emb = nn.Embedding(num_books, emb_size)
        # initlializing weights
        self.user_emb.weight.data.uniform_(0,0.05)
        self.book_emb.weight.data.uniform_(0,0.05)
        
    def forward(self, u, b):
        u = self.user_emb(u)
        b = self.book_emb(b)
        return (u*b).sum(1)   

- A model which predicts the rating a user will give to a book by embedding both the book and the user as 50-dimensional features, followed by a linear layers (Hint: it will look like nn.Linear(100, 1)).

In [59]:
class MF_rating_v2(nn.Module):
    def __init__(self, num_users, num_books, emb_size=50):
        super(MF_rating_v2, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.book_emb = nn.Embedding(num_books, emb_size)
        # initlializing weights
        self.user_emb.weight.data.uniform_(0,0.05)
        self.book_emb.weight.data.uniform_(0,0.05)
        self.linear = nn.Linear(emb_size * 2, 1)
        
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.book_emb(v)
        x = torch.cat((u, v), dim=1)
        x = self.linear(x)
        return torch.squeeze(x)

- A model which predicts the rating a user will give to a book by embedding both the book and the user in some feature space (dimension up to you) and by including at least two other features from the dataset (such as age, location, year of publication, etc.). Note that for categorical variables you will need to use more embedding layers! Feel free to use any techniques we learned last week in this model.

In [60]:
age = train.age.unique()
yp = train.year_of_publication.unique()

age_dic = dict()
for i,a in enumerate(age):
    age_dic[a] = i 

yp_dic = dict()
for i,y in enumerate(yp):
    yp_dic[y] = i 

In [61]:
num_age, num_yp = len(age), len(yp)

In [62]:
num_age, num_yp

(85, 79)

In [63]:
class RatingDataset_v2(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        user_idx = user_dic[row['user_id']]
        book_idx = book_dic[row['isbn']]
        age_idx = age_dic[row['age']]
        yp_idx = yp_dic[row['year_of_publication']]
        
        x1 = torch.tensor(user_idx)
        x2 = torch.tensor(book_idx)
        x3 = torch.tensor(age_idx)
        x4 = torch.tensor(yp_idx)
        
        y = torch.tensor(row['rating']).float()
        y_bin = torch.tensor(row['rating']>4).float()
        return x1, x2, x3, x4, y, y_bin

In [64]:
class MF_rating_v3(nn.Module):
    def __init__(self, num_users, num_books, num_age, num_yp, emb_size=50):
        super(MF_rating_v3, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.book_emb = nn.Embedding(num_books, emb_size)
        self.age_emb = nn.Embedding(num_age, emb_size)
        self.yp_emb = nn.Embedding(num_yp, emb_size)
        # initlializing weights
        self.user_emb.weight.data.uniform_(0,0.05)
        self.book_emb.weight.data.uniform_(0,0.05)
        self.linear = nn.Linear(emb_size * 4, 1)
        
    def forward(self, u, v, a, yp):
        u = self.user_emb(u)
        v = self.book_emb(v)
        a = self.age_emb(a)
        yp = self.yp_emb(yp)
        x = torch.cat((u, v, a, yp), dim=1)
        x = self.linear(x)
        return torch.squeeze(x)

#### 3. Initialize each of the three models and pass one batch through them to make sure they are working properly.
- MF_rating_v1

In [65]:
u, b, y, y_bin = next(iter(rating_train_dl))
m1 = MF_rating_v1(num_users, num_books)
optimizer = optim.Adam(m1.parameters(), lr = 0.01)


y_pred = m1(u, b)
nn.MSELoss()(y_pred, y)

tensor(20.5126, grad_fn=<MseLossBackward>)

- MF_rating_v2

In [66]:
u, b, y, y_bin = next(iter(rating_train_dl))
m2 = MF_rating_v2(num_users, num_books)
optimizer = optim.Adam(m2.parameters(), lr = 0.01)


y_pred = m2.forward(u, b)
nn.MSELoss()(y_pred, y)

tensor(20.7266, grad_fn=<MseLossBackward>)

- MF_rating_v3

In [67]:
rating_train_ds_v2 = RatingDataset_v2(train)
rating_train_dl_v2 = DataLoader(rating_train_ds_v2, batch_size=1000, shuffle=True)

In [76]:
rating_test_ds_v2 = RatingDataset_v2(test)
rating_test_dl_v2 = DataLoader(rating_test_ds_v2, batch_size=1000, shuffle=True)

In [68]:
u, b, a, yp, y, y_bin = next(iter(rating_train_dl_v2))
m3 = MF_rating_v3(num_users, num_books, num_age, num_yp)
optimizer = optim.Adam(m3.parameters(), lr = 0.01)


y_pred = m3.forward(u, b, a, yp)
nn.MSELoss()(y_pred, y)

tensor(20.0090, grad_fn=<MseLossBackward>)

4. Train each of the models for this regression task using an appropriate Loss function for at least two epochs. At the end of each epoch, print the accuracy of your model in predicting whether a user will rate a book as “good” (rating above 4) or as “bad” (rating 4 or below) for both the training and test sets.

- For context, I achieved around 57% percent accuracy on the test set after 5 epochs and 20 minutes on my laptop using the second model. I used a batch size of 10000 and Adam optimization with a learning rate of 0.01.

- You will not be graded on model performance, just being able to train the model and print the accuracy. The dataset is rather large, so if you are interested in pushing the performance and trying other methods I suggest using Google Colab or Kaggle GPUs.

- MF_rating_v1

In [71]:
m1 = MF_rating_v1(num_users, num_books)
optimizer = optim.Adam(m1.parameters(), lr = 0.001)

for epoch in tqdm(range(2)):
    total_loss = 0.0
    total_correct = 0
    total = 0
    m1.train()
    for u, b, y, y_bin in rating_train_dl:      
        y_hat = m1(u, b)
        #print(y_hat,y)
        loss = nn.MSELoss()(y_hat, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += u.size(0)*loss.item()
        total += u.size(0)
    train_loss = total_loss/total

    for u, b, y, y_bin in rating_test_dl:
        m1.eval()
        y_hat = m1(u, b)
        loss = nn.MSELoss()(y_hat, y)
        y_pred = (y_hat > 4).long()
        correct = (y_pred.float() == y_bin).float().sum()
        val_accuracy = correct/y_pred.shape[0]
        val_loss = loss.item()
    print("train_loss %.3f val_loss %.3f val_accuracy %.3f" % (train_loss, val_loss, val_accuracy)) 

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

train_loss 19.403 val_loss 25.872 val_accuracy 0.535
train_loss 15.784 val_loss 19.306 val_accuracy 0.577



- MF_rating_v2

In [72]:
m2 = MF_rating_v1(num_users, num_books)
optimizer = optim.Adam(m2.parameters(), lr = 0.001)

for epoch in tqdm(range(2)):
    total_loss = 0.0
    total_correct = 0
    total = 0
    m2.train()
    for u, b, y, y_bin in rating_train_dl:      
        y_hat = m2(u, b)
        #print(y_hat,y)
        loss = nn.MSELoss()(y_hat, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += u.size(0)*loss.item()
        total += u.size(0)
    train_loss = total_loss/total

    for u, b, y, y_bin in rating_test_dl:
        m2.eval()
        y_hat = m2(u, b)
        loss = nn.MSELoss()(y_hat, y)
        y_pred = (y_hat > 4).long()
        #print(y_pred_)
        correct = (y_pred.float() == y_bin).float().sum()
        val_accuracy = correct/y_pred.shape[0]
        val_loss = loss.item()
    print("train_loss %.3f val_loss %.3f val_accuracy %.3f" % (train_loss, val_loss, val_accuracy)) 

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

train_loss 19.403 val_loss 25.733 val_accuracy 0.536
train_loss 15.786 val_loss 20.381 val_accuracy 0.568



- MF_rating_v3

In [77]:
m3 = MF_rating_v3(num_users, num_books, num_age, num_yp)
optimizer = optim.Adam(m3.parameters(), lr = 0.001)

for epoch in tqdm(range(2)):
    total_loss = 0.0
    total_correct = 0
    total = 0
    m3.train()
    for u, b, a, yp, y, y_bin in rating_train_dl_v2:      
        y_hat = m3(u, b, a, yp)
        #print(y_hat,y)
        loss = nn.MSELoss()(y_hat, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += u.size(0)*loss.item()
        total += u.size(0)
    train_loss = total_loss/total

    for u, b, a, yp, y, y_bin in rating_test_dl_v2:
        m3.eval()
        y_hat = m3(u, b, a, yp)
        loss = nn.MSELoss()(y_hat, y)
        y_pred = (y_hat > 4).long()
        #print(y_pred_)
        correct = (y_pred.float() == y_bin).float().sum()
        val_accuracy = correct/y_pred.shape[0]
        val_loss = loss.item()
    print("train_loss %.3f val_loss %.3f val_accuracy %.3f" % (train_loss, val_loss, val_accuracy)) 

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

train_loss 12.696 val_loss 12.146 val_accuracy 0.700
train_loss 10.327 val_loss 10.743 val_accuracy 0.752



5. Pick one of the model architecture and use it directly predict whether a user will rate a book as “good” or “bad” (rather than through regression onto the rating). Constrast with the accuracy you obtained in the previous problem.

In [78]:
m1 = MF_rating_v1(num_users, num_books)
optimizer = optim.Adam(m1.parameters(), lr = 0.001)

for epoch in tqdm(range(2)):
    total_loss = 0.0
    total_correct = 0
    total = 0
    m1.train()
    for u, b, y, y_bin in rating_train_dl:      
        y_hat = m1(u, b)
        #print(y_hat,y)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += u.size(0)*loss.item()
        total += u.size(0)
    train_loss = total_loss/total

    for u, b, y, y_bin in rating_test_dl:
        m1.eval()
        y_hat = m1(u, b)
        loss = F.binary_cross_entropy_with_logits(y_hat, y_bin)
        y_pred = (y_hat > 0).float()
        correct = (y_pred.float() == y_bin).float().sum()
        val_accuracy = correct/y_pred.shape[0]
        val_loss = loss.item()
    print("train_loss %.3f val_loss %.3f val_accuracy %.3f" % (train_loss, val_loss, val_accuracy)) 

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

train_loss 0.034 val_loss 0.704 val_accuracy 0.571
train_loss -3.279 val_loss 0.963 val_accuracy 0.585

