In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import os
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler

In [47]:
class RatingDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.scaler = MinMaxScaler()
        self.data['rating'] = self.scaler.fit_transform(self.data['rating'].values.reshape(-1, 1))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        user = self.data['userId'].iloc[idx]
        item = self.data['itemId'].iloc[idx]
        rating = self.data['rating'].iloc[idx]
        return {'user': torch.tensor(user),
                'item': torch.tensor(item),
                'rating': torch.tensor(rating)}

In [58]:
class DANNModel(nn.Module):
    def __init__(self):
        super(DANNModel, self).__init__()
        self.feature_extractor = nn.Sequential(
            nn.Linear(2, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
        )
        self.classifier = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
        self.domain_classifier = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x, alpha=0.1):
        feature = self.feature_extractor(x)
        label_output = self.classifier(feature)
        domain_output = self.domain_classifier(feature)
        return label_output, domain_output
    
    def domain_loss(self, domain_output, domain_labels):
        domain_loss_fn = nn.BCELoss()
        return domain_loss_fn(domain_output, domain_labels)
    
    def label_loss(self, label_output, labels):
        label_loss_fn = nn.MSELoss()
        return label_loss_fn(label_output.squeeze(), labels)
        

In [69]:
def train_model(source_data, target_data, model, optimizer, device, epochs):
    source_dataset = RatingDataset(source_data)
    target_dataset = RatingDataset(target_data)
    source_loader = DataLoader(source_dataset, batch_size=32, shuffle=True)
    target_loader = DataLoader(target_dataset, batch_size=32, shuffle=True)
    
    for epoch in range(epochs):
        model.train()
        for batch_idx, source_batch in enumerate(source_loader):
            target_batch = next(iter(target_loader))
            
            source_inputs = torch.stack([source_batch['user'], source_batch['item']], dim=1).float().to(device)
            source_labels = source_batch['rating'].float().to(device)
            
            target_inputs = torch.stack([target_batch['user'], target_batch['item']], dim=1).float().to(device)

            domain_labels = torch.ones(source_inputs.shape[0] + target_inputs.shape[0], 1).to(device) # 32 + 32
            #print(source_inputs.shape[0], target_inputs.shape[0])
            domain_labels[:source_inputs.shape[0]] = 0
            
            optimizer.zero_grad()
            
            label_output, domain_output = model(source_inputs, alpha=0.1)
            domain_output = torch.cat((torch.zeros(source_inputs.shape[0], 1).to(device), domain_output), dim=0)
#            print(domain_output.shape) # 32
            label_loss = model.label_loss(label_output, source_labels)
            domain_loss = model.domain_loss(domain_output, domain_labels)
            
            loss = label_loss + domain_loss
            loss.backward()

            optimizer.step()

            if batch_idx % 100 == 0:
                print('Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLabel Loss: {:.6f}\tDomain Loss: {:.6f}'.format(
                    epoch, batch_idx * len(source_inputs), len(source_loader.dataset),
                    100. * batch_idx / len(source_loader), loss.item(), label_loss.item(), domain_loss.item()))
                
    


In [60]:

def test_model(target_data, model, device):
    target_dataset = RatingDataset(target_data)
    target_loader = DataLoader(target_dataset, batch_size=32, shuffle=True)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for target_batch in target_loader:
            target_inputs = torch.stack([target_batch['user'], target_batch['item']], dim=1).float().to(device)
            target_labels = target_batch['rating'].float().to(device)
            label_output, domain_output = model(target_inputs, alpha=0.1)
            test_loss += model.label_loss(label_output, target_labels).item() # sum up batch loss
            pred = label_output.squeeze().round()
            correct += pred.eq(target_labels.view_as(pred)).sum().item()

    test_loss /= len(target_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(target_loader.dataset),
        100. * correct / len(target_loader.dataset)))

In [32]:
source_datapath = os.path.join('..', 'Model', 'ml-latest-small', 'ratings.csv')
target_datapath = os.path.join('..', '..', 'Dataset', 'all_recipe_cocktail_reviews.json')

In [40]:
def main():
    source_data = pd.read_csv(source_datapath)
    target_data = pd.read_json(target_datapath)["cocktail_reviews"].apply(pd.Series)

    # userId == 'Allrecipes Member'인 데이터 제거
    target_data = target_data[target_data["User"] != "Allrecipes Member"]

    # json을 Dataframe으로 변환
    allRecipe_userMapping = {name:idx for idx, name in enumerate(target_data["User"].unique())}
    allRecipe_cocktailMapping = {name:idx for idx, name in enumerate(target_data["Cocktail"].unique())}

    # Dataframe의 User, Item을 숫자로 변환
    target_data["User"] = target_data["User"].apply(lambda x: allRecipe_userMapping[x]+1)
    target_data["Cocktail"] = target_data["Cocktail"].apply(lambda x: allRecipe_cocktailMapping[x]+1)

    target_data = target_data[["User", "Cocktail", "Rating", "Date"]]
    target_data.columns = ["userId", "cocktailId", "rating", "timestamp"]

    model = DANNModel()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    train_model(source_data, target_data, model, optimizer, device, epochs=10)
    test_model(target_data, model, device)

In [50]:
source_data = pd.read_csv(source_datapath)
target_data = pd.read_json(target_datapath)["cocktail_reviews"].apply(pd.Series)

# userId == 'Allrecipes Member'인 데이터 제거
target_data = target_data[target_data["User"] != "Allrecipes Member"]

# json을 Dataframe으로 변환
allRecipe_userMapping = {name:idx for idx, name in enumerate(target_data["User"].unique())}
allRecipe_cocktailMapping = {name:idx for idx, name in enumerate(target_data["Cocktail"].unique())}

# Dataframe의 User, Item을 숫자로 변환
target_data["User"] = target_data["User"].apply(lambda x: allRecipe_userMapping[x]+1)
target_data["Cocktail"] = target_data["Cocktail"].apply(lambda x: allRecipe_cocktailMapping[x]+1)

target_data = target_data[["User", "Cocktail", "Rating", "Date"]]
target_data.columns = ["userId", "itemId", "rating", "timestamp"]

source_data.columns = ["userId", "itemId", "rating", "timestamp"]


In [70]:
model = DANNModel()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DANNModel(
  (feature_extractor): Sequential(
    (0): Linear(in_features=2, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
  )
  (classifier): Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=1, bias=True)
    (3): Sigmoid()
  )
  (domain_classifier): Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=1, bias=True)
    (3): Sigmoid()
  )
)

In [71]:
train_model(source_data, target_data, model, optimizer, device, epochs=10)




ValueError: Using a target size (torch.Size([36, 1])) that is different to the input size (torch.Size([8, 1])) is deprecated. Please ensure they have the same size.

In [46]:
test_model(target_data, model, device)

KeyError: 'user_id'