In [1]:
import pandas as pd
import numpy as np
import torch
import sys
import torch.nn as nn
import traceback
import random
import torch.nn.functional as F
import os
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer

np.random.seed(42)

In [2]:
class DatasetCFG:
    data_root = 'ml-25m'
    num_negatives = 1

### 获取训练数据


In [3]:
class MovieLensDataset(Dataset):
    '''
        ratings_data: userId|user_behavior|rating|movieId|title|genres
    '''

    def __init__(self, ratings_data, mode='train'):
        self.user_ids, self.user_behaviors, self.movie_ids, self.movie_titles, self.movie_genres, self.labels = self.generate_dataset(
            ratings_data, mode)

    def __getitem__(self, index):
        return self.user_ids[index], self.user_behaviors[index], self.movie_ids[index], self.movie_titles[index], self.movie_genres[index], self.labels[index]

    def __len__(self):
        return len(self.user_ids)

    def generate_dataset(self, ratings_data, mode='train'):
        movie_id_title_genres_list = list(
            set(zip(ratings_data['movieId'], ratings_data['title'], ratings_data['genres'])))

        user_item_set = set(
            zip(ratings_data['userId'], ratings_data['movieId']))
        user_ids, user_behaviors, movie_ids, movie_titles, movie_genres, labels = [], [], [], [], [], []
        for rating in ratings_data.itertuples():
            user_id = getattr(rating, 'userId')
            user_behavior = getattr(rating, 'user_behavior')
            movie_id = getattr(rating, 'movieId')
            movie_title = getattr(rating, 'title')
            movie_genre = getattr(rating, 'genres')
            if mode == 'train':
                for _ in range(DatasetCFG.num_negatives):
                    negative_movie = random.choice(movie_id_title_genres_list)
                    while (user_id, negative_movie[0]) in user_item_set:
                        negative_movie = random.choice(
                            movie_id_title_genres_list)
                    user_ids.append(str(user_id))
                    user_behaviors.append(user_behavior)
                    movie_ids.append(str(negative_movie[0]))
                    movie_titles.append(negative_movie[1])
                    movie_genres.append(negative_movie[2])
                    labels.append(0)
            user_ids.append(str(user_id))
            user_behaviors.append(user_behavior)
            movie_ids.append(str(movie_id))
            movie_titles.append(movie_title)
            movie_genres.append(movie_genre)
            labels.append(1)
        return user_ids, user_behaviors, movie_ids, movie_titles, movie_genres, labels


def spilt_train_test(file_name='ratings_data_process_0001.csv'):
    ratings_data = pd.read_csv(os.path.join(
        DatasetCFG.data_root, file_name))
    ratings_train = ratings_data[ratings_data['rank_latest'] != ratings_data.groupby(
        'userId')['rank_latest'].transform('max')]
    ratings_test = ratings_data[ratings_data['rank_latest'] == ratings_data.groupby(
        'userId')['rank_latest'].transform('max')]
    return ratings_train, ratings_test

In [4]:
class Net(nn.Module):
    def __init__(self, device):
        super(Net, self).__init__()
        self.device = device
        self.embedding = SentenceTransformer(
            'models/all_datasets_v4_MiniLM-L6')
        for param in self.embedding.parameters():
            param.requires_grad = False
        self.user_fc1 = nn.Linear(384*2, 512)
        self.user_fc2 = nn.Linear(512, 128)
        self.movie_fc1 = nn.Linear(384*3, 512)
        self.movie_fc2 = nn.Linear(512, 128)
        self.relu = nn.ReLU()
        self.to(device)

    def forward(self, user_id, user_behavior, movie_id, movie_title, movie_genre):
        # display(user_id,user_behavior,movie_id,movie_title,movie_genre)
        user_id_embedding = torch.tensor(
            self.embedding.encode(user_id)).requires_grad_(True)
        user_behavior_embedding = torch.tensor(
            self.embedding.encode(user_behavior)).requires_grad_(True)
        movie_id_embedding = torch.tensor(self.embedding.encode(
            movie_id), requires_grad=True).requires_grad_(True)
        movie_title_embedding = torch.tensor(self.embedding.encode(
            movie_title), requires_grad=True).requires_grad_(True)
        movie_genre_embedding = torch.tensor(self.embedding.encode(
            movie_genre), requires_grad=True).requires_grad_(True)
        # display(user_id_embedding.shape,user_behavior_embedding.shape,movie_id_embedding.shape,movie_title_embedding.shape,movie_genre_embedding.shape)

        user_embedding = torch.cat(
            [user_id_embedding, user_behavior_embedding], dim=1)
        movie_embedding = torch.cat(
            [movie_id_embedding, movie_title_embedding, movie_genre_embedding], dim=1)

        user_embedding = user_embedding.to(self.device)
        movie_embedding = movie_embedding.to(self.device)

        user_out = self.user_fc1(user_embedding)
        user_out = self.relu(user_out)
        user_out = self.user_fc2(user_out)

        movie_out = self.movie_fc1(movie_embedding)
        movie_out = self.relu(movie_out)
        movie_out = self.movie_fc2(movie_out)

        result = user_out*movie_out
        result = F.softmax(torch.sum(result, dim=1))

        return result

In [5]:
def train(net, train_dataloader, epochs, device):
    net.train()
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
    for epoch in range(epochs):
        for user_id, user_behavior, movie_id, movie_title, movie_genre, labels in tqdm(train_dataloader):
            optimizer.zero_grad()
            outputs = net(user_id, user_behavior, movie_id,
                          movie_title, movie_genre)
            labels = labels.to(device)
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
        print('epoch %d loss: %.3f' % (epoch + 1, loss.item()))

In [6]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

ratings_train, ratings_test = spilt_train_test('ratings_data_process_0001.csv')
train_dataset = MovieLensDataset(ratings_train, mode='train')
test_dataset = MovieLensDataset(ratings_test, mode='test')

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

model = Net(device)
optimizer = torch.optim.SGD(
    filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)
criterion = nn.MSELoss()

for epoch in range(5):
    for batch in tqdm(train_dataloader, total=len(train_dataloader)):
        user_id, user_behavior, movie_id, movie_title, movie_genre, label = batch
        label = torch.tensor(label).float().to(device)
        optimizer.zero_grad()
        out = model(user_id, user_behavior, movie_id, movie_title, movie_genre)
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()
    print("epoch:{},loss:{}".format(epoch, loss.item()))

  label = torch.tensor(label).float().to(device)
  result = F.softmax(torch.sum(result, dim=1))
100%|██████████| 1453/1453 [01:48<00:00, 13.41it/s]


epoch:0,loss:0.4366406798362732


100%|██████████| 1453/1453 [01:36<00:00, 15.01it/s]


epoch:1,loss:0.5300308465957642


100%|██████████| 1453/1453 [01:43<00:00, 14.05it/s]


epoch:2,loss:0.37444034218788147


100%|██████████| 1453/1453 [01:43<00:00, 13.99it/s]


epoch:3,loss:0.4988362789154053


100%|██████████| 1453/1453 [01:44<00:00, 13.89it/s]


epoch:4,loss:0.4366995096206665


In [7]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
model = SentenceTransformer('models/all_datasets_v4_MiniLM-L6')
for _ in tqdm(range(10000)):
    text1 = 'dsadsad sdasdasdas dsadas'
    encode1 = torch.tensor(model.encode(text1, device=device))

100%|██████████| 10000/10000 [00:43<00:00, 232.26it/s]
