In [178]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
np.random.seed(42)

In [179]:
class DatasetCFG:
    data_root='ml-25m'
    user_chosen_num=1000
    num_negatives=4

### 获取训练数据


In [180]:
class MovieLensDataset(Dataset):
    '''
        ratings_data: userId|user_behavior|rating|movieId|title|genres
    '''
    def __init__(self,ratings_data,mode='train'):        
        self.user_ids,self.user_behaviors,self.movie_ids,self.movie_titles,self.movie_genres,self.labels=self.generate_dataset(ratings_data,mode)
        
    def __getitem__(self, index):
        return super().__getitem__(index)
        
    def __len__(self):
        return len(self.users)
    
    def generate_dataset(self,ratings_data,mode='train'):
        user_ids,user_behaviors,movie_ids,movie_titles,movie_genres,labels=[],[],[],[],[],[]
        for rating in ratings_data.itertuples():
            user_id=getattr(rating,'userId')
            user_behavior=getattr(rating,'user_behavior')
            movie_id=getattr(rating,'movieId')
            movie_title=getattr(rating,'title')
            movie_genre=getattr(rating,'genres')
            if mode=='train':
                for _ in range(DatasetCFG.num_negatives):
                    user_ids.append(user_id)
                    user_behaviors.append(user_behavior)
                    movie_ids.append(movie_id)
                    movie_titles.append(movie_title)
                    movie_genres.append(movie_genre)
                    labels.append(0)
            user_ids.append(user_id)
            user_behaviors.append(user_behavior)
            movie_ids.append(movie_id)
            movie_titles.append(movie_title)
            movie_genres.append(movie_genre)
            labels.append(1)
        return user_ids,user_behaviors,movie_ids,movie_titles,movie_genres,labels


def data_preprocess():
    ratings_path=os.path.join(DatasetCFG.data_root,'ratings.csv')
    movies_path=os.path.join(DatasetCFG.data_root,'movies.csv')
    ratings_data=pd.read_csv(ratings_path)
    movies_data=pd.read_csv(movies_path)
    
    # random_user_ids=np.random.choice(ratings_data['userId'].unique(), 
    #                             size=int(len(ratings_data['userId'].unique())*0.001), 
    #                             replace=False)
    # ratings_data=ratings_data[ratings_data['userId'].isin(random_user_ids)]      


    ratings_data=ratings_data.merge(movies_data,on='movieId')
    ratings_data['rank_latest'] = ratings_data.groupby(['userId'])['timestamp'].rank(method='first', ascending=True)
    
    ratings_data=ratings_data.sort_values(['userId','rank_latest'],ascending=[True,False]).reset_index(drop=True)
    
    for _, group in tqdm(ratings_data.groupby('userId'),total=len(ratings_data['userId'].unique())):
        user_behavior_list = []
        for _, row in group.iterrows():
            user_behavior=" ".join(map(str,group[group['rank_latest'] < row['rank_latest']]['movieId'].to_list()))
            if user_behavior=='':
                user_behavior=' '
            user_behavior_list.append(user_behavior)
        ratings_data.loc[group.index,'user_behavior'] = user_behavior_list
        
    ratings_data.to_csv(os.path.join(DatasetCFG.data_root,'ratings_data_process_1.csv'),index=False)     

In [181]:
class Net(nn.Module):
    def __init__(self,device):
        self.device = device
        self.embeddings =SentenceTransformer('models/all_datasets_v4_MiniLM-L6')
        self.user_fc1=nn.Linear(384*2,512)
        self.user_fc2=nn.Linear(512,256)
        
        self.movie_fc1=nn.Linear(384*3,512)
        self.movie_fc2=nn.Linear(512,256)
        self.relu=nn.ReLU()
        
        self.to(device)
        
    def forward(self,user_id,user_behavior,movie_id,movie_title,movie_genre):
        user_id_embedding=torch.tensor(self.embeddings.encode(str(user_id)))
        user_behavior_embedding=torch.tensor(self.embeddings.encode(" ".join(user_behavior)))
        movie_id_embedding=torch.tensor(self.embeddings.encode(str(movie_id)))
        movie_title_embedding=torch.tensor(self.embeddings.encode(movie_title))
        movie_genre_embedding=torch.stack([torch.tensor(self.embeddings.encode(x)) for x  in movie_genre]).mean(dim=0)
        
        user_embedding=torch.cat([user_id_embedding,user_behavior_embedding],dim=0)
        movie_embedding=torch.cat([movie_id_embedding,movie_title_embedding,movie_genre_embedding],dim=0)
        user_embedding=user_embedding.to(self.device)
        movie_embedding=movie_embedding.to(self.device)
        
        user_out=self.user_fc1(user_embedding)
        user_out=self.relu(user_out)
        user_out=self.user_fc2(user_out)
        
        movie_out=self.movie_fc1(movie_embedding)
        movie_out=self.relu(movie_out)
        movie_out=self.movie_fc2(movie_out)
        
        return torch.matmul(user_out,movie_out.T)

In [182]:
data_preprocess()

  ratings_data.loc[group.index,'user_behavior'] = user_behavior_list
100%|██████████| 162541/162541 [2:13:41<00:00, 20.26it/s]   


In [183]:
model=SentenceTransformer('models/all_datasets_v4_MiniLM-L6')
text1='12 2 3 4 5 6'
encode1=torch.tensor(model.encode(text1))
# display(encode1)
text2='1 2 3 4 5 6'
encode2=torch.tensor(model.encode(text2))
c=torch.stack([encode1,encode2]).mean(dim=0)
display(c.shape)
# display(torch.cat([encode1,encode2],dim=0))

# display(F.cosine_similarity(encode1,encode2,dim=0))

torch.Size([384])