In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
np.random.seed(42)

In [3]:
class DatasetCFG:
    """Static configuration for dataset preparation; read as class attributes."""

    # Directory (relative to the notebook) that holds the raw CSV files and
    # receives the processed CSV outputs.
    data_root = 'ml-latest'
    # Not used in the visible cells — presumably the number of users to
    # sample for training; TODO confirm against the consumer.
    user_chosen_num = 1_000
    # Not used in the visible cells — presumably negatives sampled per
    # positive example; TODO confirm against the consumer.
    num_negatives = 4

### 获取训练数据


In [4]:
def _history_strings(movie_ids):
    """Return, for every position, the space-joined ids that come after it.

    ``movie_ids`` must hold one user's movie ids ordered newest-first, so the
    ids after position ``i`` are exactly the movies rated before movie ``i``.
    An empty history is encoded as a single space ``' '`` instead of ``''``
    (presumably so the value is not read back from CSV as NaN — confirm).
    """
    tokens = [str(movie_id) for movie_id in movie_ids]
    return [" ".join(tokens[pos + 1:]) or " " for pos in range(len(tokens))]


def data_preprocess():
    """Build per-rating user histories and write ``ratings_data_process_1.csv``.

    Reads ``ratings.csv`` and ``movies.csv`` from ``DatasetCFG.data_root``,
    inner-joins them on ``movieId``, ranks each user's ratings by timestamp
    (``rank_latest``: 1 = oldest) and sorts every user's rows newest-first.
    For each rating a ``user_behavior`` string is added: the movie ids the
    same user rated earlier, space-separated and ordered newest-first, or
    ``' '`` when there are none. The result is written without the index.

    Returns:
        None. The only output is the CSV file.
    """
    ratings_path = os.path.join(DatasetCFG.data_root, 'ratings.csv')
    movies_path = os.path.join(DatasetCFG.data_root, 'movies.csv')
    ratings_data = pd.read_csv(ratings_path)
    movies_data = pd.read_csv(movies_path)

    # Optional user subsampling, left disabled as in the original cell:
    # random_user_ids=np.random.choice(ratings_data['userId'].unique(),
    #                             size=int(len(ratings_data['userId'].unique())*0.001),
    #                             replace=False)
    # ratings_data=ratings_data[ratings_data['userId'].isin(random_user_ids)]

    ratings_data = ratings_data.merge(movies_data, on='movieId')
    # method='first' breaks timestamp ties by row order, so ranks are unique
    # within a user.
    ratings_data['rank_latest'] = ratings_data.groupby(
        ['userId'])['timestamp'].rank(method='first', ascending=True)

    # Newest rating first within each user; rows with a NaN rank sort last.
    ratings_data = ratings_data.sort_values(['userId', 'rank_latest'], ascending=[
                                            True, False]).reset_index(drop=True)

    for _, group in tqdm(ratings_data.groupby('userId'),
                         total=ratings_data['userId'].nunique()):
        # Because the group is already sorted by descending unique rank, the
        # rows with a smaller rank than row i are exactly the ranked rows
        # after it. Slicing the id list replaces the per-row DataFrame filter
        # (which made the loop quadratic in DataFrame operations).
        has_rank = group['rank_latest'].notna()
        user_behavior_list = _history_strings(
            group.loc[has_rank, 'movieId'].to_list())
        # Rows without a rank (missing timestamp) never compare as "earlier"
        # or "later": they get an empty history and appear in no history.
        user_behavior_list.extend([' '] * int((~has_rank).sum()))
        ratings_data.loc[group.index, 'user_behavior'] = user_behavior_list

    ratings_data.to_csv(os.path.join(DatasetCFG.data_root,
                        'ratings_data_process_1.csv'), index=False)


def data_process_with_genome():
    """Attach per-movie genome tag relevances and write ``ratings_data_process_2.csv``.

    Loads ``ratings_data_process_1.csv``, ``genome-scores.csv`` and
    ``genome-tags.csv`` from ``DatasetCFG.data_root``. The scores are joined
    with the tags on ``tagId`` and pivoted to one row per ``movieId`` with one
    column per ``tag`` holding its ``relevance``. That wide table is then
    inner-joined onto the ratings by ``movieId`` (ratings whose movie has no
    genome row are dropped by the inner join) and saved without the index.

    Returns:
        None. The only output is the CSV file.
    """
    root = DatasetCFG.data_root

    ratings = pd.read_csv(os.path.join(root, 'ratings_data_process_1.csv'))
    scores = pd.read_csv(os.path.join(root, 'genome-scores.csv'))
    tags = pd.read_csv(os.path.join(root, 'genome-tags.csv'))

    # Long (movieId, tagId, relevance) -> wide (movieId, <tag name>...) table.
    tag_matrix = (
        scores
        .merge(tags, on='tagId')
        .pivot(index='movieId', columns='tag', values='relevance')
        .reset_index()
    )

    enriched = ratings.merge(tag_matrix, on='movieId')
    out_path = os.path.join(root, 'ratings_data_process_2.csv')
    enriched.to_csv(out_path, index=False)

In [5]:
# Quick inspection of the raw CSV tables under DatasetCFG.data_root.
# `df` is rebound for each table in turn, so after this cell it refers to
# the movies frame.
# data_preprocess()  (call left commented out, as in the original cell)
raw_dir = DatasetCFG.data_root

# First 30 rating rows.
df = pd.read_csv(os.path.join(raw_dir, 'ratings.csv'))
ratings_preview = df.head(30)
display(ratings_preview)

# Summary statistics of the per-movie row counts in the genome scores.
df = pd.read_csv(os.path.join(raw_dir, 'genome-scores.csv'))
rows_per_movie = df.groupby('movieId').count()
display(rows_per_movie.describe())

# Summary statistics of the numeric columns of the movies table.
df = pd.read_csv(os.path.join(raw_dir, 'movies.csv'))
movies_summary = df.describe()
display(movies_summary)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
5,1,381,3.5,1225734105
6,1,596,4.0,1225733524
7,1,1036,5.0,1225735626
8,1,1049,3.0,1225734079
9,1,1066,4.0,1225736961


Unnamed: 0,tagId,relevance
count,16376.0,16376.0
mean,1128.0,1128.0
std,0.0,0.0
min,1128.0,1128.0
25%,1128.0,1128.0
50%,1128.0,1128.0
75%,1128.0,1128.0
max,1128.0,1128.0


Unnamed: 0,movieId
count,86537.0
mean,155932.817096
std,78037.145347
min,1.0
25%,111443.0
50%,164574.0
75%,211225.0
max,288983.0
