In [1]:
import os
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)

from utils import read_dataframe, get_dummy_df, get_tensor_dataset
from model.fm import FMModel

# --- Configuration -----------------------------------------------------------
BATCH_SIZE = 32  # passed to get_tensor_dataset when the loaders are built
DATA_DIR = Path('../data/ml-1m/')
# Column names supplied to read_dataframe for each of the three .dat files.
UNAMES = ['user_id', 'gender', 'age', 'occupation', 'zip']
RNAMES = ['user_id', 'movie_id', 'rating', 'timestamp']
MNAMES = ['movie_id', 'title', 'genres']

# --- Load and join -----------------------------------------------------------
user_df = read_dataframe(DATA_DIR / 'users.dat', sep='::', names=UNAMES)
rate_df = read_dataframe(DATA_DIR / 'ratings.dat', sep='::', names=RNAMES)
movie_df = read_dataframe(DATA_DIR / 'movies.dat', sep='::', names=MNAMES)
# users -> ratings on user_id, then -> movies on movie_id (pd.merge defaults
# to an inner join, so rows without a match on either key are dropped).
df = pd.merge(pd.merge(user_df, rate_df, on='user_id'), movie_df, on='movie_id')

# --- Binarise the target -----------------------------------------------------
# Discard the neutral rating 3. The explicit .copy() makes `df` an independent
# frame so the column assignment below never writes into a slice of the merged
# frame (avoids pandas' chained-assignment / SettingWithCopy hazard).
df = df[df['rating'] != 3].copy()
# Ratings above 3 become 0, everything else becomes 1 (same mapping as the
# previous per-row lambda, expressed as a vectorised comparison).
# NOTE(review): this makes *low* ratings the positive class, which is the
# reverse of the usual "liked = 1" convention -- confirm this is intended.
df['rating'] = (~df['rating'].gt(3)).astype('int64')

  from .autonotebook import tqdm as notebook_tqdm
  return func(*args, **kwargs)


In [2]:
# The merged dataset is large, so down-sample it for this experiment.
# Keep only movies with more than 20 and fewer than 35 ratings
# (both bounds exclusive).
movie_group = df.groupby('movie_id').size()
movie_list = movie_group[(movie_group > 20) & (movie_group < 35)].index.values
print(f"length of movie list:{len(movie_list)}")
df = df[df['movie_id'].isin(movie_list)]

# Among the remaining rows, keep only users with more than 20 ratings.
# The counts are taken *after* the movie filter above, so order matters.
user_group = df.groupby('user_id').size()
user_list = user_group[user_group > 20].index.values
# Label fixed: this line reports the user list, not the movie list.
print(f"length of user list:{len(user_list)}")
df = df[df['user_id'].isin(user_list)]

length of movie list:322
length of movie list:37


In [3]:
# Encode the two ID columns with the project helper get_dummy_df
# (presumably one-hot encoding, judging by its name -- confirm in utils).
dummy_cols = ['user_id', 'movie_id']
df = get_dummy_df(df, dummy_cols)

# Drop the side-information columns; only the encoded IDs and the target
# are kept for the model.
df = df.drop(
    columns=['timestamp', 'gender', 'age', 'occupation', 'zip', 'title', 'genres']
)

# Persist the model-ready table once. The parent directory is created first so
# a fresh checkout does not fail with an OSError inside to_csv.
# NOTE(review): the file is only written when it does not exist yet, so it is
# never refreshed if the filtering above changes -- delete it to regenerate.
fm_data_path = Path('../data/user_data/fm_data.csv')
if not fm_data_path.exists():
    fm_data_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(fm_data_path, index=False)

# Split off the target. pop() removes the column from df in place, so X is the
# same object as df, now holding only feature columns.
y = df.pop('rating')
X = df

In [4]:
# Fixed seed so the 80/20 split (and therefore the reported metrics) can be
# reproduced after Restart Kernel -> Run All.
RANDOM_STATE = 42
trn_x, tst_x, trn_y, tst_y = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
# Wrap the raw numpy arrays with the project helper; both loaders use the same
# batch size.
train_loader = get_tensor_dataset(trn_x.values, trn_y.values, BATCH_SIZE)
test_loader = get_tensor_dataset(tst_x.values, tst_y.values, BATCH_SIZE)

In [5]:
# The model is sized from the width of the feature matrix.
# NOTE(review): the second positional argument (5) is presumably the latent
# factor dimension -- confirm against FMModel in model/fm.py.
n_features = X.shape[1]
model = FMModel(n_features, 5)
model.fit(
    train_loader,
    n_epochs=100,
    lr=0.01,
)
# Evaluate on the training loader first, then on the held-out loader. In the
# recorded run both results are printed under the same "TEST RESULT" label, so
# the first printed AUC is the training score and the second is the test score.
model.eval(train_loader)
model.eval(test_loader)

EPOCH: 10 || TRAIN LOSS: 10.1202
EPOCH: 20 || TRAIN LOSS: 4.5611
EPOCH: 30 || TRAIN LOSS: 2.1174
EPOCH: 40 || TRAIN LOSS: 1.0628
EPOCH: 50 || TRAIN LOSS: 0.5970
EPOCH: 60 || TRAIN LOSS: 0.3659
EPOCH: 70 || TRAIN LOSS: 0.2396
EPOCH: 80 || TRAIN LOSS: 0.1648
EPOCH: 90 || TRAIN LOSS: 0.1176
EPOCH: 100 || TRAIN LOSS: 0.0863
TEST RESULT || AUC: 1.0 ||
TEST RESULT || AUC: 0.7283644681662218 ||
