In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import json
import numpy as np
import torch

In [31]:
# Cap on the number of rating rows read from disk.
# `read_csv` documents `nrows` as an integer, so use an int literal instead of
# the float 2e5 (which relies on pandas accepting integral floats).
nrows = 200_000
# nrows = 1_000_000_000
ratings = pd.read_csv('data/ratings.csv', nrows=nrows) \
        .drop('timestamp', axis=1)

In [32]:
# Load the movie metadata and discard the columns that are never used below.
metadata = pd.read_csv('data/movies_metadata.csv') \
        .drop(["homepage", "imdb_id", "poster_path", "production_countries", "status", "tagline", "original_title", "video"], axis=1)

# We have to cast the id to int so that we can merge the dataframes.
# Non-numeric ids are coerced to NaN and those rows are removed from the
# *frame* before casting. Dropping them only from the Series handed to
# `assign` does not work: `assign` aligns on the index, which puts NaN back
# and leaves `id` as float, so the merge below compares int keys with
# float keys.
metadata = (
    metadata
    .assign(id=pd.to_numeric(metadata['id'], errors='coerce'))
    .dropna(subset=['id'])
    .astype({'id': int})
)
# NOTE(review): this joins ratings.movieId directly against metadata.id;
# confirm that both columns use the same id space.
ratings_with_metadata = ratings.merge(
    metadata, left_on='movieId', right_on='id')

  exec(code_obj, self.user_global_ns, self.user_ns)
  if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():


In [4]:
# make a random split of ratings; the fixed seed keeps the split (and every
# metric computed from it) reproducible across kernel restarts
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

In [None]:
# (userId, movieId) pairs become LongTensors and the ratings become default
# (float) Tensors, for both halves of the split.
id_columns = ['userId', 'movieId']
X_train, X_test = (
    torch.LongTensor(part[id_columns].to_numpy()) for part in (train, test)
)
y_train, y_test = (
    torch.Tensor(part['rating'].to_numpy()) for part in (train, test)
)

In [None]:
from zero.als import MangakiALS
from zero.svd import MangakiSVD

# Both models are sized as "largest id + 1". Take the largest id over BOTH
# splits: sizing from the training pairs alone leaves any larger id that only
# occurs in the test pairs outside the configured range when predicting.
# NOTE(review): how the Mangaki models react to out-of-range ids is not
# visible from this notebook.
nb_users = int(max(X_train[:, 0].max(), X_test[:, 0].max())) + 1
nb_works = int(max(X_train[:, 1].max(), X_test[:, 1].max())) + 1

# --- SVD baseline (fed the torch tensors directly) ---
svd = MangakiSVD()
svd.nb_users = nb_users
svd.nb_works = nb_works
svd.fit(X_train, y_train)

y_pred = svd.predict(X_test)

print(mean_squared_error(y_test, y_pred))


# --- ALS baseline (fed numpy views of the same tensors) ---
als = MangakiALS()
als.nb_users = nb_users
als.nb_works = nb_works
als.fit(X_train.detach().numpy(), y_train.detach().numpy())

y_pred = als.predict(X_test.detach().numpy())

print(mean_squared_error(y_test, y_pred))


In [5]:
def preprocess_text(data):
    """Return ``data`` with missing entries replaced by ``''`` and every value cast to ``str``."""
    without_missing = data.fillna('')
    return without_missing.astype(str)

def convert_numeric(data):
    """Coerce ``data`` to floats.

    The strings ``'True'`` / ``'False'`` are mapped to 1 / 0 first; after
    that ``pd.to_numeric`` is applied with ``errors='coerce'``, so anything
    that cannot be parsed becomes NaN. The result is cast to ``float``.
    """
    flags_as_ints = data.replace({'True': 1, 'False': 0})
    numeric = flags_as_ints.apply(pd.to_numeric, errors='coerce')
    return numeric.astype(float)


def parse_genres(json_str):
    """Extract the ``name`` of every entry in a stringified list of dicts.

    The value is parsed as a Python literal first (e.g.
    ``"[{'id': 18, 'name': 'Drama'}]"``), which also copes with names that
    contain an apostrophe. If that fails, the previous strategy is used as a
    fallback: swap single for double quotes and parse as JSON.

    Parameters
    ----------
    json_str : Any
        The raw cell value. Non-string values (e.g. NaN) yield ``[]``.

    Returns
    -------
    list
        The ``name`` values in order; ``[]`` when the value cannot be parsed
        or is not a list. Entries that are not dicts or that lack a ``name``
        key are skipped.
    """
    import ast  # local import: only this helper needs it

    if not isinstance(json_str, str):
        return []

    try:
        entries = ast.literal_eval(json_str)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. real JSON using null/true/false).
        try:
            entries = json.loads(json_str.replace("'", "\""))
        except json.JSONDecodeError:
            return []

    if not isinstance(entries, list):
        return []
    return [entry['name'] for entry in entries
            if isinstance(entry, dict) and 'name' in entry]

def parse_json_df(data):
    """Apply ``parse_genres`` to ``data`` and return the result as a numpy array."""
    parsed = data.apply(parse_genres)
    return parsed.to_numpy()


def binarize_genres(genres_list):
    """Multi-hot encode a collection of label lists.

    A new ``MultiLabelBinarizer`` is fitted on every call, so the set of
    output columns is determined by the labels present in ``genres_list``.
    """
    return MultiLabelBinarizer().fit_transform(genres_list)


def date_transformer(data):
    """Split ``YYYY-MM-DD`` date strings into year / month / day columns.

    Missing values are replaced by ``'1900-01-01'`` before parsing; values
    that do not match the format are coerced to NaT by ``pd.to_datetime``.

    Returns a DataFrame with the columns ``release_year``,
    ``release_month`` and ``release_day``.
    """
    filled = data.fillna('1900-01-01')
    parsed = pd.to_datetime(filled, format="%Y-%m-%d", errors='coerce')
    components = {
        'release_year': 'year',
        'release_month': 'month',
        'release_day': 'day',
    }
    return pd.DataFrame(
        {column: getattr(parsed.dt, part) for column, part in components.items()}
    )


# Free-text column -> TF-IDF features: blank out missing values, cast to
# str, then vectorise with English stop words removed and at most 1000 terms.
text_pipeline = Pipeline(steps=[
    (
        'to_string',
        FunctionTransformer(preprocess_text, validate=False),
    ),
    (
        'vectorize',
        TfidfVectorizer(stop_words='english', max_features=1000),
    ),
])

# Scaled numeric movie features followed by the untouched `rating` column;
# every other column of the input frame is dropped.
numerical_preprocessor = ColumnTransformer(
    [
        (
            'num',
            Pipeline(steps=[
                ('to_float', FunctionTransformer(convert_numeric, validate=False)),
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
            ]),
            ['budget', 'popularity', 'revenue', 'runtime', 'vote_count'],
        ),
        ('passthrough', 'passthrough', ['rating']),

        # Disabled transformers, kept for reference:
        # ('text_overview', text_pipeline, 'overview'),
        #
        # This one is very inefficient
        # ('json_genres', Pipeline([
        #     ('extractor', FunctionTransformer(parse_json_df, validate=False)),
        #     ('binarizer', FunctionTransformer(binarize_genres, validate=False))
        # ]), 'genres'),
        # ('release_date', FunctionTransformer(
        #     date_transformer, validate=False), 'release_date'),
    ],
    remainder='drop',
)


In [None]:
# Fit the numeric preprocessor on the merged frame and build the feature
# matrix. The passthrough `rating` transformer is listed last in
# `numerical_preprocessor`, and the next cell slices the last column off as
# the target.
numerical_data = numerical_preprocessor.fit_transform(ratings_with_metadata)

In [None]:
# build an explainable decision tree model
from sklearn.tree import DecisionTreeRegressor

# Every column but the last is a feature; the last column is the rating.
X, Y = numerical_data[:, :-1], numerical_data[:, -1]

# Fixed seeds so the split, the fitted tree and the reported MSE are
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

model = DecisionTreeRegressor(
    max_depth=4, min_samples_split=1000, min_samples_leaf=1000,
    random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(mean_squared_error(y_test, y_pred))

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# The fitted decision tree from the previous cell.
tree = model

# Visualize the tree
plt.figure(figsize=(20, 10))
# The tree was fitted on the columns produced by the 'num' pipeline of
# `numerical_preprocessor` (the rating column was sliced off as the target),
# so exactly these five names, in this order, label the features. The
# previous 11-name list did not match the matrix and mislabelled every node.
plot_tree(tree, filled=True, max_depth=3, feature_names=[
    'budget', 'popularity', 'revenue', 'runtime', 'vote_count',
])
plt.show()

In [None]:
import sys
import os
import os.path

# Publish the libFM binary directory (located under the user's home) through
# the LIBFM_PATH environment variable.
libfm_bin_dir = os.path.expanduser('~/libfm/bin/')
os.environ['LIBFM_PATH'] = libfm_bin_dir

In [None]:
# One-hot encode the user and movie ids and keep `rating` as the final
# column, so the target can be sliced off the matrix afterwards.
fm_preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(), ['userId', 'movieId']),
        ('passthrough', 'passthrough', ['rating']),
    ],
    remainder='drop',
)

fm_data = fm_preprocessor.fit_transform(ratings_with_metadata)

In [None]:
import pywFM
!source "$HOME/.zshrc"

X, Y = fm_data[:, :-1], fm_data[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

fm = pywFM.FM(task='regression', num_iter=100,
              init_stdev=0.1, learning_method='mcmc')

prediction, global_bias, weights, pairwise, rlog = fm.run(X_train, y_train, X_test, y_test)

In [None]:
def historical_ratings_encoder(df):
    """Attach to every row the movies its user has rated.

    ``df`` must provide the columns ``userId`` and ``movieId``. The movie ids
    are one-hot encoded, summed per user, and the per-user sums are joined
    back onto each row of that user.

    Returns a DataFrame with one ``rated:movieId_<id>`` column per distinct
    movie id and the same rows as ``df``. The input frame is not modified.
    """
    # Work on a copy so the caller's frame stays untouched.
    frame = df.copy()

    # One-hot encode movieId only and keep just the dummy columns.
    movie_dummies = pd.get_dummies(frame, columns=['movieId']) \
        .filter(regex='movieId_.*')

    # Per-user sum of the dummies.
    per_user = movie_dummies.groupby(frame['userId']).sum()
    per_user = per_user.rename(columns=lambda name: 'rated:' + name)
    # Name the index so the join key is explicit.
    per_user.index.name = 'userId'

    joined = frame.join(per_user, on='userId')

    # The joined history columns are the trailing ones; return only those.
    n_history_columns = movie_dummies.shape[1]
    return joined.iloc[:, -n_history_columns:]


# Full FM feature set: one-hot ids, scaled numeric metadata, multi-hot
# genres and per-user rating history, with `rating` kept as the final
# column. Every other column of the input frame is dropped.
fm_preprocessor_full = ColumnTransformer(
    [
        ('cat', OneHotEncoder(), ['userId', 'movieId']),
        (
            'num',
            Pipeline(steps=[
                ('to_float', FunctionTransformer(convert_numeric, validate=False)),
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
            ]),
            ['budget', 'popularity', 'revenue', 'runtime', 'vote_count'],
        ),
        (
            'json_genres',
            Pipeline(steps=[
                ('extractor', FunctionTransformer(parse_json_df, validate=False)),
                ('binarizer', FunctionTransformer(binarize_genres, validate=False)),
            ]),
            'genres',
        ),
        (
            'historical',
            FunctionTransformer(historical_ratings_encoder),
            ['userId', 'movieId'],
        ),
        ('passthrough', 'passthrough', ['rating']),
    ],
    remainder='drop',
)

fm_data_full = fm_preprocessor_full.fit_transform(ratings_with_metadata)

In [None]:
# Every column but the last is a feature; the last column is the rating.
X, Y = fm_data_full[:, :-1], fm_data_full[:, -1]

# Fixed seed so the split (and the reported error) is reproducible and
# comparable with the other FM runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

fm = pywFM.FM(task='regression', num_iter=100,
              init_stdev=0.1, learning_method='mcmc')

prediction, global_bias, weights, pairwise, rlog = fm.run(
    X_train, y_train, X_test, y_test)

In [None]:
# results
# (43797, 3610) - raw 100 epochs 0.86
# (43797, 6244) - history + genre info 100 epochs 0.88
# (43797, 6230) - history + numericals 100 epochs 0.86
# (43797, X)    - history + genre + numericals 100 epochs 0.87

# 
# (437491, 20121) - history + numericals 100 epochs 0.86

In [33]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Start from the merged frame without the columns DeepFM does not use.
deep_fm_data = ratings_with_metadata.drop(
    ['id', 'original_language', 'overview', 'title', 'belongs_to_collection', 'revenue', 'budget', 'adult'], axis=1)

# The list-valued metadata columns are stored as stringified lists of dicts;
# keep only the `name` of every entry.
for list_col in ['genres', 'spoken_languages', 'production_companies']:
    deep_fm_data[list_col] = deep_fm_data[list_col].apply(parse_genres)

# deep_fm_data['genres'] = deep_fm_data['genres'].apply(lambda x: ', '.join(x))
# deep_fm_data['production_companies'] = deep_fm_data['production_companies'].apply(
#     lambda x: ', '.join(x))
# deep_fm_data['spoken_languages'] = deep_fm_data['spoken_languages'].apply(
#     lambda x: ', '.join(x))


# Standardise the numeric columns, then fill whatever is still missing with
# the (scaled) column median. All four columns are imputed: previously only
# `popularity` and `runtime` were, so a missing `vote_average` or
# `vote_count` would have reached the model as NaN.
scaler = StandardScaler()
imputer = SimpleImputer(strategy='median')
numerical_cols = ['popularity', 'vote_average', 'vote_count', 'runtime']
deep_fm_data[numerical_cols] = imputer.fit_transform(
    scaler.fit_transform(deep_fm_data[numerical_cols]))

# Release year as a float feature. Dates that are missing or cannot be
# parsed are coerced to NaT (as `date_transformer` does) and fall back to
# 1900, so no further imputation is needed for this column.
# NOTE(review): unlike the other dense features this one is not
# standardised; consider scaling it before feeding it to the network.
deep_fm_data['release_year'] = pd.to_datetime(
    deep_fm_data['release_date'], errors='coerce').dt.year.fillna(1900).astype(float)

# Reassign instead of dropping in place so re-running the cell stays idempotent.
deep_fm_data = deep_fm_data.drop(columns=['release_date'])

 

# label_encoders = {}
# for col in ['genres', 'production_companies', 'spoken_languages']:
#     le = LabelEncoder()
#     # Convert list to comma-separated string if not already
#     deep_fm_data[col] = deep_fm_data[col].apply(lambda x: ','.join(x))
#     deep_fm_data[col] = le.fit_transform(deep_fm_data[col])
#     label_encoders[col] = le

In [34]:
from deepctr_torch.inputs import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
from keras.utils import pad_sequences
from deepctr_torch.models import DeepFM


def split(x):
    """Encode the keys in ``x`` as integer ids using the global ``key2index``.

    Keys not seen before are appended to ``key2index`` with the next free id.
    Ids start at 1: input value 0 is a special "padding" value, so it is
    never used to encode a valid key of a sequence input.
    """
    for key in x:
        # The default is evaluated before insertion, i.e. with the current size.
        key2index.setdefault(key, len(key2index) + 1)
    return [key2index[key] for key in x]


sparse_features = ["movieId", "userId"]
list_features = ['genres', 'production_companies', 'spoken_languages']
dense_features = ["runtime", "vote_average", "popularity", "vote_count", "release_year"]
target = ['rating']

# 1. Label-encode the id columns in place.
for feat in sparse_features:
    lbe = LabelEncoder()
    deep_fm_data[feat] = lbe.fit_transform(deep_fm_data[feat])

# Fixed-length inputs are passed to the model as the frame columns themselves.
model_input = {}
for name in sparse_features + dense_features:
    model_input[name] = deep_fm_data[name]

# Variable-length inputs: encode every list, pad to the longest one.
varlen_feature_columns = []
for list_feature in list_features:
    # `split` reads and extends the module-level `key2index`, so a fresh
    # vocabulary is started for each list feature.
    key2index = {}
    encoded_lists = [split(names) for names in deep_fm_data[list_feature].values]
    max_len = np.array([len(ids) for ids in encoded_lists]).max()
    # Notice : padding=`post`
    padded_lists = pad_sequences(encoded_lists, maxlen=max_len, padding='post')
    model_input[list_feature] = padded_lists
    vocabulary = SparseFeat(
        list_feature, vocabulary_size=len(key2index) + 1, embedding_dim=4)
    varlen_feature_columns.append(
        VarLenSparseFeat(vocabulary, maxlen=max_len, combiner='mean'))

# 2.count #unique features for each sparse field
sparse_columns = [
    SparseFeat(feat, deep_fm_data[feat].nunique() + 1, embedding_dim=4)
    for feat in sparse_features
]
dense_columns = [DenseFeat(feat, 1) for feat in dense_features]
fixlen_feature_columns = sparse_columns + dense_columns

# The linear and the DNN part use the same set of columns.
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [40]:
# 4.Define Model,train,predict and evaluate
# Use the first CUDA device when requested and available, otherwise the CPU.
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'
else:
    device = 'cpu'

model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    task='regression',
    device=device,
    dnn_hidden_units=(64, 64),
    dnn_dropout=0.2,
    l2_reg_linear=1e-0,
    l2_reg_embedding=1e-0,
    l2_reg_dnn=1e-0,
)

model.compile("adam", "mse", metrics=['mse'])

# NOTE(review): check how `validation_split` selects its hold-out rows; if it
# takes the tail of the data without shuffling, the validation set follows
# the row order of `deep_fm_data`.
history = model.fit(
    model_input,
    deep_fm_data[target].values,
    batch_size=256,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

cpu
Train on 69571 samples, validate on 17393 samples, 272 steps per epoch
Epoch 1/10
5s - loss:  1.1270 - mse:  1.1086 - val_mse:  1.1287
Epoch 2/10
5s - loss:  0.9391 - mse:  0.8804 - val_mse:  1.0784
Epoch 3/10
4s - loss:  0.9326 - mse:  0.8580 - val_mse:  1.0684
Epoch 4/10
3s - loss:  0.9258 - mse:  0.8455 - val_mse:  1.0901
Epoch 5/10
3s - loss:  0.9310 - mse:  0.8474 - val_mse:  1.1037
Epoch 6/10
4s - loss:  0.9359 - mse:  0.8521 - val_mse:  1.1777
Epoch 7/10
5s - loss:  0.9300 - mse:  0.8446 - val_mse:  1.0727
Epoch 8/10
4s - loss:  0.9185 - mse:  0.8328 - val_mse:  1.0602
Epoch 9/10
4s - loss:  0.9248 - mse:  0.8388 - val_mse:  1.0602
Epoch 10/10
4s - loss:  0.9230 - mse:  0.8365 - val_mse:  1.0994
