Skip to content

Commit

Permalink
Merge pull request #54 from ahuds001/add_user_metadata_cont
Browse files Browse the repository at this point in the history
Add user metadata cont
  • Loading branch information
nathancooperjones committed Aug 20, 2022
2 parents 48b6487 + 4b89de0 commit 0e3789b
Show file tree
Hide file tree
Showing 14 changed files with 2,074 additions and 579 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project uses [Semantic Versioning](http://semver.org/).

# [1.3.0] - 2022-8-20
### Added
- ``HybridModel`` and ``HybridPretrainedModel`` now take additional optional parameters ``user_metadata`` and ``user_metadata_layers_dims``
- ``get_data.py`` now includes ``read_movielens_df_user`` and ``get_user_metadata``

### Changed
- Added ``item_metadata_layers_dims`` and ``user_metadata_layers_dims`` parameters to ``HybridPretrainedModel`` and ``HybridModel`` and removed ``metadata_layers_dims``
- Updated notebooks and examples to include usage of ``user_metadata``

# [1.2.2] - 2022-7-14
### Fixed
- a ``ValueError`` is now raised when ``item_metadata`` contains nulls
Expand Down
2 changes: 1 addition & 1 deletion collie/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.2.2'
__version__ = '1.3.0'
9 changes: 8 additions & 1 deletion collie/model/base/base_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,12 +211,19 @@ def __init__(self,
self.hparams.num_epochs_completed = 0

# ensure there are no nulls in ``item_metadata``
if 'item_metadata' in kwargs:
if kwargs.get('item_metadata') is not None:
if torch.isnan(kwargs.get('item_metadata')).any():
raise ValueError(
'``item_metadata`` may not contain nulls'
)

# ensure there are no nulls in ``user_metadata``
if kwargs.get('user_metadata') is not None:
if torch.isnan(kwargs.get('user_metadata')).any():
raise ValueError(
'``user_metadata`` may not contain nulls'
)

self._configure_loss()

# check weight decay and sparsity
Expand Down
346 changes: 271 additions & 75 deletions collie/model/hybrid_matrix_factorization.py

Large diffs are not rendered by default.

318 changes: 256 additions & 62 deletions collie/model/hybrid_pretrained_matrix_factorization.py

Large diffs are not rendered by default.

93 changes: 93 additions & 0 deletions collie/movielens/get_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,48 @@ def read_movielens_df_item() -> pd.DataFrame:
return df_item


def read_movielens_df_user() -> pd.DataFrame:
    """
    Load the MovieLens 100K ``u.user`` file into a DataFrame.

    The data path directories are prepared first. If ``u.user`` is not found under
    ``$DATA_PATH/ml-100k``, the MovieLens 100K dataset is downloaded before reading.

    See the MovieLens 100K README for additional information on the dataset:
    https://files.grouplens.org/datasets/movielens/ml-100k-README.txt

    Returns
    -------
    df_user: pd.DataFrame
        MovieLens 100K ``u.user`` containing columns:

            * user_id

            * age

            * gender

            * occupation

            * zip

    Side Effects
    ------------
    Creates directory at ``$DATA_PATH/ml-100k`` and downloads data files if data does not exist.

    """
    _make_data_path_dirs_if_not_exist()

    user_file = os.path.join(DATA_PATH, 'ml-100k', 'u.user')
    user_file_missing = not Path(user_file).exists()
    if user_file_missing:
        _download_movielens_100k()

    # ``u.user`` is pipe-delimited and ships without a header row, so names are supplied here
    return pd.read_csv(
        user_file,
        sep='|',
        encoding='latin-1',
        names=['user_id', 'age', 'gender', 'occupation', 'zip'],
    )


def _make_data_path_dirs_if_not_exist() -> None:
"""Get path to the movielens dataset file."""
if not DATA_PATH.exists():
Expand Down Expand Up @@ -258,3 +300,54 @@ def get_movielens_metadata(df_item: pd.DataFrame = None) -> pd.DataFrame:
metadata_df = metadata_df[cols]

return metadata_df


def get_user_metadata(df_user: pd.DataFrame = None) -> pd.DataFrame:
    """
    Return MovieLens 100K user metadata as a DataFrame.

    The returned DataFrame keeps ``age`` as-is, encodes ``gender`` as ``1`` for ``'F'`` and ``0``
    for ``'M'``, and one-hot encodes ``occupation`` into ``occupation_*`` columns sorted by name.
    On the full MovieLens 100K ``u.user`` data, the DataFrame returned has the following column
    order:

    .. code-block:: python

        [
            'age', 'gender', 'occupation_administrator', 'occupation_artist',
            'occupation_doctor', 'occupation_educator', 'occupation_engineer',
            'occupation_entertainment', 'occupation_executive',
            'occupation_healthcare', 'occupation_homemaker',
            'occupation_lawyer', 'occupation_librarian', 'occupation_marketing',
            'occupation_none', 'occupation_other', 'occupation_programmer',
            'occupation_retired', 'occupation_salesman', 'occupation_scientist',
            'occupation_student', 'occupation_technician', 'occupation_writer',
        ]

    See the MovieLens 100K README for additional information on the dataset:
    https://files.grouplens.org/datasets/movielens/ml-100k-README.txt

    Parameters
    ----------
    df_user: pd.DataFrame
        DataFrame of MovieLens 100K ``u.user`` containing columns of user
        metadata. If ``None``, will automatically read the output of ``read_movielens_df_user()``.
        The DataFrame passed in is not modified

    Returns
    -------
    metadata_df: pd.DataFrame

    """
    if df_user is None:
        df_user = read_movielens_df_user()

    # format user occupation
    df_occupation = pd.get_dummies(df_user['occupation'], prefix='occupation')
    df_occupation = df_occupation.sort_index(axis=1)

    # format user gender on a copy so the caller's ``df_user`` keeps its original ``gender`` values
    user_metadata_df = df_user[['age', 'gender']].copy()
    user_metadata_df['gender'] = user_metadata_df['gender'].replace({'F': 1, 'M': 0})

    # format final metadata structure
    user_metadata_df = user_metadata_df.merge(df_occupation,
                                              left_index=True,
                                              right_index=True)

    return user_metadata_df
2 changes: 1 addition & 1 deletion docs/source/models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ While each model's API differs slightly, generally, the training procedure for e
When we have side-data about items, this can be incorporated directly into the loss function of the model. For details on this, see :ref:`Losses`.

Hybrid Collie models also allow incorporating this side-data directly into the model. For an in-depth example of this, see :ref:`Tutorials`.
Hybrid Collie models allow incorporating side-data about items and/or users directly into the model. For an in-depth example of this, see :ref:`Tutorials`.

**Creating a Custom Architecture**

Expand Down
4 changes: 4 additions & 0 deletions docs/source/movielens.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ Format MovieLens 100K Item Metadata Data
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: collie.movielens.get_movielens_metadata

Format MovieLens 100K User Metadata Data
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: collie.movielens.get_user_metadata


MovieLens Model Training Pipeline
---------------------------------
Expand Down
26 changes: 20 additions & 6 deletions tests/fixtures/model_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def untrained_implicit_model_no_val_data(train_val_implicit_data):
def models_trained_for_one_step(request,
train_val_implicit_sample_data,
movielens_metadata_df,
user_metadata_df,
movielens_implicit_df,
train_val_implicit_pandas_data,
gpu_count):
Expand Down Expand Up @@ -392,7 +393,9 @@ def models_trained_for_one_step(request,
val=val,
item_metadata=movielens_metadata_df,
trained_model=implicit_model,
metadata_layers_dims=metadata_layers_dims,
item_metadata_layers_dims=metadata_layers_dims,
user_metadata=user_metadata_df,
user_metadata_layers_dims=metadata_layers_dims,
freeze_embeddings=True,
dropout_p=0.15,
loss='warp',
Expand All @@ -411,7 +414,9 @@ def models_trained_for_one_step(request,
val=val,
item_metadata=movielens_metadata_df,
trained_model=implicit_model,
metadata_layers_dims=metadata_layers_dims,
item_metadata_layers_dims=metadata_layers_dims,
user_metadata=user_metadata_df,
user_metadata_layers_dims=metadata_layers_dims,
freeze_embeddings=False,
dropout_p=0.15,
loss='bpr',
Expand Down Expand Up @@ -448,7 +453,9 @@ def models_trained_for_one_step(request,
val=val,
item_metadata=movielens_metadata_df,
embedding_dim=10,
metadata_layers_dims=metadata_layers_dims,
item_metadata_layers_dims=metadata_layers_dims,
user_metadata=user_metadata_df,
user_metadata_layers_dims=metadata_layers_dims,
lr=1e-1,
optimizer='adam',
**additional_kwargs)
Expand Down Expand Up @@ -516,6 +523,7 @@ def models_trained_for_one_step(request,
def explicit_models_trained_for_one_step(request,
train_val_explicit_sample_data,
movielens_metadata_df,
user_metadata_df,
gpu_count):
train, val = train_val_explicit_sample_data

Expand Down Expand Up @@ -598,7 +606,9 @@ def explicit_models_trained_for_one_step(request,
val=val,
item_metadata=movielens_metadata_df,
trained_model=implicit_model,
metadata_layers_dims=None,
item_metadata_layers_dims=None,
user_metadata=user_metadata_df,
user_metadata_layers_dims=None,
freeze_embeddings=True,
dropout_p=0.15,
loss='mae',
Expand All @@ -619,7 +629,9 @@ def explicit_models_trained_for_one_step(request,
val=val,
item_metadata=movielens_metadata_df,
trained_model=implicit_model,
metadata_layers_dims=None,
item_metadata_layers_dims=None,
user_metadata=user_metadata_df,
user_metadata_layers_dims=None,
freeze_embeddings=False,
dropout_p=0.15,
loss='mse',
Expand All @@ -636,7 +648,9 @@ def explicit_models_trained_for_one_step(request,
val=val,
item_metadata=movielens_metadata_df,
embedding_dim=10,
metadata_layers_dims=[16, 12],
item_metadata_layers_dims=[16, 12],
user_metadata=user_metadata_df,
user_metadata_layers_dims=[16, 12],
lr=1e-1,
loss='mae',
optimizer='adam')
Expand Down
12 changes: 12 additions & 0 deletions tests/fixtures/movielens_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
from collie.cross_validation import random_split, stratified_split
from collie.interactions import ExplicitInteractions, Interactions
from collie.movielens import (get_movielens_metadata,
get_user_metadata,
read_movielens_df,
read_movielens_df_item,
read_movielens_df_user,
read_movielens_posters_df)
from collie.utils import convert_to_implicit

Expand All @@ -30,6 +32,11 @@ def movielens_df_item():
return read_movielens_df_item()


@pytest.fixture(scope='session')
def movielens_df_user():
    """Session-scoped fixture returning the output of ``read_movielens_df_user()``."""
    df_user = read_movielens_df_user()

    return df_user


@pytest.fixture(scope='session')
def movielens_posters_df():
    """Session-scoped fixture returning the output of ``read_movielens_posters_df()``."""
    posters_df = read_movielens_posters_df()

    return posters_df
Expand All @@ -40,6 +47,11 @@ def movielens_metadata_df():
return get_movielens_metadata()


@pytest.fixture(scope='session')
def user_metadata_df():
    """Session-scoped fixture returning the output of ``get_user_metadata()`` with defaults."""
    metadata_df = get_user_metadata()

    return metadata_df


@pytest.fixture(scope='session')
def movielens_implicit_interactions(movielens_implicit_df):
return Interactions(users=movielens_implicit_df['user_id'],
Expand Down

0 comments on commit 0e3789b

Please sign in to comment.