In [48]:
import warnings
warnings.filterwarnings('ignore')

In [49]:
# Import MLflow's TensorFlow integration, installing MLflow on the fly when it
# is missing from the kernel's environment.
try:
    import mlflow.tensorflow
except ModuleNotFoundError:
    import importlib
    import subprocess
    import sys

    # Run pip through the kernel's own interpreter so the package lands in the
    # environment this notebook runs in, not whichever `pip` is first on PATH.
    command = [sys.executable, "-m", "pip", "install", "mlflow"]
    # check=True raises CalledProcessError (carrying the captured pip output)
    # when the install fails, instead of falling through to a second,
    # uninformative ModuleNotFoundError.
    subprocess.run(
        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True
    )
    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()
    import mlflow.tensorflow

In [50]:
# Display the MLflow version in use, as a record of the environment.
mlflow.__version__

'1.14.1'

In [51]:
# Turn on MLflow autologging for TensorFlow so the later `model.fit` call is
# tracked as an MLflow run.
mlflow.tensorflow.autolog()

# ライブラリのインポート

In [52]:
# One-time environment setup, left commented out; uncomment to install the
# dependencies used below.
# !pip install --upgrade pip
# !pip install tensorflow==2.4
# !pip install -q tensorflow-recommenders
# !pip install -q --upgrade tensorflow-datasets
# !pip install -q scann

In [53]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# データセットの読み込み

In [54]:
# Interaction data: the user ratings of movies.
ratings = tfds.load(name="movielens/100k-ratings", split="train")
# Catalogue data: the features of every available movie.
movies = tfds.load(name="movielens/100k-movies", split="train")

In [55]:
# Pretty-print one raw ratings record to see which features are available.
for x in ratings.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


In [56]:
# Pretty-print one raw movie record to see which features are available.
for x in movies.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


In [57]:
# Keep only the two features the model consumes: the movie title and the
# id of the user who rated it.
ratings = ratings.map(
    lambda record: dict(
        movie_title=record["movie_title"],
        user_id=record["user_id"],
    )
)
# Reduce the movie catalogue to a dataset of bare title strings.
movies = movies.map(lambda record: record["movie_title"])

今回はレーティングデータ（ユーザーIDと映画タイトル）のみを利用するが、他のコンテキスト特徴量を利用して精度を上げることもできる。

# データセットの分割

In [58]:
# Seed TensorFlow and shuffle once with a frozen order, so the train/test
# split below is identical on every pass over the data.
tf.random.set_seed(42)
shuffled = ratings.shuffle(
    buffer_size=100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# The first 80,000 shuffled ratings are used for training; the following
# 20,000 are held out for evaluation.
train = shuffled.take(count=80_000)
test = shuffled.skip(count=80_000).take(count=20_000)

In [59]:
# Batch the titles and the user ids so they can be pulled into NumPy in a
# handful of large chunks.
movie_titles = movies.batch(1_000)
user_ids = ratings.map(lambda record: record["user_id"]).batch(1_000_000)

# Flatten the batches and deduplicate to obtain the lookup vocabularies.
unique_movie_titles = np.unique(
    np.concatenate(list(movie_titles.as_numpy_iterator()))
)
unique_user_ids = np.unique(
    np.concatenate(list(user_ids.as_numpy_iterator()))
)

# Show the first few titles as a sanity check.
unique_movie_titles[:10]

array([b"'Til There Was You (1997)", b'1-900 (1994)',
       b'101 Dalmatians (1996)', b'12 Angry Men (1957)', b'187 (1997)',
       b'2 Days in the Valley (1996)',
       b'20,000 Leagues Under the Sea (1954)',
       b'2001: A Space Odyssey (1968)',
       b'3 Ninjas: High Noon At Mega Mountain (1998)',
       b'39 Steps, The (1935)'], dtype=object)

# モデルの構築

In [60]:
# Width of the embedding vectors produced by both the user and movie models.
embedding_dimension = 32

In [61]:
# Query tower: turns a raw user-id string into an embedding vector.
user_model = tf.keras.Sequential()
# Step 1: translate the user-id string into an integer index using the
# vocabulary of known user ids.
user_model.add(
    tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_user_ids, mask_token=None))
# Step 2: look up the embedding for that index. One extra row is allocated
# to account for unknown tokens.
user_model.add(
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension))

In [62]:
# Candidate tower: turns a raw movie-title string into an embedding vector.
movie_model = tf.keras.Sequential()
# Step 1: translate the title string into an integer index using the
# vocabulary of known movie titles.
movie_model.add(
    tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None))
# Step 2: look up the embedding for that index. The table holds one row more
# than the vocabulary size.
movie_model.add(
    tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension))

In [63]:
# Top-K retrieval metrics. The candidate set is every movie title, embedded
# by `movie_model` in batches of 128.
metrics = tfrs.metrics.FactorizedTopK(
  candidates=movies.batch(128).map(movie_model)
)

In [64]:
# Retrieval task object; the model's `compute_loss` calls it to obtain the
# loss and update the metrics configured here.
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [65]:
class MovielensModel(tfrs.Model):
  """Two-tower retrieval model over (user id, movie title) pairs.

  The user model and the movie model each map a raw string feature to an
  embedding; the retrieval task turns a batch of paired embeddings into a
  loss (and updates its metrics).
  """

  def __init__(self, user_model, movie_model, retrieval_task=None):
    """Store the two towers and the task used to compute the loss.

    Args:
      user_model: Keras model mapping user-id strings to embeddings.
      movie_model: Keras model mapping movie-title strings to embeddings.
      retrieval_task: Optional task layer used by `compute_loss`. When
        omitted, the notebook-level `task` object is used, which keeps the
        original two-argument construction working unchanged.
    """
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    # Prefer an explicitly injected task so the class does not silently depend
    # on notebook state; fall back to the global `task` for compatibility.
    self.task: tf.keras.layers.Layer = (
        task if retrieval_task is None else retrieval_task)

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the retrieval loss for one batch.

    Args:
      features: Mapping that must contain the "user_id" and "movie_title"
        entries of the batch.
      training: Accepted for interface compatibility; not used by this
        implementation.

    Returns:
      The value returned by the task for the batch's user and movie
      embeddings.
    """
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the movie features and pass them into the movie model,
    # getting embeddings back.
    positive_movie_embeddings = self.movie_model(features["movie_title"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_movie_embeddings)

# 学習

In [66]:
# Assemble the two towers into the trainable retrieval model.
model = MovielensModel(user_model=user_model, movie_model=movie_model)
# Train with Adagrad at a learning rate of 0.1.
model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1),
)

In [67]:
# Shuffle, batch and cache the training split; batch and cache the test split.
# NOTE(review): `cache()` is applied after `shuffle()`/`batch()`, so the cached
# pipeline presumably replays the first epoch's batches on every later epoch.
# Move `cache()` before `shuffle()` if per-epoch reshuffling is wanted - confirm.
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [68]:
# Train for three epochs on the cached training batches.
model.fit(cached_train, epochs=3)

2021/03/14 09:49:41 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '75fc47eb3b5b4d3f968b1a97c2b94e8e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow


Epoch 1/3
Epoch 2/3
Epoch 3/3




<tensorflow.python.keras.callbacks.History at 0x7fd3d90de490>

# 評価

In [69]:
# Evaluate on the held-out test batches; `return_dict=True` returns the
# metrics and losses as a name -> value dictionary.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.000699999975040555,
 'factorized_top_k/top_5_categorical_accuracy': 0.009600000455975533,
 'factorized_top_k/top_10_categorical_accuracy': 0.02215000055730343,
 'factorized_top_k/top_50_categorical_accuracy': 0.12475000321865082,
 'factorized_top_k/top_100_categorical_accuracy': 0.23270000517368317,
 'loss': 28244.771484375,
 'regularization_loss': 0,
 'total_loss': 28244.771484375}

# 予測

In [70]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends movies out of the entire movies dataset.
index.index(movies.batch(100).map(model.movie_model), movies)

# Get recommendations.
_, titles = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :3]}")

Recommendations for user 42: [b'Bridges of Madison County, The (1995)'
 b'Father of the Bride Part II (1995)' b'Rudy (1993)']


# モデルの保存

In [26]:
# Export the query model.
# Round-trip the index through a saved model inside a temporary directory.
# The directory is removed when the `with` block exits; `loaded` is still
# called in a later cell after that point.
with tempfile.TemporaryDirectory() as tmp:
  path = os.path.join(tmp, "model")

  # Save the index.
  index.save(path)

  # Load it back; can also be done in TensorFlow Serving.
  loaded = tf.keras.models.load_model(path)

  # Pass a user id in, get top predicted movie titles back.
  scores, titles = loaded(["42"])

  # Show only the first three titles for the single queried user.
  print(f"Recommendations: {titles[0][:3]}")



INFO:tensorflow:Assets written to: /tmp/tmp9ir699ka/model/assets


INFO:tensorflow:Assets written to: /tmp/tmp9ir699ka/model/assets






Recommendations: [b'Bridges of Madison County, The (1995)'
 b'Father of the Bride Part II (1995)' b'Rudy (1993)']


In [27]:
# Query the reloaded model directly and display its full output for user "42".
loaded(["42"])

(<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
 array([[2.0326157, 1.9691347, 1.912091 , 1.9103494, 1.9034474, 1.8912519,
         1.8807452, 1.833153 , 1.8292801, 1.8137307]], dtype=float32)>,
 <tf.Tensor: shape=(1, 10), dtype=string, numpy=
 array([[b'Bridges of Madison County, The (1995)',
         b'Father of the Bride Part II (1995)', b'Rudy (1993)',
         b'101 Dalmatians (1996)', b'While You Were Sleeping (1995)',
         b'Jack (1996)', b'Sleepless in Seattle (1993)',
         b"Preacher's Wife, The (1996)", b'Michael (1996)',
         b'Lion King, The (1994)']], dtype=object)>)

In [80]:
@tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
def serving(input_array):
    """Serving entry point that forwards a batch of query strings to `index`.

    Args:
        input_array: 1-D string tensor, as fixed by the input signature.

    Returns:
        Whatever the notebook-level `index` layer returns for the given
        inputs.
    """
    recommendations = index(input_array)
    return recommendations

In [81]:
# Export the index to ./test as a SavedModel, registering `serving` as its
# signature.
tf.saved_model.save(index, export_dir='./test', signatures=serving)



INFO:tensorflow:Assets written to: ./test/assets


INFO:tensorflow:Assets written to: ./test/assets


In [76]:
# Query the in-memory index with an explicitly typed string tensor and
# display its full output for user "42".
index(tf.constant(["42"], dtype=tf.string))

(<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
 array([[2.0326157, 1.9691347, 1.912091 , 1.9103494, 1.9034474, 1.8912519,
         1.8807452, 1.833153 , 1.8292801, 1.8137307]], dtype=float32)>,
 <tf.Tensor: shape=(1, 10), dtype=string, numpy=
 array([[b'Bridges of Madison County, The (1995)',
         b'Father of the Bride Part II (1995)', b'Rudy (1993)',
         b'101 Dalmatians (1996)', b'While You Were Sleeping (1995)',
         b'Jack (1996)', b'Sleepless in Seattle (1993)',
         b"Preacher's Wife, The (1996)", b'Michael (1996)',
         b'Lion King, The (1994)']], dtype=object)>)

In [30]:
# Packaging step, left commented out: archive a `model` directory as
# model.tar.gz.
# NOTE(review): the export cell above writes to ./test, not ./model - confirm
# the directory name before enabling.
# !tar -zcvf model.tar.gz model

In [31]:
# Upload step, left commented out: list the available Cloud Storage buckets,
# upload model.tar.gz to the bucket, then copy the `model` directory with
# gsutil.
# NOTE(review): the REGION value below ends in "-a", which looks like a zone
# name rather than a region - confirm before enabling.
# BUCKET_NAME="tensorflow_recommenders_example"
# REGION="asia-northeast1-a"

# from google.cloud import storage
# from google.oauth2 import service_account
# import os
# import json

# client = storage.Client()

# buckets = client.list_buckets()

# for bucket in buckets:
#     pprint.pprint(bucket)

# bucket_name = "tensorflow_recommenders_example"

# file_name = 'model.tar.gz'

# bucket = client.get_bucket(bucket_name)

# blob = bucket.blob(file_name)

# blob.upload_from_filename(file_name)

# # https://cloud.google.com/storage/docs/uploading-objects?hl=ja#storage-upload-object-python
# !gsutil cp -r model gs://tensorflow_recommenders_example

# modelのデプロイ

In [32]:
# Display the TensorFlow version in use, as a record of the environment.
tf.__version__

'2.4.0'