In [1]:
"""
MovieInfo is a namedTuple that contains the movie information. When created without actual value, default values are assigned to the object.
"""

import collections

class MovieInfo(
    collections.namedtuple(
        "MovieInfo", ["movie_id", "timestamp", "rating", "title", "genres"])):
  """Data holder of basic information of a movie."""
  __slots__ = ()

  def __new__(cls,
              movie_id=0,
              timestamp=0,
              rating=0.0,
              title="",
              genres=""):
    return super(MovieInfo, cls).__new__(cls, movie_id, timestamp, rating,
                                         title, genres)


In [2]:
import pandas as pd


In [4]:
movies_df = pd.read_csv("/content/drive/MyDrive/movie_data/ml-1m//movies.dat", sep="::", names=["movie_id", "title", "genres"], encoding = 'latin-1')

  movies_df = pd.read_csv("/content/drive/MyDrive/movie_data/ml-1m//movies.dat", sep="::", names=["movie_id", "title", "genres"], encoding = 'latin-1')


In [5]:
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
            '/content/drive/MyDrive/movie_data/ml-1m/ratings.dat', sep='::', names=ratings_cols)
ratings["unix_timestamp"] = ratings["unix_timestamp"].apply(int)

  ratings = pd.read_csv(


In [6]:
# creating dictionary of users and ratings using default dict:
user_movie_dict = collections.defaultdict(list)
movie_counts = collections.Counter()

for user_id, movie_id, rating, unix_timestamp in ratings.values:
    user_movie_dict[user_id].append(
        MovieInfo(movie_id=movie_id, timestamp=int(unix_timestamp), rating=rating))
    movie_counts[movie_id] += 1
# Sort per-user timeline by timestamp
for (user_id, context) in user_movie_dict.items():
  context.sort(key=lambda x: x.timestamp)
  user_movie_dict[user_id] = context



In [7]:
#user movie dictionary for user 1
print(user_movie_dict[1])


[MovieInfo(movie_id=3186, timestamp=978300019, rating=4, title='', genres=''), MovieInfo(movie_id=1270, timestamp=978300055, rating=5, title='', genres=''), MovieInfo(movie_id=1721, timestamp=978300055, rating=4, title='', genres=''), MovieInfo(movie_id=1022, timestamp=978300055, rating=5, title='', genres=''), MovieInfo(movie_id=2340, timestamp=978300103, rating=3, title='', genres=''), MovieInfo(movie_id=1836, timestamp=978300172, rating=5, title='', genres=''), MovieInfo(movie_id=3408, timestamp=978300275, rating=4, title='', genres=''), MovieInfo(movie_id=2804, timestamp=978300719, rating=5, title='', genres=''), MovieInfo(movie_id=1207, timestamp=978300719, rating=4, title='', genres=''), MovieInfo(movie_id=1193, timestamp=978300760, rating=5, title='', genres=''), MovieInfo(movie_id=720, timestamp=978300760, rating=3, title='', genres=''), MovieInfo(movie_id=260, timestamp=978300760, rating=4, title='', genres=''), MovieInfo(movie_id=919, timestamp=978301368, rating=4, title='', 

In [8]:
#no of movies user1 has rated:
print(len(user_movie_dict[1]))

53


In [9]:
# movie dictionary:

movies_dict = {
      movie_id: MovieInfo(movie_id=movie_id, title=title, genres=genres)
      for movie_id, title, genres in movies_df.values
  }
movies_dict[0] = MovieInfo()

In [10]:
print(movies_dict[1])

MovieInfo(movie_id=1, timestamp=0, rating=0.0, title='Toy Story (1995)', genres="Animation|Children's|Comedy")


In [11]:
import tensorflow as tf


In [12]:
# creating list of sequences with movie information for every user.

examples = []
max_seq_len = 10
for user in user_movie_dict.values():
  if len(user) <= 10: continue
  for label_idx in range(10, len(user)):
    start_idx = max(0, label_idx - max_seq_len)
    synthetic_session = user[start_idx:label_idx]
    # Pad context with out-of-vocab movie id 0.
    while len(synthetic_session) < max_seq_len:
      synthetic_session.append(MovieInfo()) # adding padding
    label_movie_id = int(user[label_idx].movie_id)
    synthetic_session_movie_id = [int(movie.movie_id) for movie in synthetic_session]
    feature = {
          "synthetic_session_movie_id":
              tf.train.Feature(
                  int64_list=tf.train.Int64List(value=synthetic_session_movie_id)),
          "label_movie_id":
              tf.train.Feature(
                  int64_list=tf.train.Int64List(value=[label_movie_id]))
      }
    tf_example = tf.train.Example(features=tf.train.Features(feature=feature))
    examples.append(tf_example)

In [13]:
# split train and test data:
total_examples = len(examples)
split = int(0.9 * total_examples)
print(total_examples)
train_examples, test_examples = examples[:split], examples[split:]
print(f"{len(train_examples)}: {len(test_examples)}")

939809
845828: 93981


In [14]:
#creating the training file
with tf.io.TFRecordWriter('/content/drive/MyDrive/tensorflow_movie_rec/train_movielens_1m.tfrecord') as file_writer:
    length = len(train_examples)
    progress_bar = tf.keras.utils.Progbar(length)
    for example in train_examples:
      file_writer.write(example.SerializeToString())
      progress_bar.add(1)




In [15]:
#creating the test file
with tf.io.TFRecordWriter('/content/drive/MyDrive/tensorflow_movie_rec/test_movielens_1m.tfrecord') as file_writer:
    length = len(test_examples)
    progress_bar = tf.keras.utils.Progbar(length)
    for example in test_examples:
      file_writer.write(example.SerializeToString())
      progress_bar.add(1)

