In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
from ast import literal_eval
from datetime import datetime

In [2]:
# Load the three raw CSV files that make up the movie metadata.
credits = pd.read_csv('../data/credits.csv')
keywords = pd.read_csv('../data/keywords.csv')

# Columns removed from the metadata immediately after loading.
dropped_columns = ['belongs_to_collection', 'homepage', 'imdb_id', 'poster_path', 'status', 'title', 'video']
movies = pd.read_csv('../data/movies_metadata.csv').drop(columns=dropped_columns)

# Rows whose `id` has an incorrect data type cannot be cast to int64. They used
# to be removed by hard-coded row labels (19730, 29503, 35587); detecting them
# from the data keeps this cell working if the CSV is re-exported or re-ordered.
movies = (
    movies
    .assign(id=pd.to_numeric(movies['id'], errors='coerce'))
    .dropna(subset=['id'])
    .astype({'id': 'int64'})
)

# Inner joins: only movies present in all three files are kept.
df = movies.merge(keywords, on='id').merge(credits, on='id')

# Give these three columns neutral defaults first, so the blanket dropna()
# only discards rows that are missing a value in some other column.
df = df.fillna({'original_language': '', 'runtime': 0, 'tagline': ''}).dropna()
def get_text(text, obj='name'):
    """Extract one field from every entry of a stringified collection.

    Args:
        text: String holding a Python literal (parsed with ``literal_eval``),
            e.g. ``"[{'id': 1, 'name': 'Drama'}]"``.
        obj: Key to read from each entry.

    Returns:
        The value stored under ``obj`` when there is exactly one entry
        (returned as-is, without joining); otherwise the values joined with
        ``', '`` (an empty string when there are no entries).
    """
    entries = literal_eval(text)

    # A lone entry is handed back untouched rather than passed through join().
    if len(entries) == 1:
        return next(iter(entries))[obj]

    return ', '.join([entry[obj] for entry in entries])
    
# Replace each stringified list column with a comma-separated string of its 'name' values.
name_columns = [
    'genres',
    'production_companies',
    'production_countries',
    'crew',
    'spoken_languages',
    'keywords',
]
for column in name_columns:
    df[column] = df[column].apply(get_text)

# New columns: the cast list is split into character names and actor names.
df['characters'] = df['cast'].apply(get_text, obj='character')
df['actors'] = df['cast'].apply(get_text)

# Drop the raw cast column, keep the first row for every original_title,
# and renumber the rows from zero.
df = (
    df
    .drop(columns='cast')
    .drop_duplicates(subset='original_title')
    .reset_index(drop=True)
)

  movies = pd.read_csv('../data/movies_metadata.csv').\


In [3]:
# Function to add new ratings to the dataset
def add_rating(user_id, movie_title, rating):
    """Append a single rating to the global ``ratings_df`` and call ``update_model``.

    Args:
        user_id: User identifier; stored as ``str(user_id)``.
        movie_title: Value stored in the ``original_title`` column.
        rating: Rating value; stored as ``float(rating)``.

    The new row is stamped with the current local time in ``date``. Only the
    four columns built here are provided for the new row.
    """
    global ratings_df

    # One-row frame holding just the new rating.
    row_values = {
        'userId': [str(user_id)],
        'original_title': [movie_title],
        'rating': [float(rating)],
        'date': [datetime.now()],
    }
    new_rating = pd.DataFrame(row_values)

    # Rebind the global to the extended frame, then refresh the model state.
    combined = [ratings_df, new_rating]
    ratings_df = pd.concat(combined, ignore_index=True)
    update_model()

In [3]:
raw_ratings = pd.read_csv('../data/ratings.csv')

# Columns of the movie metadata that are attached to every rating.
movie_info = df[['id', 'original_title', 'genres', 'overview']]

ratings_df = (
    raw_ratings
    # Convert the numeric `timestamp` into a datetime (local time) stored in `date`.
    .assign(date=raw_ratings['timestamp'].apply(datetime.fromtimestamp))
    .drop(columns='timestamp')
    # Left join, then discard ratings whose movieId has no match in the metadata.
    .merge(movie_info, left_on='movieId', right_on='id', how='left')
    .dropna(subset=['id'])
    .drop(columns='id')
    .reset_index(drop=True)
)

ratings_df.head()

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
0,1,110,1.0,2015-03-10 05:52:09,Trois couleurs : Rouge,"Drama, Mystery, Romance",Red This is the third film from the trilogy by...
1,1,147,4.5,2015-03-10 06:07:15,Les Quatre Cents Coups,Drama,"For young Parisian boy Antoine Doinel, life is..."
2,1,858,5.0,2015-03-10 05:52:03,Sleepless in Seattle,"Comedy, Drama, Romance",A young boy who tries to set his dad up on a d...
3,1,1246,5.0,2015-03-10 05:52:36,Rocky Balboa,Drama,When he loses a highly publicized virtual boxi...
4,1,1968,4.0,2015-03-10 06:02:28,Fools Rush In,"Drama, Comedy, Romance",Alex Whitman (Matthew Perry) is a designer fro...


In [4]:
# Build the id/title lookup in a single expression. rename() returns a new
# DataFrame, so nothing is modified in place on a column slice of `df`
# (the in-place rename raised SettingWithCopyWarning).
movies_df = df[['id', 'original_title']].rename(columns={'id': 'movieId'})
movies_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df.rename(columns={'id':'movieId'}, inplace=True)


Unnamed: 0,movieId,original_title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [5]:
# User ids are handled as strings from here on.
ratings_df['userId'] = ratings_df['userId'].astype(str)

rating_columns = ['userId', 'original_title', 'rating']
ratings = tf.data.Dataset.from_tensor_slices(dict(ratings_df[rating_columns]))

# NOTE: this rebinds `movies`, which previously held the metadata DataFrame,
# to a tf.data dataset of titles.
movies = tf.data.Dataset.from_tensor_slices(dict(movies_df[['original_title']]))

# Keep the three rating features, with the rating passed through float().
ratings = ratings.map(lambda row: {
    "original_title": row["original_title"],
    "userId": row["userId"],
    "rating": float(row["rating"])
})

# Reduce each movie record to its bare title.
movies = movies.map(lambda row: row["original_title"])

In [6]:
# Batch both datasets so they can be materialised a thousand records at a time.
user_ids = ratings.batch(1_000).map(lambda batch: batch["userId"])
movie_titles = movies.batch(1_000)

# Materialise the batches, flatten them into one array each, and de-duplicate.
title_batches = list(movie_titles)
user_id_batches = list(user_ids)
unique_movie_titles = np.unique(np.concatenate(title_batches))
unique_user_ids = np.unique(np.concatenate(user_id_batches))

movie_count = len(unique_movie_titles)
user_count = len(unique_user_ids)
print('Unique Movies: {}'.format(movie_count))
print('Unique users: {}'.format(user_count))

Unique Movies: 42373
Unique users: 265509


In [23]:
def update_model():
    """Recompute the user-id vocabulary and rebuild the movie model and index.

    Reads the module-level ``ratings_df``, ``movies`` and
    ``unique_movie_titles`` and rebinds the globals ``unique_user_ids``,
    ``movie_model`` and ``index``. ``unique_movie_titles`` is declared global
    but is not recomputed here.

    NOTE(review): the embedding size is derived from ``unique_movie_titles``
    and the weights come from ``../model/tfrs.h5`` - presumably the title
    vocabulary has to stay as it was when the weights were saved; confirm
    before recomputing it in this function.
    NOTE(review): ``load_weights(by_name=True)`` matches layers by name;
    confirm the layer names of this freshly built model match those stored in
    the weights file.
    """
    global unique_movie_titles, unique_user_ids, movie_model, index

    ratings = tf.data.Dataset.from_tensor_slices(
        dict(ratings_df[['userId', 'original_title', 'rating']]))

    ratings = ratings.map(lambda x: {
        "original_title": x["original_title"],
        "userId": x["userId"],
        "rating": float(x["rating"])
    })

    # Only the user ids are needed from the ratings; the previously built
    # (and never used) per-batch title dataset has been removed.
    user_ids = ratings.batch(1_000).map(lambda x: x["userId"])

    unique_user_ids = np.unique(np.concatenate(list(user_ids)))

    # Update the movie model and index
    movie_model = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None),
        # One extra row on top of the vocabulary size.
        tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 64)
    ])
    movie_model.build(input_shape=())
    movie_model.load_weights('../model/tfrs.h5', by_name=True)

    # NOTE(review): the movie model is the only model given to BruteForce,
    # so queries sent to `index` go through the movie-title lookup as well -
    # confirm that no separate user/query model is intended here.
    index = tfrs.layers.factorized_top_k.BruteForce(movie_model)
    # Candidates: (titles, title embeddings), 100 titles per batch.
    index.index_from_dataset(
        tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(movie_model)))
    )

In [26]:
def predict_movie(user, top_n=3):
    """Print the ``top_n`` titles retrieved for ``user`` and save them to a file.

    Builds the movie embedding model, loads weights from
    ``../model/tfrs.h5``, indexes the module-level ``movies`` dataset, prints
    every retrieved title with its genres from ``df`` and finally passes the
    titles to ``save_titles_to_file``.

    Args:
        user: User identifier; converted with ``str`` before the lookup.
        top_n: Number of titles to print and save.

    NOTE(review): the movie model is the only model given to BruteForce, so
    the user id string is looked up in the movie-title vocabulary - confirm
    that no separate user/query model is intended here.
    NOTE(review): ``load_weights(by_name=True)`` matches layers by name;
    confirm the layer names of this freshly built model match those stored in
    the weights file.
    """
    # Create the movie model and load the pre-trained weights.
    movie_model = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None),
        tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 64)
    ])
    movie_model.build(input_shape=())
    movie_model.load_weights('../model/tfrs.h5', by_name=True)

    index = tfrs.layers.factorized_top_k.BruteForce(movie_model)

    # Index from the entire movies dataset.
    index.index_from_dataset(
        tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(movie_model)))
    )

    # Request exactly top_n results. Without an explicit k the layer returns
    # only its default number of results, silently truncating a larger top_n.
    # A missing or non-positive top_n keeps the layer default.
    requested_k = top_n if (top_n is not None and top_n > 0) else None
    _, titles = index(tf.constant([str(user)]), k=requested_k)

    # Decode the byte strings once and reuse them for printing and saving.
    recommended_titles = [title.decode("utf-8") for title in titles[0, :top_n].numpy()]

    print('Top {} recommendations for user {}:\n'.format(top_n, user))
    for rank, title in enumerate(recommended_titles, start=1):
        # Print movie title
        print('{}. {}'.format(rank, title))

        # Retrieve and print the genres of the recommended movie
        movie_row = df[df['original_title'] == title]
        genres_overview = movie_row['genres'].values[0]
        print('Genres Overview:', genres_overview)
        print('\n')

    save_titles_to_file(user, recommended_titles)

In [8]:
# All ratings stored for user '101' (ids are strings at this point).
ratings_df.loc[ratings_df['userId'] == '101']

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
3814,101,3,3.0,1996-12-16 05:45:25,Varjoja paratiisissa,"Drama, Comedy","An episode in the life of Nikander, a garbage ..."
3815,101,5,4.0,1996-12-16 05:45:25,Four Rooms,"Crime, Comedy",It's Ted the Bellhop's first night on the job....
3816,101,6,5.0,1996-12-16 05:45:25,Judgment Night,"Action, Thriller, Crime","While racing to a boxing match, Frank, Mike, J..."
3817,101,17,3.0,1996-12-16 05:44:22,The Dark,"Horror, Thriller, Mystery",Adèle and her daughter Sarah are traveling on ...
3818,101,25,2.0,1996-12-16 05:44:22,Jarhead,"Drama, War",Jarhead is a film about a US Marine Anthony Sw...
3819,101,62,3.0,1996-12-16 05:44:22,2001: A Space Odyssey,"Science Fiction, Mystery, Adventure",Humanity finds a mysterious object buried bene...
3820,101,95,4.0,1996-12-16 05:44:22,Armageddon,"Action, Thriller, Science Fiction, Adventure",When an asteroid threatens to collide with Ear...
3821,101,104,3.0,1996-12-16 05:46:24,Lola rennt,"Action, Drama, Thriller",Lola receives a phone call from her boyfriend ...
3822,101,112,3.0,1996-12-16 05:45:25,Italiensk for begyndere,"Comedy, Drama, Romance",This fifth Danish Dogme film is about six vuln...
3823,101,141,3.0,1996-12-16 05:44:22,Donnie Darko,"Fantasy, Drama, Mystery","After narrowly escaping a bizarre accident, a ..."


In [33]:
def save_titles_to_file(user, titles, filename='recommended_titles.txt'):
    """Write one ``user;title1;title2;...`` line to ``filename``.

    The file is opened in write mode, so every call replaces the previous
    content; nothing is appended.

    Args:
        user: User identifier; converted with ``str``.
        titles: Iterable of title strings.
        filename: Path of the file to write.
    """
    # Combine user and movie titles into a single ';'-separated line.
    data_line = ';'.join([str(user), *titles])

    # Titles can contain non-ASCII characters, so pin UTF-8 instead of relying
    # on the platform's default encoding.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(data_line + '\n')

In [34]:
# Show (and save) five recommendations for user 123.
predict_movie(user=123, top_n=5)

Top 5 recommendations for user 123:

1. Jurassic World
Genres Overview: Action, Adventure, Science Fiction, Thriller


2. 活着
Genres Overview: Drama, Romance, War


3. 旺角黑夜
Genres Overview: Drama, Action, Thriller, Foreign


4. Nueve Reinas
Genres Overview: Crime, Drama, Thriller


5. The Prodigal
Genres Overview: Drama


