In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from utils.dataloader import DataLoader
import torch
import networkx as nx
from collections import Counter

In [2]:
!pip install torch
!pip install torch-geometric



In [None]:
dataset_size = "100k"

movie_data = DataLoader(size=dataset_size)

# Movie metadata (title / year / genres processed, genres not expanded to binary columns).
items_data = movie_data.load_items(
    process_title=True,
    process_year=True,
    process_genres=True,
    genres_as_binary=False,
)

# User features; the two positional flags are passed through to the project loader unchanged.
users_data = movie_data.load_user_features(True, True)

# Training ratings.
ratings = pd.read_csv("./data/train_set.csv")

# Director / actor information fetched separately, renamed to match the movie table.
fetched_movies = pd.read_csv("./data/fetched_movies.csv")
semantics_info = fetched_movies[['MovieLens_ID', 'Director', 'Actors']].rename(
    columns={'MovieLens_ID': 'movie_id', 'Actors': 'actors', 'Director': 'director'}
)

# Left-join the semantic columns onto the movies, then expose the key as `item`.
items_data = pd.merge(
    items_data, semantics_info, on='movie_id', how='left'
).rename(columns={'movie_id': 'item'})

In [6]:

def getSemanticInfoList(semantic_column):
    """Collect the distinct comma-separated names that occur in a column.

    Args:
        semantic_column: Iterable of cells. A cell is either missing (anything
            ``pd.isnull`` reports as null) or a value whose string form is a
            comma-separated list of names, e.g. ``"Tom Hanks, Tim Allen"``.

    Returns:
        list[str]: The unique names with surrounding whitespace stripped, in
        sorted order. Sorting matters because callers use the position of a
        name in this list as its feature index; the iteration order of a set
        of strings is not stable from one interpreter session to the next.
    """
    unique_names = set()

    for cell in semantic_column:
        if pd.isnull(cell):
            continue
        # A set absorbs repeated names across cells.
        unique_names.update(name.strip() for name in str(cell).split(","))

    return sorted(unique_names)

In [7]:
# Peek at the merged `actors` column: comma-separated names, NaN where no cast was fetched.
items_data.actors

0       Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...
1       Pierce Brosnan, Sean Bean, Izabella Scorupco, ...
2       Tim Roth, Jennifer Beals, Antonio Banderas, Va...
3       John Travolta, Gene Hackman, Rene Russo, Danny...
4       Sigourney Weaver, Holly Hunter, Dermot Mulrone...
                              ...                        
1677                     Aleksei Ananishnov, Gudrun Geyer
1678                                                  NaN
1679    Gwyneth Paltrow, John Hannah, John Lynch, Jean...
1680                                      Martin Lawrence
1681    Donald Sutherland, Brad Dourif, Vittorio Mezzo...
Name: actors, Length: 1682, dtype: object

In [9]:
# Vocabularies used to turn categorical values into one-hot / multi-hot columns.
# Every list is sorted: the position of a value in its list is the feature
# column index, and `list(set(...))` of strings yields a different order on
# every interpreter session, which made the saved tensors non-reproducible.
user_list = sorted(set(ratings.user.tolist()) | set(users_data.userid))
item_list = sorted(set(ratings.item.tolist()) | set(items_data.item))

rate_list = sorted(set(ratings.rating.tolist()))
genre_list = ["unknown",
        "Action",
        "Adventure",
        "Animation",
        "Children's",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film-Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci-Fi",
        "Thriller",
        "War",
        "Western",]
actor_list = getSemanticInfoList(items_data.actors)
director_list = getSemanticInfoList(items_data.director)

gender_list = sorted(set(users_data.gender.tolist()))
age_list = sorted(set(users_data.age.tolist()))

occupation_list = sorted(set(users_data.occupation.tolist()))
zipcode_list = sorted(set(users_data.zipcode.tolist()))


In [10]:
def _split_semantic_names(cell):
    """Split a comma-separated cell into stripped names; a missing cell gives []."""
    if pd.isnull(cell):
        return []
    names = [name.strip() for name in str(cell).split(",")]
    # Keep the historical behaviour of ignoring literal 'nan' tokens.
    return [name for name in names if name.lower() != 'nan']


def _multi_hot(names, vocabulary):
    """Return (1 x len(vocabulary) long tensor, 1-based ids) for the given names."""
    vector = torch.zeros(1, len(vocabulary)).long()
    ids = []
    for name in names:
        idx = vocabulary.index(name)  # ValueError if the name is not in the vocabulary
        vector[0, idx] = 1
        ids.append(idx + 1)  # ids start from 1, not 0
    return vector, ids


def item_converting(row, director_list, actor_list):
    """Encode one movie row as genre / director / actor indicator vectors.

    Names are parsed the same way ``getSemanticInfoList`` builds its
    vocabulary (split on "," then strip), so a cell such as ``"A,B"`` or
    ``"A , B"`` resolves instead of raising ``ValueError``.

    Args:
        row: Mapping with ``'genres'`` ("|"-separated genre names),
            ``'director'`` and ``'actors'`` (comma-separated names or missing).
        director_list: Director vocabulary; position is the feature index.
        actor_list: Actor vocabulary; position is the feature index.

    Returns:
        tuple: ``(genre_vector, genre+director+actor_vector, director_ids,
        actor_ids)``. Vectors are ``1 x N`` long tensors, ids are 1-based.

    Raises:
        ValueError: If a genre, director or actor is not in its vocabulary.
    """
    # Genre vocabulary comes from the notebook-level `genre_list`.
    genre_idx = torch.zeros(1, len(genre_list)).long()
    for genre in str(row['genres']).split("|"):
        genre_idx[0, genre_list.index(genre)] = 1

    director_idx, director_id = _multi_hot(
        _split_semantic_names(row['director']), director_list
    )
    actor_idx, actor_id = _multi_hot(
        _split_semantic_names(row['actors']), actor_list
    )

    return (
        torch.cat((genre_idx,), 1),
        torch.cat((genre_idx, director_idx, actor_idx), 1),
        director_id,
        actor_id,
    )


In [11]:
# Per-movie lookup tables, all keyed by item id.
movie_fea_hete, movie_fea_homo = {}, {}
m_directors, m_actors = {}, {}
for _, movie_row in items_data.iterrows():
    item_key = movie_row['item']
    genre_vec, full_vec, director_ids, actor_ids = item_converting(
        movie_row, director_list, actor_list
    )
    movie_fea_homo[item_key] = genre_vec   # genre only
    movie_fea_hete[item_key] = full_vec    # genre + director + actor
    m_directors[item_key] = director_ids
    m_actors[item_key] = actor_ids

In [12]:
def user_converting(row, gender_list, age_list, occupation_list, zipcode_list):
    """Encode one user row as concatenated indicator vectors.

    Gender, age and zipcode are stringified and split on "," before lookup;
    occupation is cast to int and looked up directly. The result is a
    ``1 x (len(gender_list)+len(age_list)+len(occupation_list)+len(zipcode_list))``
    long tensor in that order.
    """
    def mark(vocabulary, values):
        # Set a 1 at the vocabulary position of every value.
        flags = torch.zeros(1, len(vocabulary)).long()
        for value in values:
            flags[0, vocabulary.index(value)] = 1
        return flags

    segments = [
        mark(gender_list, str(row['gender']).split(",")),
        mark(age_list, str(row['age']).split(",")),
        mark(occupation_list, [int(row['occupation'])]),
        mark(zipcode_list, str(row['zipcode']).split(",")),
    ]
    return torch.cat(segments, 1)

In [13]:
# User id -> concatenated one-hot feature tensor.
user_fea = {
    user_row['userid']: user_converting(
        user_row, gender_list, age_list, occupation_list, zipcode_list
    )
    for _, user_row in users_data.iterrows()
}

In [14]:
def getUserMovieInteractions(ratings):
    """Group the rated item ids by user.

    Args:
        ratings: DataFrame with ``user`` and ``item`` columns, one row per rating.

    Returns:
        dict[int, list[int]]: user id -> item ids, in the order the rows
        appear in ``ratings`` (repeated ratings are kept).
    """
    user_movie_interactions = {}

    # Walk the two columns directly; iterrows() would build a Series for
    # every row just to read two scalars out of it.
    for user_id, item in zip(ratings['user'], ratings['item']):
        user_movie_interactions.setdefault(int(user_id), []).append(int(item))

    return user_movie_interactions

In [15]:
# User -> rated movies (the "UM" relation) built from the training ratings.
user_movie_interactions = getUserMovieInteractions(ratings) # shape of the mapping: {u1: [m1, m2], u2: [m2, m3]}

In [16]:
import collections


def reverse_dict(d):
    """Invert ``key -> list of values`` into ``value -> list of keys``.

    Keys are appended in the iteration order of ``d``; a plain ``dict`` is returned.
    """
    inverted = collections.defaultdict(list)
    pairs = ((value, key) for key, values in d.items() for value in values)
    for value, key in pairs:
        inverted[value].append(key)
    return dict(inverted)

In [17]:
# Reverse lookups. m_actors / m_directors hold 1-based ids per movie, so the
# keys of a_movies / d_movies are those ids, not names.
a_movies = reverse_dict(m_actors) # actor id -> movies the actor appears in
d_movies = reverse_dict(m_directors) # director id -> movies by that director
movie_user_interactions = reverse_dict(user_movie_interactions) # movie -> users who rated it

In [18]:
u_m_u_movies = {}  # UMUM: {user: {movie: set of movies rated by users who rated `movie`}}
u_m_a_movies = {}  # UMAM: {user: {movie: set of movies sharing an actor with `movie`}}
u_m_d_movies = {}  # UMDM: {user: {movie: set of movies sharing a director with `movie`}}

# The neighbourhood reached from a movie through its actors, directors or
# raters does not depend on which user we started from, so it is computed once
# per movie and copied, instead of being rebuilt for every (user, movie) pair.
_umam_by_movie = {}
_umdm_by_movie = {}
_umum_by_movie = {}

for user, movies in user_movie_interactions.items():
    u_m_u_movies[user] = {}
    u_m_a_movies[user] = {}
    u_m_d_movies[user] = {}
    for movie in movies:
        # UMAM: User -> Movie -> Actor -> Movie
        if movie not in _umam_by_movie:
            reached = {movie}
            for actor in m_actors.get(movie, []):
                reached.update(a_movies[actor])
            _umam_by_movie[movie] = reached
        u_m_a_movies[user][movie] = set(_umam_by_movie[movie])

        # UMDM: User -> Movie -> Director -> Movie
        if movie not in _umdm_by_movie:
            reached = {movie}
            for director in m_directors.get(movie, []):
                reached.update(d_movies[director])
            _umdm_by_movie[movie] = reached
        u_m_d_movies[user][movie] = set(_umdm_by_movie[movie])

        # UMUM: User -> Movie -> User -> Movie
        if movie not in _umum_by_movie:
            reached = {movie}  # the movie itself, so the set is never empty for the support set
            for _user in movie_user_interactions.get(movie, []):
                reached.update(user_movie_interactions.get(_user, []))
            _umum_by_movie[movie] = reached
        umum_neighbours = set(_umum_by_movie[movie])
        umum_neighbours.update(movies)  # the user's own movies are always included
        u_m_u_movies[user][movie] = umum_neighbours
print(len(u_m_u_movies), len(u_m_a_movies), len(u_m_d_movies))


943 943 943


In [None]:
features_hetes_length = len(genre_list)+len(actor_list)+len(director_list)


def _sum_meta_path_features(neighbours_by_movie, movie_ids, movie_features, length):
    """Sum the feature vectors of every neighbour of every movie in ``movie_ids``.

    A neighbour reached from several movies is counted once per occurrence.
    Occurrences are tallied first so each distinct neighbour costs a single
    tensor operation; the sums are integer-valued, so the result equals
    one-at-a-time accumulation while the totals stay exactly representable in
    float32 (below 2**24).
    """
    occurrences = Counter()
    for m_id in movie_ids:
        occurrences.update(neighbours_by_movie[m_id])

    total = torch.zeros(1, length)
    for x, times in occurrences.items():
        total += times * movie_features[x]  # genre, director, actor info
    return total


umum_embeddings = {}
umam_embeddings = {}
umdm_embeddings = {}

for u_id in user_list:
    # user_list also contains users taken from users_data; one without any
    # training rating gets an all-zero embedding instead of raising KeyError.
    rated_movies = user_movie_interactions.get(u_id, [])

    if rated_movies:
        umum_features_hetes = _sum_meta_path_features(
            u_m_u_movies[u_id], rated_movies, movie_fea_hete, features_hetes_length
        )
        umam_features_hetes = _sum_meta_path_features(
            u_m_a_movies[u_id], rated_movies, movie_fea_hete, features_hetes_length
        )
        umdm_features_hetes = _sum_meta_path_features(
            u_m_d_movies[u_id], rated_movies, movie_fea_hete, features_hetes_length
        )
    else:
        umum_features_hetes = torch.zeros(1, features_hetes_length)
        umam_features_hetes = torch.zeros(1, features_hetes_length)
        umdm_features_hetes = torch.zeros(1, features_hetes_length)

    umum_embeddings[u_id] = umum_features_hetes
    umam_embeddings[u_id] = umam_features_hetes
    umdm_embeddings[u_id] = umdm_features_hetes
    

In [68]:
# Inspect the UMUM aggregate left over from the final iteration of the
# embedding loop, i.e. the vector of the last user in user_list.
umum_features_hetes

tensor([[  195., 39373., 21951.,  ...,    83.,   334.,   152.]])

In [None]:
# Persist the three meta-path embedding maps and the user content features.
for target_path, payload in (
    ("./data/umum_embeddings.pt", umum_embeddings),
    ("./data/umam_embeddings.pt", umam_embeddings),
    ("./data/umdm_embeddings.pt", umdm_embeddings),
    ("./data/content_based_embeddings.pt", user_fea),
):
    torch.save(payload, target_path)

In [None]:
# Read back the file that the save cell wrote from `user_fea` and display the
# loaded object as a round-trip check.
torch.load("./data/content_based_embeddings.pt")