In [1]:
import numpy as np
import pandas as pd
import  scipy.io as sio
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the raw IMDB metadata and keep only the movies that have both a
# first-billed actor and a director; the row index is renumbered afterwards.
required_columns = ['actor_1_name', 'director_name']
movies = (
    pd.read_csv('../raw/IMDB/movie_metadata.csv', encoding='utf-8')
    .dropna(subset=required_columns)
    .reset_index(drop=True)
)

In [3]:
# Extract one class label per movie and drop the movies with unwanted genres.
# 0 for action, 1 for comedy, 2 for drama, -1 for others
# The label comes from the first target genre found in the '|'-separated
# `genres` string.
# NOTE(review): a movie tagged with several target genres is labelled by
# whichever one appears first; confirm that such movies are not meant to be
# excluded instead.
genre_to_label = {'Action': 0, 'Comedy': 1, 'Drama': 2}
movie_label = np.full(len(movies), -1, dtype=int)
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for movie_idx, genres in movies['genres'].items():
    for genre in genres.split('|'):
        if genre in genre_to_label:
            movie_label[movie_idx] = genre_to_label[genre]
            break
# Remove unlabelled movies from both the frame and the label vector so the two
# stay aligned on the renumbered 0..n-1 index.
unwanted_idx = np.where(movie_label == -1)[0]
movies = movies.drop(unwanted_idx).reset_index(drop=True)
movie_label = np.delete(movie_label, unwanted_idx, 0)

In [4]:
# Collect the unique director names and the unique actor names (over all three
# actor columns), each as an alphabetically sorted list.
directors = sorted(set(movies['director_name'].dropna()))
actor_columns = ('actor_1_name', 'actor_2_name', 'actor_3_name')
actors = sorted({name for column in actor_columns for name in movies[column].dropna()})

In [5]:
# Build the symmetric adjacency matrix of the graph made of movies, directors
# and actors. Nodes are laid out as [movies | directors | actors].
# type_mask: 0 for movies, 1 for directors, 2 for actors
director_offset = len(movies)
actor_offset = len(movies) + len(directors)
dim = len(movies) + len(directors) + len(actors)
type_mask = np.zeros((dim), dtype=int)
type_mask[director_offset:actor_offset] = 1
type_mask[actor_offset:] = 2

# Name -> position tables give constant-time lookups instead of scanning the
# name lists with `in` / `.index()` for every row.
director_position = {name: pos for pos, name in enumerate(directors)}
actor_position = {name: pos for pos, name in enumerate(actors)}

adjM = np.zeros((dim, dim), dtype=int)
for movie_idx, row in movies.iterrows():
    # Missing names (NaN) are not keys of the tables, so they add no edge.
    director_idx = director_position.get(row['director_name'])
    if director_idx is not None:
        adjM[movie_idx, director_offset + director_idx] = 1
        adjM[director_offset + director_idx, movie_idx] = 1
    for actor_column in ('actor_1_name', 'actor_2_name', 'actor_3_name'):
        actor_idx = actor_position.get(row[actor_column])
        if actor_idx is not None:
            adjM[movie_idx, actor_offset + actor_idx] = 1
            adjM[actor_offset + actor_idx, movie_idx] = 1


In [6]:
# extract bag-of-word representations of plot keywords for each movie
# X is a sparse matrix; words appearing in fewer than 2 movies are dropped
vectorizer = CountVectorizer(min_df=2)
movie_X = vectorizer.fit_transform(movies['plot_keywords'].fillna('').values)
# assign features to directors and actors as the means of their associated movies' features
adjM_da2m = adjM[len(movies):, :len(movies)]
# Row-normalise with a sparse diagonal scaling. This yields the same values as
# multiplying by a dense np.diag(...) but never allocates the dense
# (directors + actors)^2 matrix; adjM_da2m_normalized is therefore sparse (CSR).
inverse_degree = 1 / adjM_da2m.sum(axis=1)
adjM_da2m_normalized = sp.diags(inverse_degree).dot(csr_matrix(adjM_da2m)).tocsr()
director_actor_X = adjM_da2m_normalized.dot(movie_X)
# Stack the rows in node order: movies first, then directors and actors.
full_X = sp.vstack([movie_X, director_actor_X])

In [7]:
# Number of nodes of each type; used to slice the stacked matrices.
movieslen, directorlen, actorlen = len(movies), len(directors), len(actors)

In [8]:
# Split the stacked feature matrix into its per-node-type blocks:
# movie rows come first, then director rows, and the last `actorlen` rows
# belong to actors.
movie_feature = movie_X
director_feature = full_X[movieslen:movieslen + directorlen, :]
actor_feature = full_X[-actorlen:, :]

In [9]:
# Cut the movie-director and movie-actor blocks (in both directions) out of
# the full adjacency matrix and store them as sparse CSR matrices.
movie_span = slice(None, movieslen)
director_span = slice(movieslen, movieslen + directorlen)
actor_span = slice(-actorlen, None)

M_D = csr_matrix(adjM[movie_span, director_span])
M_A = csr_matrix(adjM[movie_span, actor_span])
D_M = csr_matrix(adjM[director_span, movie_span])
A_M = csr_matrix(adjM[actor_span, movie_span])

In [10]:
# Two-hop products of the incidence blocks: movie-director-movie,
# movie-actor-movie, director-movie-director and actor-movie-actor.
MDM = M_D @ D_M
MAM = M_A @ A_M
DMD = D_M @ M_D
AMA = A_M @ M_A

In [11]:
# Random selection of the train and validation movies.
# With a fixed seed, 300 movies are drawn without replacement from each class;
# the first 150 of every draw go to train and the remaining 150 to validation.
# All movies that were not drawn form the test set.
np.random.seed(20210521)
train_valid_movie_ac = list(np.random.choice(np.flatnonzero(movie_label == 0), 300, replace=False))
train_valid_movie_co = list(np.random.choice(np.flatnonzero(movie_label == 1), 300, replace=False))
train_valid_movie_dr = list(np.random.choice(np.flatnonzero(movie_label == 2), 300, replace=False))

train_movie_idx = np.sort(np.array(
    train_valid_movie_ac[:150] + train_valid_movie_co[:150] + train_valid_movie_dr[:150]))
train_movie_label = movie_label[train_movie_idx]

valid_movie_idx = np.sort(np.array(
    train_valid_movie_ac[150:] + train_valid_movie_co[150:] + train_valid_movie_dr[150:]))
valid_movie_label = movie_label[valid_movie_idx]

# Walking the indices in ascending order keeps the test indices sorted.
selected_idx = set(train_movie_idx) | set(valid_movie_idx)
test_movie_idx = np.array(
    [idx for idx in np.arange(movie_label.shape[0]) if idx not in selected_idx])
test_movie_idx.sort()
test_movie_label = movie_label[test_movie_idx]

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
# One-hot encode the movie labels; the encoder expects a 2-D column, so the
# label vector is reshaped to (n_movies, 1) first.
onehot_encoder = OneHotEncoder()
label_column = movie_label.reshape(-1, 1)
onehot_movie_label = onehot_encoder.fit_transform(label_column)

In [16]:
# Cast the index arrays and the one-hot label matrix to integers.
# The builtin `int` is used because the `np.int` alias (which simply pointed
# to builtin `int`) was removed in NumPy 1.24.
train_movie_idx = train_movie_idx.astype(int)
valid_movie_idx = valid_movie_idx.astype(int)
test_movie_idx = test_movie_idx.astype(int)
onehot_movie_label = onehot_movie_label.astype(int)

In [19]:
# Gather everything that is written to the output file: per-type features,
# incidence blocks, two-hop products, one-hot labels and the split indices.
new_IMDB = {
    'actor_feature': actor_feature,
    'movie_feature': movie_feature,
    'director_feature': director_feature,
    'MA': M_A,
    'MD': M_D,
    'MDM': MDM,
    'MAM': MAM,
    'AMA': AMA,
    'DMD': DMD,
    'movie_label': onehot_movie_label,
    'train_movie_idx': train_movie_idx,
    'val_movie_idx': valid_movie_idx,
    'test_movie_idx': test_movie_idx,
}

In [21]:
# Write the collected arrays to a MATLAB .mat file in the working directory.
sio.savemat(file_name='new_imdb.mat', mdict=new_IMDB)