In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [2]:
# Load the five columns the recommender uses, discard rows with any missing
# value, keep only rows whose raw credit strings mention both 'Director' and
# 'Star', and renumber the surviving rows 0..n-1.
df = (
    pd.read_csv('movie_data.csv')[['title_id', 'title', 'genres', 'dir_list', 'cast_list']]
    .dropna()
    .loc[lambda frame: frame['dir_list'].str.contains('Director')]
    .loc[lambda frame: frame['cast_list'].str.contains('Star')]
    .reset_index(drop=True)
)

In [3]:
# Turn each movie's ', '-separated genre string into a list of genre names
# (outer whitespace is stripped before splitting).
df['genres'] = (
    df['genres']
    .str.strip()
    .str.split(pat=', ')
)

In [4]:
def get_dir(dir_list):
    """Return ``dir_list`` without its first element.

    The first entry is presumably the 'Director(s)' label of the raw credit
    list (TODO confirm against the CSV); everything after it is returned
    unchanged. An empty or one-element input yields an empty sequence.
    """
    label_count = 1
    return dir_list[label_count:]

# Each cell holds a stringified Python list: parse it safely with literal_eval,
# then drop its leading entry via get_dir.
df['dir_list'] = df['dir_list'].apply(lambda raw: get_dir(literal_eval(raw)))

In [5]:
def get_cast(cast_list):
    """Return up to nine cast names from ``cast_list`` with trailing commas removed.

    The first element is skipped (presumably a 'Star(s)' label -- TODO confirm
    against the CSV) and only elements 1..9 are used, so at most nine names
    are returned.

    A name is trimmed only when it actually ends with a comma. Testing for a
    comma anywhere in the name would chop the last real character off a name
    that contains an internal comma but no trailing one (e.g. 'Sammy Davis, Jr.').
    """
    return [name[:-1] if name.endswith(',') else name for name in cast_list[1:10]]

# Each cell holds a stringified Python list: parse it safely with literal_eval,
# then let get_cast pick and clean the cast names.
df['cast_list'] = df['cast_list'].apply(lambda raw: get_cast(literal_eval(raw)))

In [6]:
def get_name(name_list):
    """Collapse every name in ``name_list`` into a single lowercase token.

    Spaces and hyphens are deleted and the result is lowercased, so that
    'Jean-Claude Van Damme' becomes 'jeanclaudevandamme'. Returns a new list
    in the same order as the input.
    """
    # Translation table that deletes ' ' and '-' and maps nothing else.
    drop_chars = str.maketrans('', '', ' -')
    tokens = []
    for raw_name in name_list:
        tokens.append(raw_name.translate(drop_chars).lower())
    return tokens

# Normalise the token lists of all three feature columns so every genre or
# person becomes one lowercase token without spaces or hyphens.
for name_column in ('genres', 'dir_list', 'cast_list'):
    df[name_column] = df[name_column].apply(get_name)

In [7]:
def get_soup(movie):
    """Build the space-separated feature string ("soup") for one movie.

    ``movie`` is indexed by field name; the token lists under 'genres',
    'dir_list' and 'cast_list' are each joined with single spaces, and the
    three resulting pieces are joined with single spaces in that order.
    """
    pieces = []
    for field in ('genres', 'dir_list', 'cast_list'):
        pieces.append(' '.join(movie[field]))
    return ' '.join(pieces)

# Apply get_soup to every row to produce one feature string per movie.
df['soup'] = df.apply(get_soup, axis='columns')

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
# Token-count vectoriser over the soup strings, using scikit-learn's built-in
# English stop-word list.
count = CountVectorizer(stop_words='english')
# Learn the vocabulary and produce the movie-by-token count matrix in one pass.
count_matrix = count.fit_transform(raw_documents=df['soup'])

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
# memo[i] -> row positions for movie i: the movie itself first, followed by
# its 15 most similar *other* movies in descending cosine similarity.
memo = {}
for i in range(len(df)):
    # Compare one row against all rows at a time so the full n x n similarity
    # matrix never has to be held in memory.
    cosine_sim = cosine_similarity(count_matrix[i], count_matrix)[0]
    # Stable sort on the negated scores: descending similarity, ties kept in
    # ascending row order (the same ranking a stable reverse sort produces).
    ranked = np.argsort(-cosine_sim, kind='stable')
    # Put the movie itself in slot 0 explicitly. Relying on the sort is wrong
    # when another movie ties with it (identical soup, or an all-zero vector
    # whose similarity to everything is 0): the lower row index would win and
    # slot 0 -- exported as the movie's own title_id/title -- would be wrong.
    memo[i] = [i] + ranked[ranked != i][:15].tolist()

In [18]:
# Build both recommendation tables in one vectorised step instead of filling
# pre-allocated object frames row by row with .loc.
# memo[i] is the ranked list of row positions for movie i; its first entry is
# exported under the 'title_id' / 'title' column, the rest as rank_1, rank_2, ...
# The width comes from memo rather than a hard-coded 16, so a catalogue with
# fewer than 16 movies no longer fails on a length mismatch.
n_slots = len(memo[0]) if memo else 16
neighbour_pos = np.array([memo[i] for i in range(len(df))], dtype=int).reshape(len(df), n_slots)
rank_labels = ['rank_{}'.format(i) for i in range(1, n_slots)]
# memo stores positions, so index the underlying arrays positionally; the 2-D
# integer index yields one output row per movie.
recom_titleid = pd.DataFrame(df['title_id'].to_numpy()[neighbour_pos], columns = ['title_id'] + rank_labels)
recom_title = pd.DataFrame(df['title'].to_numpy()[neighbour_pos], columns = ['title'] + rank_labels)

In [19]:
# Write both recommendation tables to CSV without the row index.
recom_titleid.to_csv(path_or_buf='recom_titleid.csv', index=False)
recom_title.to_csv(path_or_buf='recom_title.csv', index=False)