In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [3]:
# Load only the columns the recommender uses and discard rows with any missing value.
wanted_columns = ['title_id', 'title', 'genres', 'dir_list', 'cast_list', 'description']
df = pd.read_csv('movie_data_pop.csv')[wanted_columns].dropna()

# Keep a row only if its director field mentions 'Director', its cast field
# mentions 'Star', and its description is more than a bare newline.
has_director = df['dir_list'].str.contains('Director')
has_star = df['cast_list'].str.contains('Star')
has_description = df['description'] != '\n'

# Renumber rows 0..n-1 so that later positional row numbers match index labels.
df = df[has_director & has_star & has_description].reset_index(drop=True)

In [4]:
# Turn the comma-separated genre string into a list of genre names.
genres_trimmed = df['genres'].str.strip()
df['genres'] = genres_trimmed.str.split(', ')

In [None]:
def get_dir(dir_list):
    """Return every element of ``dir_list`` except the first.

    The first element is presumably a label such as 'Director:' (the loading
    step keeps only rows whose raw value contains 'Director') -- confirm
    against the data file.
    """
    names = dir_list[1:]
    return names

# Parse each stored list literal and drop its leading element in a single pass.
df['dir_list'] = df['dir_list'].apply(lambda raw: get_dir(literal_eval(raw)))

In [None]:
def get_cast(cast_list):
    """Return the cast names from ``cast_list`` without their trailing commas.

    The first element is skipped; it is presumably a label such as 'Stars:'
    (the loading step keeps only rows whose raw value contains 'Star') --
    confirm against the data file.

    A single trailing comma is removed from each remaining name. Names that
    merely contain a comma elsewhere (e.g. 'Sammy Davis, Jr.') are returned
    unchanged instead of losing their last character.
    """
    return [name[:-1] if name.endswith(',') else name for name in cast_list[1:]]

# Parse each stored list literal and clean up the cast names in a single pass.
df['cast_list'] = df['cast_list'].apply(lambda raw: get_cast(literal_eval(raw)))

In [None]:
def get_name(name_list):
    """Collapse each name into a single lowercase token.

    Spaces are removed and the result is lowercased, so 'Tom Hanks' becomes
    'tomhanks'. Returns a new list; ``name_list`` is not modified.
    """
    tokens = []
    for raw_name in name_list:
        squeezed = raw_name.replace(' ', '')
        tokens.append(squeezed.lower())
    return tokens

# Normalise every token column the same way (spaces removed, lowercased).
for column in ('genres', 'dir_list', 'cast_list'):
    df[column] = df[column].apply(get_name)

In [None]:
def get_soup(movie):
    """Join a movie's genre, director and cast tokens into one string.

    ``movie`` is anything indexable by the keys 'genres', 'dir_list' and
    'cast_list', each mapping to an iterable of strings. The tokens of each
    field are joined with single spaces, and the three fields are then joined
    with single spaces in that order.
    """
    pieces = []
    for field in ('genres', 'dir_list', 'cast_list'):
        pieces.append(' '.join(movie[field]))
    return ' '.join(pieces)

# Build the metadata "soup" string for every row.
df['soup'] = df.apply(get_soup, axis='columns')

In [None]:
# Characters removed from the text columns: , . ( ) ! : ? ; "
# The pattern is a regex character class, so regex=True must be passed
# explicitly: since pandas 2.0 Series.str.replace treats the pattern as a
# literal string by default, which would silently remove nothing.
PUNCTUATION_PATTERN = r'[,.()!:?;"]'

df['description'] = df['description'].str.replace(PUNCTUATION_PATTERN, '', regex=True).str.lower()
df['soup'] = df['soup'].str.replace(PUNCTUATION_PATTERN, '', regex=True).str.lower()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Two bag-of-words models, both ignoring English stop words:
# raw term counts for the metadata soup, TF-IDF weights for the description.
count = CountVectorizer(stop_words='english')
tfidf = TfidfVectorizer(stop_words='english')

count_matrix = count.fit_transform(df['soup'])
tfidf_matrix = tfidf.fit_transform(df['description'])

In [None]:
from sklearn.preprocessing import normalize
# Scale every row of the count matrix to unit L2 norm (normalize's defaults, spelled out).
count_matrix = normalize(count_matrix, norm='l2', axis=1)

In [None]:
from scipy.sparse import csr_matrix, hstack
# Place the metadata and description features side by side, then rescale each
# combined row to unit L2 norm.
total_matrix = normalize(
    hstack((count_matrix, tfidf_matrix), format='csr'),
    norm='l2',
    axis=1,
)

In [None]:
from sklearn.metrics.pairwise import linear_kernel
# For every movie i, record 16 row numbers and their similarities:
# slot 0 is always movie i itself, slots 1..15 are the most similar other movies.
movies, cosine_similarities = {}, {}
for i in range(len(df)):
    # total_matrix rows were L2-normalised above, so this dot product is the
    # cosine similarity between movie i and every movie.
    cosine_sim = linear_kernel(total_matrix[i], total_matrix)[0]

    # Stable descending order: tied scores keep ascending row order, the same
    # tie-breaking as sorted(..., reverse=True) on the enumerated scores.
    ranked = np.argsort(-cosine_sim, kind='stable')

    # Take the query row out of the ranking and pin it to slot 0. Otherwise an
    # earlier row with an identical feature vector (a tie at the top score)
    # would occupy slot 0, which downstream code labels as the movie itself.
    neighbours = ranked[ranked != i][:15]

    movies[i] = [i] + neighbours.tolist()
    cosine_similarities[i] = [cosine_sim[i]] + list(cosine_sim[neighbours])

In [None]:
# One row per movie and 16 columns: column 0 is the first entry of movies[i]
# (presumably the movie itself), columns 1..15 are its ranked neighbours.
row_labels = range(len(df))
rank_labels = ['rank_{}'.format(k) for k in range(1, 16)]

recom_titleid = pd.DataFrame(index=row_labels, columns=range(16))
recom_title = pd.DataFrame(index=row_labels, columns=range(16))
sim_record = pd.DataFrame(index=row_labels, columns=range(16))

for i in row_labels:
    neighbour_rows = movies[i]
    # df has a 0..n-1 index, so the stored row numbers are valid .loc labels.
    recom_titleid.loc[i] = df.loc[neighbour_rows, 'title_id'].to_numpy()
    recom_title.loc[i] = df.loc[neighbour_rows, 'title'].to_numpy()
    sim_record.loc[i] = cosine_similarities[i]

recom_titleid.columns = ['title_id'] + rank_labels
recom_title.columns = ['title'] + rank_labels
# NOTE(review): the first column of sim_record holds a similarity score but is
# labelled 'title'; kept as-is because the CSV header may be relied upon.
sim_record.columns = ['title'] + rank_labels

In [None]:
# Write each result table to its CSV file, without the index column.
csv_outputs = [
    (recom_titleid, 'recom_titleid.csv'),
    (recom_title, 'recom_title.csv'),
    (sim_record, 'sim_record.csv'),
    (df[['title_id', 'title']], 'id_title.csv'),
]
for frame, filename in csv_outputs:
    frame.to_csv(filename, index=False)