# Movie Recommender (Japanese)

In [1]:
import nagisa # A library for splitting Japanese text necessary to make countvectorizer work.
import nltk
import numpy as np
import pandas as pd


from rake_nltk import Rake # Used to extract English keywords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings("ignore")

To extract keywords from Japanese text, I use a customized RAKE (Rapid Automatic Keyword Extraction) implementation adapted from this [GitHub repository](https://github.com/carol975/RAKE_JPN).

In [2]:
# Japanese stop words (particles, auxiliaries, pronouns and other function words).
# Rake_JP.get_keywords drops any token that appears in this list before scoring.
jpn_stop_words = ["あそこ","あっ","あの","あのかた","あの人","あり","あります","ある","あれ","い","いう","います","いる","う","うち","え","お","および","おり","おります","か","かつて","から","が","き","ここ","こちら","こと","この","これ","これら","さ","さらに","し","しかし","する","ず","せ","せる","そこ","そして","その","その他","その後","それ","それぞれ","それで","た","ただし","たち","ため","たり","だ","だっ","だれ","つ","て","で","でき","できる","です","では","でも","と","という","といった","とき","ところ","として","とともに","とも","と共に","どこ","どの","な","ない","なお","なかっ","ながら","なく","なっ","など","なに","なら","なり","なる","なん","に","において","における","について","にて","によって","により","による","に対して","に対する","に関する","の","ので","のみ","は","ば","へ","ほか","ほとんど","ほど","ます","また","または","まで","も","もの","ものの","や","よう","より","ら","られ","られる","れ","れる","を","ん","何","及び","彼","彼女","我々","特に","私","私達","貴方","貴方方"]

In [3]:
# Customized RAKE algorithm
import MeCab
import string
import unicodedata


class Rake_JP:
    """Keyword extractor for Japanese text.

    Text is segmented with a MeCab tagger (created with ``-Owakati``), the
    tagger output is stripped of punctuation/digits and split on whitespace,
    stop words are dropped, and the remaining words are ranked by
    ``get_word_score``.
    """

    # Characters deleted by remove_punctuation: ASCII punctuation, a few
    # Japanese punctuation marks, and ASCII digits. Built once instead of on
    # every call.
    _STRIP_TABLE = str.maketrans("", "", string.punctuation  + "「」、。・※" + string.digits)

    def __init__(self, stop_words=None):
        """Create the MeCab tagger and fix the stop-word set.

        Parameters
        ----------
        stop_words : iterable of str, optional
            Words to discard before scoring. When omitted, the module-level
            ``jpn_stop_words`` list is used. The words are copied into a
            frozenset at construction time so membership tests are O(1).
        """
        self.tagger = MeCab.Tagger("-Owakati")
        if stop_words is None:
            stop_words = jpn_stop_words
        self.stop_words = frozenset(stop_words)

    def remove_punctuation(self,text):
        """Return `text` NFKC-normalized with punctuation and digits removed.

        Normalizing first folds full-width/half-width variants into their
        canonical form, so they are caught by the same deletion table.
        """
        return unicodedata.normalize("NFKC", text).translate(Rake_JP._STRIP_TABLE)

    def get_word_score(self, word_list):
        """Score each distinct word and return ``{word: score}`` sorted by score, descending.

        Every occurrence of a word adds 1 to its frequency and ``len(word) - 1``
        to its degree; the score is ``degree / frequency``. Because each
        occurrence contributes the same amount, the score works out to
        ``len(word) - 1``, i.e. longer words rank higher. Words with equal
        scores keep their order of first appearance (the sort is stable).
        """
        freq = {}
        deg = {}

        for word in word_list:
            freq[word] = freq.get(word, 0) + 1
            deg[word] = deg.get(word, 0) + len(word) - 1

        # `freq` preserves first-appearance order, which the stable sort keeps for ties.
        scores = {word: deg[word] / freq[word] for word in freq}

        return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    def get_keywords(self, text, limit=0):
        """Return the keywords of `text`, best-scoring first.

        Parameters
        ----------
        text : str
            Japanese text to analyse. Empty or ``None`` input yields ``[]``.
        limit : int, optional
            Maximum number of keywords to return; ``0`` (the default) returns
            all of them.

        Returns
        -------
        list of str
        """
        if not text:
            return []

        # Guard against a tagger that returns None instead of a string.
        parsed_text = self.tagger.parse(text) or ""
        raw_word_list = self.remove_punctuation(parsed_text).split()
        word_list = [word for word in raw_word_list if word not in self.stop_words]

        keywords = list(self.get_word_score(word_list))

        if not limit:
            return keywords
        return keywords[:limit]

## Load the dataset

The Top 250 IMDb movie dataset was obtained from the [data.world](https://data.world/studentoflife/imdb-top-250-lists-and-5000-or-so-data-records) website.
The dataset is originally in English; to build a Japanese recommender system, I translated the data (movie title, genre, director, actors, and plot) into Japanese using Google Sheets and its GOOGLETRANSLATE formula. Note that some movies were removed because their data could not be translated into Japanese.

In [4]:
# Read the movie table from CSV (decoded as UTF-8) and preview the first two rows
df = pd.read_csv(
    'data/IMDB_Top250movies2_OMDB_Detailed.csv',
    encoding="utf-8",
)
df.head(2)

Unnamed: 0.1,Unnamed: 0,Title,Title_JP,Genre,Genre_JP,Director,Director_JP,Actors,Actors_JP,Plot,Plot_JP
0,1,The Shawshank Redemption,ショーシャンクの空に,"Crime, Drama",犯罪ドラマ,Frank Darabont,フランク・ダラボント,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",ティム・ロビンズ、モーガン・フリーマン、ボブ・ガントン、ウィリアム・サドラー,Two imprisoned men bond over a number of years...,2人の投獄された男性が何年にもわたって絆を結び、共通の良識の行為を通じて慰めと最終的なred...
1,2,The Godfather,ゴッドファーザー,"Crime, Drama",犯罪ドラマ,Francis Ford Coppola,フランシス・フォード・コッポラ,"Marlon Brando, Al Pacino, James Caan, Richard ...",マーロン・ブランド、アル・パチーノ、ジェームズ・カーン、リチャード・S・カステラーノ,The aging patriarch of an organized crime dyna...,組織化された犯罪王朝の老化した家長は、彼の秘密帝国の消極的な息子に支配権を譲渡します。


In [5]:
# Keep only necessary columns (Title, Genre, Director, Actors, Plot — English and Japanese).
# .copy() makes df_cv an independent frame: the following cells assign new values to its
# columns, and without the copy those assignments would target a slice of df.
df_cv = df[['Title', 'Title_JP', 'Genre', 'Genre_JP', 'Director', 'Director_JP', 'Actors', 'Actors_JP', 'Plot', 'Plot_JP']].copy()
df_cv.head(2)

Unnamed: 0,Title,Title_JP,Genre,Genre_JP,Director,Director_JP,Actors,Actors_JP,Plot,Plot_JP
0,The Shawshank Redemption,ショーシャンクの空に,"Crime, Drama",犯罪ドラマ,Frank Darabont,フランク・ダラボント,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",ティム・ロビンズ、モーガン・フリーマン、ボブ・ガントン、ウィリアム・サドラー,Two imprisoned men bond over a number of years...,2人の投獄された男性が何年にもわたって絆を結び、共通の良識の行為を通じて慰めと最終的なred...
1,The Godfather,ゴッドファーザー,"Crime, Drama",犯罪ドラマ,Francis Ford Coppola,フランシス・フォード・コッポラ,"Marlon Brando, Al Pacino, James Caan, Richard ...",マーロン・ブランド、アル・パチーノ、ジェームズ・カーン、リチャード・S・カステラーノ,The aging patriarch of an organized crime dyna...,組織化された犯罪王朝の老化した家長は、彼の秘密帝国の消極的な息子に支配権を譲渡します。


In [6]:
# Show column dtypes and non-null counts, then the number of missing values per column
df_cv.info()
df_cv.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        185 non-null    object
 1   Title_JP     185 non-null    object
 2   Genre        185 non-null    object
 3   Genre_JP     185 non-null    object
 4   Director     185 non-null    object
 5   Director_JP  185 non-null    object
 6   Actors       185 non-null    object
 7   Actors_JP    185 non-null    object
 8   Plot         185 non-null    object
 9   Plot_JP      185 non-null    object
dtypes: object(10)
memory usage: 14.6+ KB


Title          0
Title_JP       0
Genre          0
Genre_JP       0
Director       0
Director_JP    0
Actors         0
Actors_JP      0
Plot           0
Plot_JP        0
dtype: int64

## Japanese Recommender

### Data pre-processing

In [7]:
# Remove remaining English (Latin) letters from the Japanese columns.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by
# default, in which case '[a-zA-Z]' would match nothing and the letters would stay.
for col in ['Title_JP', 'Genre_JP', 'Director_JP', 'Actors_JP', 'Plot_JP']:
    df_cv[col] = df_cv[col].str.replace(r'[a-zA-Z]', '', regex=True)

In [8]:
# Generate a new column holding the JP keywords obtained from the customized RAKE algorithm.
# The column is built with Series.map instead of assigning to the rows yielded by
# iterrows(): those rows can be copies, so writes to them are not guaranteed to reach df_cv.
r = Rake_JP()
df_cv['Key_words_JP'] = df_cv['Plot_JP'].map(r.get_keywords)

In [9]:
# Extract genre, first three main actors, and directors into lists, and merge
# first & last names into one word to create unique names.
# Done with Series.map rather than by writing into iterrows() rows, which may be
# copies whose modifications never reach df_cv.
def _split_names_jp(text, limit=None):
    """Split `text` on '、', keep the first `limit` items (all when None),
    and remove spaces and the '・' separator inside each item."""
    return [item.lower().replace(' ','').replace('・','') for item in text.split('、')[:limit]]


df_cv['Genre_JP'] = df_cv['Genre_JP'].map(_split_names_jp)
df_cv['Actors_JP'] = df_cv['Actors_JP'].map(lambda names: _split_names_jp(names, limit=3))
df_cv['Director_JP'] = df_cv['Director_JP'].map(_split_names_jp)

### Generate word representation for each movie

In [10]:
# Combine 'Genre_JP', 'Director_JP', 'Actors_JP', 'Key_words_JP' into one string per movie.
# Items are concatenated with no separator, exactly as before; the string is tokenized
# later by tokenize_jp.
# NOTE(review): with no separator, the tokenizer may re-split the names that were merged
# into single words in the previous step — confirm this is intended.
# The column is built in one pass instead of assigning to iterrows() rows, which may be
# copies whose modifications never reach df_cv.
columns = ['Genre_JP', 'Director_JP', 'Actors_JP', 'Key_words_JP']
df_cv['all_words_JP'] = [
    ''.join(''.join(items) for items in row)
    for row in df_cv[columns].itertuples(index=False, name=None)
]

# Create a new df from df_cv with only 3 columns 'Title', 'Title_JP', 'all_words_JP'
# (.copy() so df_jp is independent of df_cv)
df_jp = df_cv[['Title', 'Title_JP', 'all_words_JP']].copy()
df_jp.head(2)

Unnamed: 0,Title,Title_JP,all_words_JP
0,The Shawshank Redemption,ショーシャンクの空に,犯罪ドラマフランクダラボントティムロビンズモーガンフリーマンボブガントンわたっ見つけ投獄男性...
1,The Godfather,ゴッドファーザー,犯罪ドラマフランシスフォードコッポラマーロンブランドアルパチーノジェームズカーン組織犯罪王朝...


### Generate vector representation

In [12]:
def tokenize_jp(text):
    """Tokenize `text` with nagisa and return its words, leaving out tokens whose
    part-of-speech tag is 助詞, 補助記号 or 助動詞."""
    excluded_postags = ['助詞', '補助記号', '助動詞']
    return nagisa.filter(text, filter_postags=excluded_postags).words


# Count-vectorize the combined Japanese strings, using tokenize_jp as the tokenizer.
# fit_transform learns the vocabulary and returns the movie-by-token count matrix.
vectorizer_JP = CountVectorizer(tokenizer=tokenize_jp)
matrix_JP = vectorizer_JP.fit_transform(df_jp['all_words_JP'])
matrix_JP

<185x2578 sparse matrix of type '<class 'numpy.int64'>'
	with 4232 stored elements in Compressed Sparse Row format>

In [13]:
# Count matrix as a DataFrame: one row per movie, one column per vocabulary token.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its
# replacement. The fallback keeps the cell working on versions that predate it.
if hasattr(vectorizer_JP, 'get_feature_names_out'):
    feature_names_JP = vectorizer_JP.get_feature_names_out()
else:
    feature_names_JP = vectorizer_JP.get_feature_names()

df_word_JP = pd.DataFrame(matrix_JP.toarray(), columns=feature_names_JP)
df_word_JP.head(5)

Unnamed: 0,​,いか,いく,うっかり,うつ病,うまく,おそらく,おもちゃ,かかし,かけ,...,高橋,魅惑,魔法,魔術,麻痺,麻薬,黒,黒人,黒澤,黙示
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Pairwise cosine similarity between all movie count vectors.
# similarity = cos(angle): ranges from 0 (different) to 1 (similar).
# With a single argument, the matrix is compared against itself.
cosine_sim_JP = cosine_similarity(matrix_JP)
print(cosine_sim_JP)

[[1.         0.13650473 0.09100315 ... 0.0952381  0.         0.05006262]
 [0.13650473 1.         0.26086957 ... 0.09100315 0.         0.        ]
 [0.09100315 0.26086957 1.         ... 0.09100315 0.         0.        ]
 ...
 [0.0952381  0.09100315 0.09100315 ... 1.         0.         0.05006262]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.05006262 0.         0.         ... 0.05006262 0.         1.        ]]


In [15]:
# Series of movie titles; its index gives each movie's row number in the similarity matrices.
# The English title is the lookup key for both recommenders, so the results obtained from
# English text and from Japanese text can be compared directly.
indices = pd.Series(df_jp['Title'])
indices.head(5)

0    The Shawshank Redemption
1               The Godfather
2      The Godfather: Part II
3             The Dark Knight
4                12 Angry Men
Name: Title, dtype: object

### Implement the recommender system

In [17]:
# The function takes in a movie title as an input and returns the top 10 recommended (similar) movies based on content-based filtering

def recommend_JP(title, cosine_sim = cosine_sim_JP):
    """Return the English titles of the 10 movies most similar to `title`.

    Parameters
    ----------
    title : str
        English movie title, matched exactly against the `indices` series.
    cosine_sim : array-like of shape (n_movies, n_movies)
        Pairwise similarity matrix whose rows follow the order of `indices`.
        Defaults to `cosine_sim_JP` (similarities computed from Japanese text).

    Returns
    -------
    list of str
        Up to 10 titles, most similar first. The queried movie is never included.

    Raises
    ------
    IndexError
        If `title` does not occur in `indices` (same exception type as before,
        now with an explanatory message).
    """
    matches = np.flatnonzero((indices == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {title!r}")
    idx = matches[0]   # row position of the first movie whose title matches

    # Use the matrix passed by the caller (the argument used to be ignored in favour
    # of the global cosine_sim_JP).
    score_series = pd.Series(np.asarray(cosine_sim)[idx]).sort_values(ascending = False)

    # Remove the queried movie by position instead of assuming it sorts first: another
    # movie with similarity 1.0 could otherwise take its place and the query itself
    # would be returned as a recommendation.
    top_10_indices = score_series.drop(idx).index[:10]

    return indices.iloc[top_10_indices].tolist()

## English Recommender

### Data pre-processing

In [18]:
# Remove punctuation from Plot (every character that is neither a word character nor whitespace).
# The raw string keeps \w and \s as regex escapes without invalid-escape warnings, and
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal by default.
df_cv['Plot'] = df_cv['Plot'].str.replace(r'[^\w\s]', '', regex=True)

In [19]:
# Generate a new column holding the EN keywords obtained from RAKE.
# The column is built with Series.map instead of assigning to the rows yielded by
# iterrows(): those rows can be copies, so writes to them are not guaranteed to reach df_cv.
r = Rake()


def _extract_keywords_en(plot):
    """Run RAKE on one plot string and return the words it assigned a degree to."""
    r.extract_keywords_from_text(plot)
    # get_word_degrees() maps each word to its degree; only the words are kept
    return list(r.get_word_degrees().keys())


df_cv['Key_words'] = df_cv['Plot'].map(_extract_keywords_en)

In [20]:
# Extract genre, first three main actors, and directors into lists, and merge
# first & last name into one lowercase word to create unique names.
# Done with Series.map rather than by writing into iterrows() rows, which may be
# copies whose modifications never reach df_cv.
def _split_names_en(text, limit=None):
    """Split `text` on ',', keep the first `limit` items (all when None),
    then lowercase each item and remove its spaces."""
    return [item.lower().replace(' ','') for item in text.split(',')[:limit]]


df_cv['Genre'] = df_cv['Genre'].map(_split_names_en)
df_cv['Actors'] = df_cv['Actors'].map(lambda names: _split_names_en(names, limit=3))
df_cv['Director'] = df_cv['Director'].map(_split_names_en)

### Generate word representation

In [21]:
# Combine 'Genre', 'Director', 'Actors', 'Key_words' into one space-separated string per movie.
# The column is built in one pass instead of assigning to iterrows() rows, which may be
# copies whose modifications never reach df_cv.
columns = ['Genre', 'Director', 'Actors', 'Key_words']


def _join_words_en(row):
    """Join every item of every list in `row` with single spaces."""
    merged = ' '.join(' '.join(items) for items in row)
    # split()/join trims the ends and collapses the runs of spaces left by empty lists
    return ' '.join(merged.split())


df_cv['all_words_EN'] = [
    _join_words_en(row) for row in df_cv[columns].itertuples(index=False, name=None)
]

# (.copy() so df_en is independent of df_cv)
df_en = df_cv[['Title','all_words_EN']].copy()
df_en.head(2)

Unnamed: 0,Title,all_words_EN
0,The Shawshank Redemption,crime drama frankdarabont timrobbins morganfre...
1,The Godfather,crime drama francisfordcoppola marlonbrando al...


### Create vector representation

In [22]:
# Generate the count matrix: CountVectorizer with its default settings learns the
# vocabulary of the combined English strings and returns the movie-by-token counts.
vectorizer_EN = CountVectorizer()
matrix_EN = vectorizer_EN.fit_transform(df_en['all_words_EN'])
matrix_EN

<185x2384 sparse matrix of type '<class 'numpy.int64'>'
	with 3897 stored elements in Compressed Sparse Row format>

In [24]:
# Pairwise cosine similarity between all movie count vectors.
# similarity = cos(angle): ranges from 0 (different) to 1 (similar).
# With a single argument, the matrix is compared against itself.
cosine_sim_EN = cosine_similarity(matrix_EN)
print(cosine_sim_EN)

[[1.         0.15789474 0.13764944 ... 0.10814761 0.         0.05735393]
 [0.15789474 1.         0.36706517 ... 0.16222142 0.         0.05735393]
 [0.13764944 0.36706517 1.         ... 0.14142136 0.         0.05      ]
 ...
 [0.10814761 0.16222142 0.14142136 ... 1.         0.         0.05892557]
 [0.         0.         0.         ... 0.         1.         0.0521286 ]
 [0.05735393 0.05735393 0.05       ... 0.05892557 0.0521286  1.        ]]


### Implement recommendation model

In [26]:
# this function takes in a movie title as input and returns the top 10 recommended (similar) movies

def recommend_EN(title, cosine_sim = cosine_sim_EN):
    """Return the English titles of the 10 movies most similar to `title`.

    Parameters
    ----------
    title : str
        English movie title, matched exactly against the `indices` series.
    cosine_sim : array-like of shape (n_movies, n_movies)
        Pairwise similarity matrix whose rows follow the order of `indices`.
        Defaults to `cosine_sim_EN` (similarities computed from English text).

    Returns
    -------
    list of str
        Up to 10 titles, most similar first. The queried movie is never included.

    Raises
    ------
    IndexError
        If `title` does not occur in `indices` (same exception type as before,
        now with an explanatory message).
    """
    matches = np.flatnonzero((indices == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {title!r}")
    idx = matches[0]   # row position of the first movie whose title matches

    # Use the matrix passed by the caller (the argument used to be ignored in favour
    # of the global cosine_sim_EN).
    score_series = pd.Series(np.asarray(cosine_sim)[idx]).sort_values(ascending = False)

    # Remove the queried movie by position instead of assuming it sorts first: another
    # movie with similarity 1.0 could otherwise take its place and the query itself
    # would be returned as a recommendation.
    top_10_indices = score_series.drop(idx).index[:10]

    # Titles are read from `indices`, the same series used for the lookup, so the
    # positions always refer to the rows the similarity matrix was built from.
    return indices.iloc[top_10_indices].tolist()

## Compare recommended results

In [32]:
# Same query against both recommenders: Japanese-text results first, English-text results second
query_title = 'The Dark Knight'
print(recommend_JP(query_title), '\n', '\n', recommend_EN(query_title))

['The Dark Knight Rises', 'Batman Begins', 'The Green Mile', 'Rashomon', 'Witness for the Prosecution', 'Rush', 'The Prestige', 'The Godfather', 'V for Vendetta', 'The 400 Blows'] 
 
 ['Batman Begins', 'The Dark Knight Rises', 'The Green Mile', 'Dangal', 'The Prestige', 'The 400 Blows', 'Rashomon', 'The Shawshank Redemption', 'Baby Driver', 'La Haine']


In [33]:
# Same query against both recommenders: Japanese-text results first, English-text results second
query_title = 'The Green Mile'
print(recommend_JP(query_title), '\n', '\n', recommend_EN(query_title))

['To Kill a Mockingbird', 'Cool Hand Luke', 'Witness for the Prosecution', 'Rashomon', 'The Dark Knight', 'La Haine', 'Dog Day Afternoon', 'The Shawshank Redemption', 'The Godfather', 'Touch of Evil'] 
 
 ['The Shawshank Redemption', 'La Haine', 'Harry Potter and the Deathly Hallows: Part 2', 'The 400 Blows', "Pan's Labyrinth", 'The Dark Knight', 'The Silence of the Lambs', 'Spotlight', 'Touch of Evil', 'Shutter Island']


In [35]:
# Same query against both recommenders: Japanese-text results first, English-text results second
query_title = 'Batman Begins'
print(recommend_JP(query_title), '\n', '\n', recommend_EN(query_title))

['The Dark Knight Rises', 'The Dark Knight', 'Star Wars: Episode V - The Empire Strikes Back', 'The Prestige', 'Inception', 'Raiders of the Lost Ark', 'Interstellar', 'Yojimbo', 'Gladiator', 'The General'] 
 
 ['The Dark Knight', 'The Dark Knight Rises', 'The Prestige', 'Yojimbo', 'Baby Driver', 'Reservoir Dogs', 'The Usual Suspects', 'The Wolf of Wall Street', 'Heat', 'Touch of Evil']
