## Connect to Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
path = 'drive/My Drive/Colab Notebooks/RecSys/'

## Libraries

In [24]:
!pip install rake_nltk
!pip install nagisa

import pandas as pd
import numpy as np
import nagisa
import nltk

from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


nagisa library is used for splitting Japanese text in order to achieve this project goal.

## Load the Dataset

Load the IMDB movie dataset obtained from [data.world](https://data.world/studentoflife/imdb-top-250-lists-and-5000-or-so-data-records) website.

In [6]:
df = pd.read_csv(path + '/data/IMDB_Top250movies2_OMDB_Detailed.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Title-JA,Year,Rated,Released,Runtime,Genre,Genre-JA,Director,...,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,1,The Shawshank Redemption,ショーシャンクの空に,1994,R,14 Oct 1994,142 min,"Crime, Drama",犯罪ドラマ,Frank Darabont,...,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,2,The Godfather,ゴッドファーザー,1972,R,24 Mar 1972,175 min,"Crime, Drama",犯罪ドラマ,Francis Ford Coppola,...,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,3,The Godfather: Part II,ゴッドファーザー：パートII,1974,R,20 Dec 1974,202 min,"Crime, Drama",犯罪ドラマ,Francis Ford Coppola,...,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,4,The Dark Knight,ダークナイト,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",アクション、犯罪、ドラマ,Christopher Nolan,...,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,5,12 Angry Men,12人の怒っている男性,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",犯罪ドラマ,Sidney Lumet,...,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


In [8]:
# Keep only necessary columns (Title, Genre, Director, Actors, Plot)
df_for_pre = df[['Title', 'Title-JA', 'Genre', 'Genre-JA', 'Director', 'Director-JA', 'Actors', 'Actors-JA', 'Plot', 'Plot-JA']]
df_for_pre.head()

Unnamed: 0,Title,Title-JA,Genre,Genre-JA,Director,Director-JA,Actors,Actors-JA,Plot,Plot-JA
0,The Shawshank Redemption,ショーシャンクの空に,"Crime, Drama",犯罪ドラマ,Frank Darabont,フランク・ダラボント,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",ティム・ロビンズ、モーガン・フリーマン、ボブ・ガントン、ウィリアム・サドラー,Two imprisoned men bond over a number of years...,2人の投獄された男性が何年にもわたって絆を結び、共通の良識の行為を通じて慰めと最終的なred...
1,The Godfather,ゴッドファーザー,"Crime, Drama",犯罪ドラマ,Francis Ford Coppola,フランシス・フォード・コッポラ,"Marlon Brando, Al Pacino, James Caan, Richard ...",マーロン・ブランド、アル・パチーノ、ジェームズ・カーン、リチャード・S・カステラーノ,The aging patriarch of an organized crime dyna...,組織化された犯罪王朝の老化した家長は、彼の秘密帝国の消極的な息子に支配権を譲渡します。
2,The Godfather: Part II,ゴッドファーザー：パートII,"Crime, Drama",犯罪ドラマ,Francis Ford Coppola,フランシス・フォード・コッポラ,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",アル・パチーノ、ロバート・デュバル、ダイアン・キートン、ロバート・デ・ニーロ,The early life and career of Vito Corleone in ...,1920年代のニューヨークのヴィトコルレオーネの初期の人生とキャリアは、息子のマイケルが家族...
3,The Dark Knight,ダークナイト,"Action, Crime, Drama",アクション、犯罪、ドラマ,Christopher Nolan,クリストファー・ノーラン,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",クリスチャン・ベール、ヒース・レジャー、アーロン・エックハート、マイケル・ケイン,When the menace known as the Joker emerges fro...,ジョーカーとして知られる脅威が彼の神秘的な過去から現れたとき、彼はゴッサムの人々に大混乱と混...
4,12 Angry Men,12人の怒っている男性,"Crime, Drama",犯罪ドラマ,Sidney Lumet,シドニールメット,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",マーティン・バルサム、ジョン・フィードラー、リー・J・コブ、例えばマーシャル,A jury holdout attempts to prevent a miscarria...,ju審員は、同僚に証拠を再考することを強制することにより、正義の流産を防止しようとします。


In [9]:
# Check entry types, and missing values
df_for_pre.info()
print('\nMissing values:  ', df_for_pre.isnull().sum().values.sum())
df_for_pre.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        250 non-null    object
 1   Title-JA     250 non-null    object
 2   Genre        250 non-null    object
 3   Genre-JA     250 non-null    object
 4   Director     250 non-null    object
 5   Director-JA  250 non-null    object
 6   Actors       250 non-null    object
 7   Actors-JA    250 non-null    object
 8   Plot         250 non-null    object
 9   Plot-JA      250 non-null    object
dtypes: object(10)
memory usage: 19.7+ KB

Missing values:   0


Title          0
Title-JA       0
Genre          0
Genre-JA       0
Director       0
Director-JA    0
Actors         0
Actors-JA      0
Plot           0
Plot-JA        0
dtype: int64

## Data Pre-processing

In [11]:
jpn_stop_words = ["あそこ","あっ","あの","あのかた","あの人","あり","あります","ある","あれ","い","いう","います","いる","う","うち","え","お","および","おり","おります","か","かつて","から","が","き","ここ","こちら","こと","この","これ","これら","さ","さらに","し","しかし","する","ず","せ","せる","そこ","そして","その","その他","その後","それ","それぞれ","それで","た","ただし","たち","ため","たり","だ","だっ","だれ","つ","て","で","でき","できる","です","では","でも","と","という","といった","とき","ところ","として","とともに","とも","と共に","どこ","どの","な","ない","なお","なかっ","ながら","なく","なっ","など","なに","なら","なり","なる","なん","に","において","における","について","にて","によって","により","による","に対して","に対する","に関する","の","ので","のみ","は","ば","へ","ほか","ほとんど","ほど","ます","また","または","まで","も","もの","ものの","や","よう","より","ら","られ","られる","れ","れる","を","ん","何","及び","彼","彼女","我々","特に","私","私達","貴方","貴方方"]

In [21]:
# Rake_JA
import string
import unicodedata

!pip install mecab-python3
import MeCab


class Rake_JA:
    def __init__(self):
        self.tagger = MeCab.Tagger("-Owakati")
    
    def remove_punctuation(self,text):
        text = unicodedata.normalize("NFKC", text)  # 全角記号をざっくり半角へ置換（でも不完全）
        # 記号を消し去るための魔法のテーブル作成
        table = str.maketrans("", "", string.punctuation  + "「」、。・※" + string.digits)
        text = text.translate(table)

        return text
        
    def get_word_score(self, word_list):
        freq = {}
        deg = {}

        for word in word_list:
            freq[word] = (freq.get(word) or 0) + 1
            deg[word] = (deg.get(word) or 0) + len(word) - 1 # word length must be > 1 to be considered as a Japanese 'word'
      
        scores = {}
        for word in word_list:
            scores[word] = deg[word]/freq[word]
        
        scores = {k:v for k, v in  sorted(scores.items(), key=lambda item: item[1], reverse=True)}
      
        return scores
    
    def get_keywords(self, text, limit=0):
        parsed_text = self.tagger.parse(text)
        raw_word_list = self.remove_punctuation(parsed_text).split()
        word_list = [word for word in raw_word_list if word not in jpn_stop_words ]
        
        score_list = self.get_word_score(word_list)
        
        if limit == 0:
            return list(score_list.keys())
        else:
            return list(score_list.keys())[:limit]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [25]:
# Remove punctuations from Plot
df_for_pre['Plot'] = df_for_pre['Plot'].str.replace('[^\w\s]','')
df_for_pre.head()

  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Title,Title-JA,Genre,Genre-JA,Director,Director-JA,Actors,Actors-JA,Plot,Plot-JA,Key_words
0,The Shawshank Redemption,ショーシャンクの空に,"Crime, Drama",犯罪ドラマ,Frank Darabont,フランク・ダラボント,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",ティム・ロビンズ、モーガン・フリーマン、ボブ・ガントン、ウィリアム・サドラー,Two imprisoned men bond over a number of years...,2人の投獄された男性が何年にもわたって絆を結び共通の良識の行為を通じて慰めと最終的なredい...,
1,The Godfather,ゴッドファーザー,"Crime, Drama",犯罪ドラマ,Francis Ford Coppola,フランシス・フォード・コッポラ,"Marlon Brando, Al Pacino, James Caan, Richard ...",マーロン・ブランド、アル・パチーノ、ジェームズ・カーン、リチャード・S・カステラーノ,The aging patriarch of an organized crime dyna...,組織化された犯罪王朝の老化した家長は彼の秘密帝国の消極的な息子に支配権を譲渡します,
2,The Godfather: Part II,ゴッドファーザー：パートII,"Crime, Drama",犯罪ドラマ,Francis Ford Coppola,フランシス・フォード・コッポラ,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",アル・パチーノ、ロバート・デュバル、ダイアン・キートン、ロバート・デ・ニーロ,The early life and career of Vito Corleone in ...,1920年代のニューヨークのヴィトコルレオーネの初期の人生とキャリアは息子のマイケルが家族の...,
3,The Dark Knight,ダークナイト,"Action, Crime, Drama",アクション、犯罪、ドラマ,Christopher Nolan,クリストファー・ノーラン,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",クリスチャン・ベール、ヒース・レジャー、アーロン・エックハート、マイケル・ケイン,When the menace known as the Joker emerges fro...,ジョーカーとして知られる脅威が彼の神秘的な過去から現れたとき彼はゴッサムの人々に大混乱と混乱...,
4,12 Angry Men,12人の怒っている男性,"Crime, Drama",犯罪ドラマ,Sidney Lumet,シドニールメット,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",マーティン・バルサム、ジョン・フィードラー、リー・J・コブ、例えばマーシャル,A jury holdout attempts to prevent a miscarria...,ju審員は同僚に証拠を再考することを強制することにより正義の流産を防止しようとします,


In [26]:
# Extract key words from Plot to a list
df_for_pre['Key_words'] = ''   # initializing a new column
r = Rake()   # use Rake to discard stop words (based on english stopwords from NLTK)

for index, row in df_for_pre.iterrows():
    r.extract_keywords_from_text(row['Plot'])   # to extract key words from Plot, default in lower case
    key_words_dict_scores = r.get_word_degrees()    # to get dictionary with key words and their scores
    row['Key_words'] = list(key_words_dict_scores.keys())   # to assign list of key words to new column

df_for_pre

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


LookupError: ignored