# DATA MINING FOR MOVIE RECOMMENDER SYSTEM USING CONTENT BASED FILTERING

IMPORTING LIBRARIES

In [226]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from gensim.models import Word2Vec

DATA EXPLORATORY

In [227]:
#tahap awal adalah meload dataset yang digunakan
movies_data = pd.read_csv('./dataset/movies.csv', usecols=['movieId', 'title', 'genres'])
links_data = pd.read_csv('./dataset/links.csv')
ratings_data = pd.read_csv('./dataset/ratings.csv')
tags_data = pd.read_csv('./dataset/tags.csv')


1.Dataset Movies

In [228]:
#sampel movies.csv
movies_data.head(10)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [229]:
#fungsi ini untuk mencheck jumlah baris dan kolom pada dataset
movies_data.shape


(9742, 3)

In [230]:
#Kemudian akan dilakukan pengecekan data type setiap kolom
movies_data.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [231]:
#kita akan melakukan pengecekan apakah ada missing values pada dataset movies.csv
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [232]:
#Selanjutnya kita akan melakukan pengecekan apakah ada data ganda (duplicated)
movies_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9737    False
9738    False
9739    False
9740    False
9741    False
Length: 9742, dtype: bool

2.Ratings Dataset

In [233]:
#sampel ratings.csv
ratings_data.sample(10)


Unnamed: 0,userId,movieId,rating,timestamp
36808,249,31696,4.0,1355366599
28904,199,8984,3.0,1113258696
39832,274,3882,2.5,1171492360
25332,177,5264,2.0,1435837555
2588,19,1722,3.0,965711128
4085,27,801,2.0,962685525
68609,447,349,4.0,836960630
64236,414,7139,4.0,1102694701
8701,59,2174,2.0,953610692
8360,57,2300,4.0,969753201


In [234]:
#Mencheck jumlah baris dan kolom dataset Ratings
ratings_data.shape

(100836, 4)

In [235]:
#Mencheck tipe data setiap kolom
ratings_data.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [236]:
#Mencheck apakah terdapat missing values
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [237]:
#Mencheck apakah terdapat data ganda (duplicated)
ratings_data.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
100831    False
100832    False
100833    False
100834    False
100835    False
Length: 100836, dtype: bool

3.Dataset links

In [238]:
#Sampel links.csv
links_data.sample(10)

Unnamed: 0,movieId,imdbId,tmdbId
3589,4920,35140,32847.0
4954,7482,70034,9461.0
7907,95170,118692,13313.0
2393,3175,177789,926.0
5366,8950,361862,4553.0
7496,82854,1320261,38745.0
6981,66783,758746,13207.0
1333,1805,120890,617.0
2223,2952,119256,8052.0
7911,95199,1586265,76494.0


In [239]:
#Mencheck jumlah baris dan kolom dataset links
links_data.shape

(9742, 3)

In [240]:
#Mencheck tipe data setiap kolom
links_data.dtypes

movieId      int64
imdbId       int64
tmdbId     float64
dtype: object

In [241]:
#Mencheck apakah terdapat missing values
links_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [242]:
#Selanjutnya kita akan melakukan pengecekan apakah ada data ganda (duplicated)
links_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9737    False
9738    False
9739    False
9740    False
9741    False
Length: 9742, dtype: bool

4.Dataset Tags

In [243]:
#Sampel tags.csv
tags_data.sample(10)

Unnamed: 0,userId,movieId,tag,timestamp
3421,599,296,cult film,1498456345
374,62,158966,individualism,1526076344
3648,599,2959,mindfuck,1498456912
2095,474,6238,immigrants,1138307096
916,424,66097,Tim Burton,1457844494
2904,567,1203,good dialogue,1525283605
1277,474,1080,Bible,1137375841
3345,573,35836,hilarious,1186589105
3317,567,180985,bad music,1525285320
2761,477,79702,video games,1282923860


In [244]:
#Mencheck jumlah baris dan kolom dataset tags
tags_data.shape

(3683, 4)

In [245]:
#Mencheck tipe data setiap kolom
tags_data.dtypes

userId        int64
movieId       int64
tag          object
timestamp     int64
dtype: object

In [246]:
#Mencheck apakah terdapat missing values
tags_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [247]:
#Selanjutnya kita akan melakukan pengecekan apakah ada data ganda (duplicated)
links_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9737    False
9738    False
9739    False
9740    False
9741    False
Length: 9742, dtype: bool

Pada projek ini konten yang digunakan pada CONTENT BASED FILTERING ini adalah GENRE FILM

## Data Preproccessing

In [248]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Pada dataset ini, kami menemukan bahwa dataset yang digunakan sudah cukup clean, dan tidak kmi temukan noise, jadi tidak perlu lagi melakukan punctutation removal, lowercasing, stop removal,stemming, dll

In [194]:
genre_labels = set()
for i in movies_data['genres'].str.split(' ').values:
    genre_labels = genre_labels.union(set(i))

# Fungsi untuk menghitung frekuensi kata kunci pada genre film yang muncul dan dimasukkan kedalam dictionary
def count_freq(dataset, ref_col, genreLabel):
    keyword_count = dict()
    for j in genreLabel:
         keyword_count[j] = 0
    for genreLabel_word in dataset[ref_col].str.split(' '):
        if type(genreLabel_word) == float and pd.isnull(genreLabel_word):
            continue
        for j in [j for j in genreLabel_word if j in genreLabel]:
            if pd.notnull(j):
                 keyword_count[j] += 1
                    #mengconvert dictionary kedalam list untuk meng sort keyword berdasarkan frequensi
    word_frequency = []
    for word, freq in keyword_count.items():
        word_frequency.append([word, freq])
    word_frequency.sort(key = lambda x:x[1], reverse = True)
    return word_frequency, keyword_count

word_frequency, dum = count_freq(movies_data, 'genres', genre_labels)
word_frequency[:5]


[['Drama', 1053],
 ['Comedy', 946],
 ['Comedy|Drama', 435],
 ['Comedy|Romance', 363],
 ['Drama|Romance', 349]]

In [203]:
#Mengubah genre string menjadi string array
mov_ratings_data['genres'] =mov_ratings_data['genres'].str.split('|')
#Mengkonversi genre menjadi string value
mov_ratings_data['genres'] = mov_ratings_data['genres'].fillna("").astype('str')


## Feature Extraction

In [204]:
# Feature Extraction menggunakan tf-id
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies_data['genres'])
tfidf_matrix

<9742x177 sparse matrix of type '<class 'numpy.float64'>'
	with 36628 stored elements in Compressed Sparse Row format>

In [205]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]


array([[1.        , 0.31379419, 0.0611029 , 0.05271111],
       [0.31379419, 1.        , 0.        , 0.        ],
       [0.0611029 , 0.        , 1.        , 0.35172407],
       [0.05271111, 0.        , 0.35172407, 1.        ]])

In [254]:
# Membuat 1-dimensional array dengan movie title
titles = movies_data['title']
indices = pd.Series(movies_data.index, index=movies_data['title'])

## Conten Based Filtering

In [252]:
#Fungsi yang rekomendasi sistem berdasarkan skor cosine similiarity 
def genre_recommendations(title):
    idx = indices[title]
    simliarity_scores = list(enumerate(cosine_sim[idx]))
    simliarity_scores = sorted(simliarity_scores, key=lambda x: x[1], reverse=True)
    simliarity_scores = simliarity_scores[1:21]
    movie_indices = [i[0] for i in simliarity_scores]
    return titles.iloc[movie_indices]

Cara menggunakan sistem ini yaitu, dengan memasukkan judul (title) film, kemudian sistem akan menfilter dan memberikan list film yang memiliki genre yang mirip (similiar)

In [253]:
 genre_recommendations('Toy Story (1995)').head(10)

1706                                          Antz (1998)
2355                                   Toy Story 2 (1999)
2809       Adventures of Rocky and Bullwinkle, The (2000)
3000                     Emperor's New Groove, The (2000)
3568                                Monsters, Inc. (2001)
6194                                     Wild, The (2006)
6486                               Shrek the Third (2007)
6948                       Tale of Despereaux, The (2008)
7760    Asterix and the Vikings (Astérix et les Viking...
8219                                         Turbo (2013)
Name: title, dtype: object

Seperti yang dapat kita lihat ketika kita memasukkan, judul film "Toy Story ( 1995)" maka sistem memberikan list rekomendasi film yang memiliki genre yang hampir sama.