# DATA MINING FOR MOVIE RECOMMENDER SYSTEM USING CONTENT BASED FILTERING

IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


DATA EXPLORATORY

In [2]:
#tahap awal adalah meload dataset yang digunakan
movies_data = pd.read_csv('./dataset/movies.csv', usecols=['movieId', 'title', 'genres'])
links_data = pd.read_csv('./dataset/links.csv')
ratings_data = pd.read_csv('./dataset/ratings.csv')
tags_data = pd.read_csv('./dataset/tags.csv')


1.Dataset Movies

In [3]:
#sampel movies.csv
movies_data.head(10)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
#fungsi ini untuk mencheck jumlah baris dan kolom pada dataset
movies_data.shape


(9742, 3)

In [5]:
#Kemudian akan dilakukan pengecekan data type setiap kolom
movies_data.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [6]:
#kita akan melakukan pengecekan apakah ada missing values pada dataset movies.csv
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [7]:
#Selanjutnya kita akan melakukan pengecekan apakah ada data ganda (duplicated)
movies_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9737    False
9738    False
9739    False
9740    False
9741    False
Length: 9742, dtype: bool

2.Ratings Dataset

In [8]:
#sampel ratings.csv
ratings_data.sample(10)


Unnamed: 0,userId,movieId,rating,timestamp
71853,462,8464,3.0,1201445840
98612,607,2737,4.0,964744628
36299,247,91529,4.0,1467644141
92538,597,2640,4.0,941640505
91693,594,7023,4.5,1108972356
52273,339,75985,4.5,1460346679
7215,50,5669,2.5,1514238595
47174,307,3270,2.0,1186172881
5100,33,994,5.0,939646939
90123,586,5882,3.5,1529899805


In [9]:
#Mencheck jumlah baris dan kolom dataset Ratings
ratings_data.shape

(100836, 4)

In [10]:
#Mencheck tipe data setiap kolom
ratings_data.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [11]:
#Mencheck apakah terdapat missing values
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [12]:
#Mencheck apakah terdapat data ganda (duplicated)
ratings_data.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
100831    False
100832    False
100833    False
100834    False
100835    False
Length: 100836, dtype: bool

3.Dataset links

In [13]:
#Sampel links.csv
links_data.sample(10)

Unnamed: 0,movieId,imdbId,tmdbId
2564,3431,82250,14373.0
2553,3418,103074,1541.0
5955,34437,412019,308.0
2467,3284,67848,27841.0
3335,4518,95488,11347.0
8209,103539,1714206,157386.0
650,838,116191,3573.0
9579,174815,4481514,345914.0
9408,165103,2387499,331313.0
777,1019,46672,173.0


In [14]:
#Mencheck jumlah baris dan kolom dataset links
links_data.shape

(9742, 3)

In [15]:
#Mencheck tipe data setiap kolom
links_data.dtypes

movieId      int64
imdbId       int64
tmdbId     float64
dtype: object

In [16]:
#Mencheck apakah terdapat missing values
links_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [17]:
#Selanjutnya kita akan melakukan pengecekan apakah ada data ganda (duplicated)
links_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9737    False
9738    False
9739    False
9740    False
9741    False
Length: 9742, dtype: bool

4.Dataset Tags

In [18]:
#Sampel tags.csv
tags_data.sample(10)

Unnamed: 0,userId,movieId,tag,timestamp
1568,474,2108,weather forecaster,1138032081
471,125,100083,sarcasm,1474377015
1806,474,3481,Nick Hornby,1137181093
121,62,27660,sci-fi,1525554491
821,424,2700,parody,1457844393
3019,567,4878,psychological,1525282595
497,184,2579,Christopher Nolan,1537094326
3491,599,296,neo-noir,1498456486
571,305,4995,mathematics,1464428783
677,357,48516,undercover cop,1348627156


In [19]:
#Mencheck jumlah baris dan kolom dataset tags
tags_data.shape

(3683, 4)

In [20]:
#Mencheck tipe data setiap kolom
tags_data.dtypes

userId        int64
movieId       int64
tag          object
timestamp     int64
dtype: object

In [21]:
#Mencheck apakah terdapat missing values
tags_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [22]:
#Selanjutnya kita akan melakukan pengecekan apakah ada data ganda (duplicated)
links_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9737    False
9738    False
9739    False
9740    False
9741    False
Length: 9742, dtype: bool

Pada projek ini konten yang digunakan pada CONTENT BASED FILTERING ini adalah GENRE FILM

## Data Preproccessing

In [23]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Pada dataset ini, kami menemukan bahwa dataset yang digunakan sudah cukup clean, dan tidak kmi temukan noise, jadi tidak perlu lagi melakukan punctutation removal, lowercasing, stop removal,stemming, dll

In [32]:
genre_labels = set()
for i in movies_data['genres'].str.split(' ').values:
    genre_labels = genre_labels.union(set(i))

# Fungsi untuk menghitung frekuensi kata kunci pada genre film yang muncul dan dimasukkan kedalam dictionary
def count_freq(dataset, ref_col, genreLabel):
    keyword_count = dict()
    for j in genreLabel:
         keyword_count[j] = 0
    for genreLabel_word in dataset[ref_col].str.split(' '):
        if type(genreLabel_word) == float and pd.isnull(genreLabel_word):
            continue
        for j in [j for j in genreLabel_word if j in genreLabel]:
            if pd.notnull(j):
                 keyword_count[j] += 1
                    #mengconvert dictionary kedalam list untuk meng sort keyword berdasarkan frequensi
    word_frequency = []
    for word, freq in keyword_count.items():
        word_frequency.append([word, freq])
    word_frequency.sort(key = lambda x:x[1], reverse = True)
    return word_frequency, keyword_count

word_frequency, dum = count_freq(movies_data, 'genres', genre_labels)
word_frequency[:5]


[['Drama', 1053],
 ['Comedy', 946],
 ['Comedy|Drama', 435],
 ['Comedy|Romance', 363],
 ['Drama|Romance', 349]]

In [34]:
#Mengubah genre string menjadi string array
movies_data['genres'] =movies_data['genres'].str.split('|')
#Mengkonversi genre menjadi string value
movies_data['genres'] =movies_data['genres'].fillna("").astype('str')


## Feature Extraction

In [26]:
# Feature Extraction menggunakan tf-id
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies_data['genres'])
tfidf_matrix

<9742x177 sparse matrix of type '<class 'numpy.float64'>'
	with 36628 stored elements in Compressed Sparse Row format>

In [27]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]


array([[1.        , 0.31379419, 0.0611029 , 0.05271111],
       [0.31379419, 1.        , 0.        , 0.        ],
       [0.0611029 , 0.        , 1.        , 0.35172407],
       [0.05271111, 0.        , 0.35172407, 1.        ]])

In [28]:
# Membuat 1-dimensional array dengan movie title
titles = movies_data['title']
indices = pd.Series(movies_data.index, index=movies_data['title'])

## Conten Based Filtering

In [29]:
#Fungsi yang rekomendasi sistem berdasarkan skor cosine similiarity 
def genre_recommendations(title):
    idx = indices[title]
    simliarity_scores = list(enumerate(cosine_sim[idx]))
    simliarity_scores = sorted(simliarity_scores, key=lambda x: x[1], reverse=True)
    simliarity_scores = simliarity_scores[1:21]
    movie_indices = [i[0] for i in simliarity_scores]
    return titles.iloc[movie_indices]

Cara menggunakan sistem ini yaitu, dengan memasukkan judul (title) film, kemudian sistem akan menfilter dan memberikan list film yang memiliki genre yang mirip (similiar)

In [36]:
 genre_recommendations('Jumanji (1995)').head(10)

53                     Indian in the Cupboard, The (1995)
109                     NeverEnding Story III, The (1994)
767                       Escape to Witch Mountain (1975)
1514            Darby O'Gill and the Little People (1959)
1556                                  Return to Oz (1985)
1617                        NeverEnding Story, The (1984)
1618    NeverEnding Story II: The Next Chapter, The (1...
1799                        Santa Claus: The Movie (1985)
3574    Harry Potter and the Sorcerer's Stone (a.k.a. ...
6075    Chronicles of Narnia: The Lion, the Witch and ...
Name: title, dtype: object

Seperti yang dapat kita lihat ketika kita memasukkan, judul film "Toy Story ( 1995)" maka sistem memberikan list rekomendasi film yang memiliki genre yang hampir sama.