# DATA MINING FOR MOVIE RECOMMENDER SYSTEM USING CONTENT BASED FILTERING

IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import gensim
from gensim.models.doc2vec import TaggedDocument

DATA EXPLORATORY

In [2]:
#tahap awal adalah meload dataset yang digunakan
movies_data = pd.read_csv('./dataset/movies.csv')
links_data = pd.read_csv('./dataset/links.csv')
ratings_data = pd.read_csv('./dataset/ratings.csv')
tags_data = pd.read_csv('./dataset/tags.csv')


1.Dataset Movies

In [3]:
#sampel movies.csv
movies_data.sample(10)


Unnamed: 0,movieId,title,genres
4148,5968,Miami Blues (1990),Comedy|Crime|Drama
7862,93982,"Raven, The (2012)",Mystery|Thriller
5452,26085,Mutiny on the Bounty (1962),Adventure|Drama|Romance
4038,5723,Continental Divide (1981),Comedy|Romance
1539,2074,"Night Porter, The (Portiere di notte, Il) (1974)",Crime|Drama|Romance
3947,5560,À nous la liberté (Freedom for Us) (1931),Comedy|Musical
3255,4399,Diary of a Chambermaid (Journal d'une femme de...,Comedy|Drama
3344,4529,Bagdad Cafe (Out of Rosenheim) (1987),Comedy|Drama
5768,31223,Racing Stripes (2005),Children|Comedy
2923,3919,Hellraiser III: Hell on Earth (1992),Horror


In [4]:
#fungsi ini untuk mencheck jumlah baris dan kolom pada dataset
movies_data.shape


(9742, 3)

In [5]:
#Kemudian akan dilakukan pengecekan data type setiap kolom
movies_data.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [6]:
#kita akan melakukan pengecekan apakah ada missing values pada dataset movies.csv
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [7]:
#Selanjutnya kita akan melakukan pengecekan apakah ada data ganda (duplicated)
movies_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9737    False
9738    False
9739    False
9740    False
9741    False
Length: 9742, dtype: bool

2.Ratings Dataset

In [8]:
#sampel ratings.csv
ratings_data.sample(10)


Unnamed: 0,userId,movieId,rating,timestamp
32448,221,3039,4.0,1111177928
98305,606,25833,4.0,1173381141
48349,313,1356,2.0,1030474853
65109,416,2762,2.5,1187496431
56158,372,159,2.0,874416196
25407,177,7022,2.5,1435526211
88519,571,3708,2.0,966900772
52194,339,41566,3.0,1460185253
31478,217,3461,2.0,955941925
1414,14,434,4.0,835441066


In [9]:
#Mencheck jumlah baris dan kolom dataset Ratings
ratings_data.shape

(100836, 4)

In [10]:
#Mencheck tipe data setiap kolom
ratings_data.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [11]:
#Mencheck apakah terdapat missing values
ratings_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [12]:
#Mencheck apakah terdapat data ganda (duplicated)
ratings_data.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
100831    False
100832    False
100833    False
100834    False
100835    False
Length: 100836, dtype: bool

3.Dataset links

In [13]:
#Sampel links.csv
links_data.sample(10)

Unnamed: 0,movieId,imdbId,tmdbId
4457,6581,66249,5185.0
3377,4593,97328,10551.0
2584,3453,195778,13539.0
2501,3342,86969,11296.0
6215,45658,410400,38344.0
6162,44397,454841,9792.0
8843,132424,2726560,228205.0
2116,2812,160401,22314.0
7110,70708,964185,11928.0
8859,133377,3327624,301728.0


In [14]:
#Mencheck jumlah baris dan kolom dataset links
links_data.shape

(9742, 3)

In [15]:
#Mencheck tipe data setiap kolom
links_data.dtypes

movieId      int64
imdbId       int64
tmdbId     float64
dtype: object

In [16]:
#Mencheck apakah terdapat missing values
links_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [17]:
#Selanjutnya kita akan melakukan pengecekan apakah ada data ganda (duplicated)
links_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9737    False
9738    False
9739    False
9740    False
9741    False
Length: 9742, dtype: bool

4.Dataset Tags

In [18]:
#Sampel tags.csv
tags_data.sample(10)

Unnamed: 0,userId,movieId,tag,timestamp
3044,567,6214,disturbing,1525283383
2082,474,6183,Day and Hudson,1138039366
1287,474,1089,violence,1137207100
1929,474,4558,twins,1137374798
700,424,50,suspense,1457842315
2494,477,32,time travel,1242494304
302,62,116897,short stories,1528152852
2544,477,1196,original plot,1262795808
1987,474,5294,religion,1138039177
3232,567,117887,heartwarming,1525285938


In [19]:
#Mencheck jumlah baris dan kolom dataset tags
tags_data.shape

(3683, 4)

In [20]:
#Mencheck tipe data setiap kolom
tags_data.dtypes

userId        int64
movieId       int64
tag          object
timestamp     int64
dtype: object

In [21]:
#Mencheck apakah terdapat missing values
tags_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [22]:
#Selanjutnya kita akan melakukan pengecekan apakah ada data ganda (duplicated)
links_data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9737    False
9738    False
9739    False
9740    False
9741    False
Length: 9742, dtype: bool

## Data Preproccessing

## 1. Merging Dataset


In [None]:
Dataset masih terbagi-bagi dalam bebepa file .csv , jadi perlu dilakukan Merging dataset agar 