# Proyek Akhir Machine Learning: Sistem Rekomendasi Buku

- **Nama:** Kevin Arnandes
- **Email:** kevinarnandes21@gmail.com
- **ID Dicoding:** kevinarnandes

# Data Collection

## Import libary

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Gathering Data

In [None]:
# Load Dataset
url_tour = "https://raw.githubusercontent.com/Vinzzztty/playground-data-analyst/main/Dataset/pariwisata_jogja/tour.csv"
url_rating = "https://raw.githubusercontent.com/Vinzzztty/playground-data-analyst/main/Dataset/pariwisata_jogja/tour_rating.csv"
url_user = "https://raw.githubusercontent.com/Vinzzztty/playground-data-analyst/main/Dataset/pariwisata_jogja/user.csv"

tour = pd.read_csv(url_tour)
rating = pd.read_csv(url_rating)
user = pd.read_csv(url_user)

print('Jumlah data tempat wisata: ', len(tour.Place_Id.unique()))
print('Jumlah data rating tempat wisata: ', len(rating.Place_Id.unique()))
print('Jumlah data user: ', len(user.User_Id.unique()))

Jumlah data tempat wisata:  126
Jumlah data rating tempat wisata:  126
Jumlah data user:  300


# Data Understanding

## Univariate Exploratory Data Analysis

### Tour Variabel

In [None]:
tour.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Place_Id      126 non-null    float64
 1   Place_Name    126 non-null    object 
 2   Description   126 non-null    object 
 3   Category      126 non-null    object 
 4   City          126 non-null    object 
 5   Price         126 non-null    int64  
 6   Rating        126 non-null    float64
 7   Time_Minutes  60 non-null     float64
 8   Coordinate    126 non-null    object 
 9   Latitude      126 non-null    float64
 10  Longitude     126 non-null    float64
dtypes: float64(5), int64(1), object(5)
memory usage: 11.0+ KB


In [None]:
tour.shape

(126, 11)

In [None]:
print("Banyak data: ", len(tour.Place_Id.unique()))
print("Kategori tempat wisata: ", tour.Category.unique())

Banyak data:  126
Kategori tempat wisata:  ['Taman Hiburan' 'Budaya' 'Cagar Alam' 'Bahari' 'Pusat Perbelanjaan']


### Rating Variabel

In [None]:
rating.head()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,101,4
1,1,154,2
2,1,103,3
3,1,208,5
4,1,89,3


In [None]:
rating.describe()

Unnamed: 0,User_Id,Place_Id,Place_Ratings
count,2870.0,2870.0,2870.0
mean,152.132056,147.662021,3.10453
std,86.489316,36.950298,1.394187
min,1.0,85.0,1.0
25%,78.0,115.0,2.0
50%,153.0,149.0,3.0
75%,227.0,179.0,4.0
max,300.0,210.0,5.0


In [None]:
rating.Place_Ratings.value_counts()

Place_Ratings
5    622
4    601
3    589
2    571
1    487
Name: count, dtype: int64

### User Variabel

In [None]:
user.head()

Unnamed: 0,User_Id,Location,Age
0,1,"Semarang, Jawa Tengah",20.0
1,2,"Bekasi, Jawa Barat",21.0
2,3,"Cirebon, Jawa Barat",23.0
3,4,"Bekasi, Jawa Barat",21.0
4,5,"Lampung, Sumatera Selatan",20.0


In [None]:
user.describe()

Unnamed: 0,User_Id,Age
count,300.0,300.0
mean,150.5,28.7
std,86.746758,6.393716
min,1.0,18.0
25%,75.75,24.0
50%,150.5,29.0
75%,225.25,34.0
max,300.0,40.0


Rata rata umur adalah 28 tahun

# Data Pre Processing

## Menggabungkan seluruh rating wisata

In [None]:
# Menggabungkan seluruh Place_Id pada kategori Tour
tour_all = np.concatenate((
    tour.Place_Id.unique(),
    rating.Place_Id.unique(),
))

# Mengurutkan data dan menghapus data yang sama
tour_all = np.sort(np.unique(tour_all))

print('Jumlah seluruh data wisata berdasarkan Place_Id: ', len(tour_all))

Jumlah seluruh data wisata berdasarkan Place_Id:  126


## Menggabungkan Seluruh User

In [None]:
# Menggabungkan seluruh userID
user_all = np.concatenate((
    user.User_Id.unique(),
    rating.User_Id.unique(),
))

# Menghapus data yang sama kemudian mengurutkannya
user_all = np.sort(np.unique(user_all))

print('Jumlah seluruh user: ', len(user_all))

Jumlah seluruh user:  300


## Mengetahui Jumlah Rating

In [None]:
tour.groupby('Place_Id').sum()

Unnamed: 0_level_0,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Latitude,Longitude
Place_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
85.0,Taman Pintar Yogyakarta,Taman Pintar Yogyakarta (bahasa Jawa: Hanacara...,Taman Hiburan,Yogyakarta,6000,4.5,120.0,"{'lat': -7.800671500000001, 'lng': 110.3676551}",-7.800671,110.367655
86.0,Keraton Yogyakarta,Keraton Ngayogyakarta Hadiningrat atau Keraton...,Budaya,Yogyakarta,15000,4.6,0.0,"{'lat': -7.8052845, 'lng': 110.3642031}",-7.805284,110.364203
87.0,Sindu Kusuma Edupark (SKE),Sindu Kusuma Edupark (SKE) merupakan sebuah de...,Taman Hiburan,Yogyakarta,20000,4.2,120.0,"{'lat': -7.767297300000001, 'lng': 110.3542486}",-7.767297,110.354249
88.0,Museum Benteng Vredeburg Yogyakarta,Museum Benteng Vredeburg (bahasa Jawa: ???????...,Budaya,Yogyakarta,3000,4.6,120.0,"{'lat': -7.800201599999999, 'lng': 110.3663044}",-7.800202,110.366304
89.0,De Mata Museum Jogja,Museum De Mata merupakan salah satu museum yan...,Budaya,Yogyakarta,50000,4.4,0.0,"{'lat': -7.816315599999999, 'lng': 110.3871442}",-7.816316,110.387144
...,...,...,...,...,...,...,...,...,...,...
206.0,Wisata Kaliurang,"Jogja selalu menarik untuk dikulik, terlebih t...",Cagar Alam,Yogyakarta,8000,4.4,60.0,"{'lat': -7.6120675, 'lng': 110.4205209}",-7.612068,110.420521
207.0,Heha Sky View,HeHa Sky View adalah salah satu tempat wisata ...,Taman Hiburan,Yogyakarta,15000,4.4,120.0,"{'lat': -7.8496144, 'lng': 110.478324}",-7.849614,110.478324
208.0,Taman Sungai Mudal,"Taman Sungai Mudal, sebuah objek wisata alam t...",Cagar Alam,Yogyakarta,10000,4.6,150.0,"{'lat': -7.762813599999998, 'lng': 110.1161626}",-7.762814,110.116163
209.0,Pantai Sanglen,Pantai Sanglen. Lokasinya berada di Desa Kemad...,Bahari,Yogyakarta,10000,4.5,0.0,"{'lat': -8.1367456, 'lng': 110.5716362}",-8.136746,110.571636


## Menggabungkan Data dengan Fitur Nama Tempat

In [None]:
# Definisikan dataframe rating ke dalam variabel all_tour_rate
all_tour_rate = rating

all_tour_rate

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,101,4
1,1,154,2
2,1,103,3
3,1,208,5
4,1,89,3
...,...,...,...
2865,300,107,2
2866,300,193,5
2867,300,108,5
2868,300,103,5


In [None]:
# Menggabungkan all_tour_rate dengan dataframe tour berdasarkan Place_Id
all_tour = pd.merge(all_tour_rate, tour[['Place_Id', 'Place_Name', 'Category', 'Rating']], on='Place_Id', how='left')

all_tour

Unnamed: 0,User_Id,Place_Id,Place_Ratings,Place_Name,Category,Rating
0,1,101,4,Kampung Wisata Sosro Menduran,Budaya,4.0
1,1,154,2,Pantai Ngrawe (Mesra),Bahari,4.5
2,1,103,3,Tugu Pal Putih Jogja,Taman Hiburan,4.7
3,1,208,5,Taman Sungai Mudal,Cagar Alam,4.6
4,1,89,3,De Mata Museum Jogja,Budaya,4.4
...,...,...,...,...,...,...
2865,300,107,2,Bangsal Pagelaran,Budaya,4.6
2866,300,193,5,Candi Sewu,Budaya,4.6
2867,300,108,5,Embung Tambakboyo,Taman Hiburan,4.4
2868,300,103,5,Tugu Pal Putih Jogja,Taman Hiburan,4.7


# Data Preparation

## Mencari tahu apakah ada Missing Value

In [None]:
all_tour.isnull().sum()

User_Id          0
Place_Id         0
Place_Ratings    0
Place_Name       0
Category         0
Rating           0
dtype: int64

## Melihat kategori wisata

In [None]:
all_tour.Category.unique()

array(['Budaya', 'Bahari', 'Taman Hiburan', 'Cagar Alam',
       'Pusat Perbelanjaan'], dtype=object)

## Preparation Data

In [None]:
# Membuat variabel preparation yang berisi dataframe all_tour kemudian diurutkan berdasarkan Place_Id
preparation = all_tour

preparation.sort_values('Place_Id')

Unnamed: 0,User_Id,Place_Id,Place_Ratings,Place_Name,Category,Rating
810,88,85,1,Taman Pintar Yogyakarta,Taman Hiburan,4.5
1503,159,85,4,Taman Pintar Yogyakarta,Taman Hiburan,4.5
392,43,85,4,Taman Pintar Yogyakarta,Taman Hiburan,4.5
2681,282,85,1,Taman Pintar Yogyakarta,Taman Hiburan,4.5
1561,165,85,4,Taman Pintar Yogyakarta,Taman Hiburan,4.5
...,...,...,...,...,...,...
2774,290,210,5,Pantai Congot,Bahari,4.3
1949,206,210,1,Pantai Congot,Bahari,4.3
142,17,210,4,Pantai Congot,Bahari,4.3
405,44,210,3,Pantai Congot,Bahari,4.3


### Menghapus data Duplikat

In [None]:
# Melihat data duplikat
preparation.duplicated().sum()

23

In [None]:
# Menghapus data duplikat
preparation = preparation.drop_duplicates('Place_Id')

preparation

Unnamed: 0,User_Id,Place_Id,Place_Ratings,Place_Name,Category,Rating
0,1,101,4,Kampung Wisata Sosro Menduran,Budaya,4.0
1,1,154,2,Pantai Ngrawe (Mesra),Bahari,4.5
2,1,103,3,Tugu Pal Putih Jogja,Taman Hiburan,4.7
3,1,208,5,Taman Sungai Mudal,Cagar Alam,4.6
4,1,89,3,De Mata Museum Jogja,Budaya,4.4
...,...,...,...,...,...,...
367,40,87,2,Sindu Kusuma Edupark (SKE),Taman Hiburan,4.2
382,41,175,3,Museum Ullen Sentalu,Budaya,4.7
387,42,180,5,Pantai Depok Jogja,Bahari,4.3
413,45,146,4,Bukit Wisata Pulepayung,Cagar Alam,4.5


### Mengkonversi data series menjadi bentuk list

In [None]:
# Mengonversi data series Place_Id menjadi dalam bentuk list
destination_id = preparation['Place_Id'].tolist()

# Mengonversi data series Place_Name menjadi dalam bentuk list
destination_name = preparation['Place_Name'].tolist()

# Mengonversi data series Category menjadi dalam bentuk list
destination_category = preparation['Category'].tolist()

print(len(destination_id))
print(len(destination_name))
print(len(destination_category))

126
126
126


In [None]:
# Membuat dictionary untuk data destination_id, destination_name, destination_category
destination = pd.DataFrame({
    'id': destination_id,
    'destination_name': destination_name,
    'category': destination_category
})
destination

Unnamed: 0,id,destination_name,category
0,101,Kampung Wisata Sosro Menduran,Budaya
1,154,Pantai Ngrawe (Mesra),Bahari
2,103,Tugu Pal Putih Jogja,Taman Hiburan
3,208,Taman Sungai Mudal,Cagar Alam
4,89,De Mata Museum Jogja,Budaya
...,...,...,...
121,87,Sindu Kusuma Edupark (SKE),Taman Hiburan
122,175,Museum Ullen Sentalu,Budaya
123,180,Pantai Depok Jogja,Bahari
124,146,Bukit Wisata Pulepayung,Cagar Alam


# Model Development dengan Content Based Filtering

In [None]:
data = destination
data.sample(5)

Unnamed: 0,id,destination_name,category
92,165,Green Village Gedangsari,Taman Hiburan
71,191,Pantai Drini,Bahari
9,185,Pantai Krakal,Bahari
36,116,Jurang Tembelan Kanigoro,Taman Hiburan
124,146,Bukit Wisata Pulepayung,Cagar Alam


## TF-IDF Vectorizer

In [None]:
# Inisialisasi TFIDFVectorizer
tf = TfidfVectorizer()

# Melakukan perhitungan idf pada data category
tf.fit(data['category'])

# Mapping array dari fitur inedx integer ke fitur utama
tf.get_feature_names_out()

array(['alam', 'bahari', 'budaya', 'cagar', 'hiburan', 'perbelanjaan',
       'pusat', 'taman'], dtype=object)

In [None]:
# Melakukan Fit lalu ditransformasikan ke bentuk matrix
tfidf_matrix = tf.fit_transform(data['category'])

# Melihat ukuran matrix tfidf
tfidf_matrix.shape

(126, 8)

In [None]:
# Mengubah vektor tf-idf dalam bentuk matriks dengan fungsi todense()
tfidf_matrix.todense()

matrix([[0.        , 0.        , 1.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.70710678],
        ...,
        [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.70710678, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.70710678]])

### Membuat dataframe tf-idf matrix

In [None]:
# Membuat dataframe untuk melihat tf-idf matrix
# Kolom diisi dengan jenis masakan
# Baris diisi dengan nama resto

pd.DataFrame(
    tfidf_matrix.todense(),
    columns=tf.get_feature_names_out(),
    index=data.destination_name
).sample(8, axis=1).sample(10, axis=0)

Unnamed: 0_level_0,bahari,cagar,taman,budaya,alam,pusat,perbelanjaan,hiburan
destination_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Galaxy Waterpark Jogja,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107
Monumen Yogya Kembali,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Geoforest Watu Payung Turunan,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0
Monumen Sanapati,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Puncak Kebun Buah Mangunan,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107
Taman Sungai Mudal,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0
Pantai Timang,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Museum Gunung Merapi,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Desa Wisata Tembi,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107
Pantai Indrayanti,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cosine Similarity

### Menghitung cosine similarity pada matrix tf-idf

In [None]:
# Menghitung cosine similarity pada matrix tf-idf
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 1.]])

### Melihat matrix kesamaan setiap destinasi

In [None]:
# Membuat dataframe dari variabel cosine_sim dengan baris dan kolom berupa nama resto
cosine_sim_df = pd.DataFrame(cosine_sim, index=data['destination_name'], columns=data['destination_name'])
print('Shape:', cosine_sim_df.shape)

# Melihat similarity matrix pada setiap resto
cosine_sim_df.sample(5, axis=1).sample(10, axis=0)

Shape: (126, 126)


destination_name,"Bukit Paralayang, Watugupit",Pantai Sepanjang,Pantai Nguluran,Tugu Pal Putih Jogja,Pantai Congot
destination_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Tebing Breksi,0.0,0.0,0.0,0.0,0.0
Pantai Krakal,0.0,1.0,1.0,0.0,1.0
Monumen Batik Yogyakarta,0.0,0.0,0.0,0.0,0.0
Goa Rancang Kencono,1.0,0.0,0.0,0.0,0.0
Candi Ijo,0.0,0.0,0.0,0.0,0.0
Candi Prambanan,0.0,0.0,0.0,0.0,0.0
Kebun Teh Nglinggo,1.0,0.0,0.0,0.0,0.0
Kampung Wisata Dipowinatan,0.0,0.0,0.0,0.0,0.0
Pantai Ngandong,0.0,1.0,1.0,0.0,1.0
Kampung Wisata Taman Sari,0.0,0.0,0.0,1.0,0.0


# Model Evaluation

## Mendapatkan Rekomendasi

In [95]:
def destination_recommendations(nama_destinasi, similarity_data=cosine_sim_df, items=data[['destination_name', 'category']], k=5):
  """
  Rekomendasi Destinasi Wisata berdasarkan kemiripan dataframe

  Parameter:
  ---
  nama_destinasi : tipe data string (str)
                Nama Destinasi (index kemiripan dataframe)
  similarity_data : tipe data pd.DataFrame (object)
                      Kesamaan dataframe, simetrik, dengan destinasi sebagai
                      indeks dan kolom
  items : tipe data pd.DataFrame (object)
            Mengandung kedua nama dan fitur lainnya yang digunakan untuk mendefinisikan kemiripan
  k : tipe data integer (int)
        Banyaknya jumlah rekomendasi yang diberikan
  ---


  Pada index ini, kita mengambil k dengan nilai similarity terbesar
  pada index matrix yang diberikan (i).
  """

  index = similarity_data.loc[:,nama_destinasi].to_numpy().argpartition(
      range(-1, -k, -1))

  closest = similarity_data.columns[index[-1:-(k+2):-1]]

  closest = closest.drop(nama_destinasi, errors='ignore')

  return pd.DataFrame(closest).merge(items).head(k)





In [96]:
data[data.destination_name.eq('Pasar Beringharjo')]

Unnamed: 0,id,destination_name,category
110,174,Pasar Beringharjo,Pusat Perbelanjaan


In [97]:
# Mendapatkan rekomendasi destinasi yang mirip dengan Candi Sewu
user_input = 'Pasar Beringharjo'

print("Top Recommendation Destinasi:", user_input)
print(destination_recommendations(user_input))

Top Recommendation Destinasi: Pasar Beringharjo
               destination_name            category
0  Pasar Kebon Empring Bintaran  Pusat Perbelanjaan
1   Kawasan Wisata Sosrowijayan  Pusat Perbelanjaan
2         Air Terjun Sri Gethuk          Cagar Alam
3           Goa Rancang Kencono          Cagar Alam
4      Kampung Wisata Kadipaten              Budaya
