# Import Library

In [1]:
!pip install Sastrawi
import pandas as pd
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import numpy as np
import pickle



You should consider upgrading via the 'C:\Users\USER\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
dataset= pd.read_csv("https://drive.usercontent.google.com/download?id=1P-5QZOlRQIoXyBjH4o4f7UNtPiIUYVJA&export=download&authuser=0&confirm=t&uuid=a252448b-7da8-44eb-bbf8-2a71e61fc027&at=APcmpozEaP7rmd49cPXfMf--50ui:1744382664848")

In [3]:
df = dataset[['place_name','place_description','category','city','rating','description_location','longitude','latitude','user_ratings_total']]
# Handle missing values
text_columns = ['place_description', 'description_location']
for col in text_columns:
    df[col] = df[col].fillna('')

categorical_columns = ['category', 'city']
for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

numeric_columns = ['rating']
for col in numeric_columns:
    df[col] = df[col].fillna(df[col].median())

# Combine text features
df['combined_text'] = df['place_description'] + " " + df['description_location']
df['combined_text'] = df['combined_text'].apply(lambda x: stemmer.stem(x))
df['combined_text'] = df['combined_text'].apply(lambda x: x.lower())
df['combined_text'] = df['combined_text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Define Indonesian stopwords
indonesian_stopwords = ['yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'untuk', 'pada', 'dengan',
                        'adalah', 'atau', 'jika', 'saya', 'kita', 'akan', 'tidak', 'tersebut']

# Create TF-IDF features
vectorizer = TfidfVectorizer(stop_words=indonesian_stopwords, max_features=1000)
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# Encode categorical features
le_category = LabelEncoder()
df['category_encoded'] = le_category.fit_transform(df['category'])

le_city = LabelEncoder()
df['city_encoded'] = le_city.fit_transform(df['city'])

# Scale numerical features
scaler = StandardScaler()
df[['rating_scaled']] = scaler.fit_transform(df[['rating']])

# Combine all features
other_features = df[['category_encoded', 'city_encoded', 'rating_scaled']]
other_features_sparse = sp.csr_matrix(other_features.values)
final_feature_matrix = sp.hstack([tfidf_matrix, other_features_sparse])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [4]:
# Calculate cosine similarity
similarity_matrix = cosine_similarity(final_feature_matrix, dense_output=False)

# Recommendation function
def get_recommendations(place_index, similarity_matrix, df, top_n=5):
    if place_index < 0 or place_index >= similarity_matrix.shape[0]:
        raise ValueError("Invalid place index.")
    similarity_scores = list(enumerate(similarity_matrix[place_index].toarray().flatten()))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    recommended_indices = [i[0] for i in similarity_scores]
    return recommended_indices

In [5]:
# Get recommendations
recommended_indices = get_recommendations(0, similarity_matrix, dataset, top_n=5)
recommended_places = dataset.iloc[recommended_indices]
print("Recommended indices:", recommended_indices)
print(recommended_places[['place_name', 'category', 'city', 'rating',
                          'place_img', 'gallery_photo_img1', 'gallery_photo_img2', 'gallery_photo_img3']])

Recommended indices: [102, 512, 22, 656, 262]
                       place_name  \
102           Parangloe Waterfall   
512         Wisata Alam Wai Tiddo   
22         Pantai Barugaya Punaga   
656  Taman Wisata Holyland Malino   
262                   Buttu Macca   

                                              category  \
102  tourist_attraction, point_of_interest, establi...   
512  tourist_attraction, point_of_interest, establi...   
22   tourist_attraction, point_of_interest, establi...   
656  tourist_attraction, point_of_interest, establi...   
262  tourist_attraction, point_of_interest, establi...   

                       city  rating  \
102  Sulawesi Selatan 92173     4.5   
512  Sulawesi Selatan 91991     4.6   
22   Sulawesi Selatan 92261     4.4   
656  Sulawesi Selatan 92173     4.4   
262  Sulawesi Selatan 91752     4.4   

                                             place_img gallery_photo_img1  \
102  https://maps.googleapis.com/maps/api/place/pho...                

In [6]:
df = df.merge(
	dataset[['place_name', 'place_img', 'gallery_photo_img1', 'gallery_photo_img2', 'gallery_photo_img3']],
	on='place_name',
	how='left'
)

# Simpan model ke file pickle
with open('recommendation_model.pkl', 'wb') as f:
    pickle.dump([similarity_matrix, vectorizer, le_category, le_city, scaler, df], f)

**Variabel yang Disimpan**
  * similarity_matrix: Matriks kesamaan kosinus.
  * vectorizer: Objek TF-IDF Vectorizer.
  * le_category: Label Encoder untuk kategori.
  * le_city: Label Encoder untuk kota.
  * scaler: Standard Scaler untuk rating.
  * df: DataFrame yang berisi data tempat (diganti dari dataset untuk konsistensi).
---
**Cara Memuat Model:**


```
import pickle
with open('recommendation_model.pkl', 'rb') as f:
    similarity_matrix, vectorizer, le_category, le_city, scaler, df = pickle.load(f)

```




