In [7]:
# Instal library Sastrawi untuk bahasa Indonesia
!pip install Sastrawi

import pandas as pd
import numpy as np
import re
import os
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
# Pastikan nama file sesuai dengan yang di folder kiri
filepath = '/content/sinopsis_novel.csv'

if os.path.exists(filepath):
    try:
        df = pd.read_csv(filepath, encoding='latin1')
        print(f"Sukses")
    except:
        # Kalau gagal
        try:
             df = pd.read_csv(filepath, sep=';', encoding='latin1')
             print(f"Sukses!")
        except:
             print("Masih gagal baca.")

    if 'df' in locals():
        print(f"Total novel ditemukan: {len(df)}")
        print(df.head(3)) # Tampilkan 3 data teratas buat ngecek
else:
    print("File tidak ditemukan.")

Sukses!
Total novel ditemukan: 235
    No                 Judul       Penulis                        Genre  \
0  1.0                DAMIAN      Ezra san  Romansa, Petualangan, mafia   
1  2.0  Pertanian Ajaib Aria     Amarine05             Fantasi, Romansa   
2  3.0             OSTIARIUS  icebreaker20               Fantasi, Horor   

                                            Sinopsis  \
0  Christina dituduh membunuh seseorang, semua bu...   
1  Aria, seorang Gastronom dan Culinary Historian...   
2  Satu sentuhan pada benda terkutuk mengubah seg...   

                                          URL Sumber  Unnamed: 6  Unnamed: 7  \
0     https://www.wattpad.com/story/396155941-damian         NaN         NaN   
1  https://www.wattpad.com/story/401230362-pertan...         NaN         NaN   
2  https://www.wattpad.com/story/399473345-ostiarius         NaN         NaN   

   Unnamed: 8  Unnamed: 9  Unnamed: 10 Unnamed: 11  
0         NaN         NaN          NaN         NaN  
1         Na

In [8]:
# Inisialisasi Stemmer dan Stopword
factory_stem = StemmerFactory()
stemmer = factory_stem.create_stemmer()
factory_stop = StopWordRemoverFactory()
stopword = factory_stop.create_stop_word_remover()

# Fungsi untuk membersihkan teks sinopsis
def preprocessing(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = stopword.remove(text)
    text = stemmer.stem(text)
    return text

print("Sedang memproses teks... (Mohon tunggu sebentar)")
# Terapkan fungsi ke kolom Sinopsis
df['sinopsis_clean'] = df['Sinopsis'].apply(preprocessing)
print("Preprocessing selesai.")

Sedang memproses teks... (Mohon tunggu sebentar)
Preprocessing selesai.


DOWNLOAD DATA BERSIH

In [9]:
from google.colab import files

if 'sinopsis_clean' in df.columns:
    nama_file_bersih = 'dataset_bersih.csv'
    df.to_csv(nama_file_bersih, index=False)

    files.download(nama_file_bersih)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# 1. Proses TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['sinopsis_clean'])

# 2. Proses LSA (SVD)
# Menggunakan 10 komponen topik
lsa_model = TruncatedSVD(n_components=10, random_state=42)
lsa_matrix = lsa_model.fit_transform(tfidf_matrix)

# 3. Hitung Cosine Similarity
sim_tfidf = cosine_similarity(tfidf_matrix, tfidf_matrix)
sim_lsa = cosine_similarity(lsa_matrix, lsa_matrix)

print("Model berhasil dibangun.")

Model berhasil dibangun.


In [11]:
def rekomendasi_novel(judul):
    # Cek ketersediaan judul
    if judul not in df['Judul'].values:
        print("Judul tidak ditemukan. Pastikan penulisan sesuai (Huruf Besar/Kecil).")
        return

    # Ambil index judul
    idx = df[df['Judul'] == judul].index[0]

    # Ambil skor kemiripan dari kedua metode
    score_tfidf = list(enumerate(sim_tfidf[idx]))
    score_lsa = list(enumerate(sim_lsa[idx]))

    # Urutkan dari yang paling mirip
    score_tfidf = sorted(score_tfidf, key=lambda x: x[1], reverse=True)[1:6]
    score_lsa = sorted(score_lsa, key=lambda x: x[1], reverse=True)[1:6]

    print(f"Hasil Rekomendasi untuk: {judul}")
    print("-" * 50)
    print("METODE TF-IDF (Berdasarkan Kata Kunci):")
    for i in score_tfidf:
        judul_rec = df.iloc[i[0]]['Judul']
        genre_rec = df.iloc[i[0]]['Genre']
        print(f"- {judul_rec} ({genre_rec}) | Skor: {round(i[1], 3)}")

    print("\nMETODE LSA (Berdasarkan Makna/Semantik):")
    for i in score_lsa:
        judul_rec = df.iloc[i[0]]['Judul']
        genre_rec = df.iloc[i[0]]['Genre']
        print(f"- {judul_rec} ({genre_rec}) | Skor: {round(i[1], 3)}")

# CONTOH PENGGUNAAN
judul_test = df.iloc[0]['Judul']
rekomendasi_novel(judul_test)

Hasil Rekomendasi untuk: DAMIAN
--------------------------------------------------
METODE TF-IDF (Berdasarkan Kata Kunci):
- UNTIL HE FOUND ME (Romansa, Thriller, Fantasi, Mafia) | Skor: 0.114
- EPHEMERAL (Romansa, Keluarga) | Skor: 0.109
- Trapped in a Psycopathic Novel (Romansa, Fantasi) | Skor: 0.103
- Sanggar Tanpa Pulang (Horor, Misteri) | Skor: 0.096
- This is Not A Good Love Story (Keluarga, Romansa) | Skor: 0.092

METODE LSA (Berdasarkan Makna/Semantik):
- OSTIARIUS (Fantasi, Horor) | Skor: 0.911
- Sanggar Tanpa Pulang (Horor, Misteri) | Skor: 0.899
- JUDAS (Romansa, Mafia) | Skor: 0.894
- UNTIL HE FOUND ME (Romansa, Thriller, Fantasi, Mafia) | Skor: 0.878
- TerraCotta (Komedi, Cinta, Keluarga) | Skor: 0.846
