# TUGAS 1 Temukan minimal 5 penelitian dalam rentang waktu 2020 – 2025 (JURNAL PAPER)

# TUGAS 2 MENGUMPULKAN DATA 50 PUTUSAN DI PENGADILAN NARKOTIKA DAN PSIKOTROPIKA TANGERANG

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pandas requests beautifulsoup4 pdfminer.six lxml > /dev/null 2>&1

In [None]:
import argparse
import io
import os
import re
import threading
import time
import urllib
import urllib.request
from concurrent.futures import ThreadPoolExecutor, wait
from datetime import date

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text

##TAHUN 2024

In [None]:
def create_path(folder_name):
    """Create *folder_name* under the current working directory and return its path.

    Intermediate directories are created as needed. Calling this for a folder
    that already exists is a no-op, so it is safe to call repeatedly or from
    several threads at once.
    """
    path = os.path.join(os.getcwd(), folder_name)
    # exist_ok avoids the race between "does it exist?" and "create it".
    os.makedirs(path, exist_ok=True)
    return path

def open_page(link):
    """Download *link* and parse the response body with the lxml parser.

    The request is attempted up to three times, pausing 5 seconds between
    attempts. Each attempt is bounded by a 30 second timeout so a stalled
    connection cannot hang a worker forever.

    Returns:
        A BeautifulSoup tree, or None when every attempt failed. Callers must
        handle the None case.
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(link, timeout=30)
        except requests.RequestException as exc:
            # Only network-level failures are retried; programming errors
            # and KeyboardInterrupt propagate.
            print(f"⚠️ Request failed ({attempt}/{max_attempts}) for {link}: {exc}")
            if attempt < max_attempts:
                time.sleep(5)
            continue
        return BeautifulSoup(response.text, "lxml")
    return None


def get_detail(soup, keyword):
    """Return the stripped text of the element following the first matching label cell.

    The label cell is the first ``<td>`` whose text contains *keyword*; the
    value is taken from the element returned by ``find_next()`` on that cell.

    Returns:
        The value text, or "" when *soup* is None, no label cell matches, or
        the label cell has no following element.

    NOTE(review): the match is a substring test, so a short label such as
    "Amar" also matches "Amar Lainnya"; the result depends on which cell comes
    first in the page — confirm against the live markup.
    """
    if soup is None:
        return ""
    label_cell = soup.find(lambda tag: tag.name == "td" and keyword in tag.text)
    if label_cell is None:
        return ""
    value_cell = label_cell.find_next()
    if value_cell is None:
        return ""
    return value_cell.get_text().strip()


def get_pdf(url, path_pdf):
    """Download the file at *url* into *path_pdf* and return it in memory.

    The file is stored under the last path component of *url*.

    Returns:
        ``(io.BytesIO, file_name)`` on success, ``(None, None)`` when the
        download or the write to disk fails.
    """
    file_name = os.path.basename(url)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(url, timeout=60) as response:
            file_content = response.read()
        with open(f"{path_pdf}/{file_name}", "wb") as out_file:
            out_file.write(file_content)
    except (OSError, ValueError) as exc:
        # OSError covers URLError/HTTPError, timeouts and disk errors;
        # ValueError is raised for malformed URLs.
        print(f"⚠️ Could not download {url}: {exc}")
        return None, None
    return io.BytesIO(file_content), file_name


def clean_text(text):
    """Remove the Mahkamah Agung watermark and disclaimer lines from extracted PDF text.

    Each known boilerplate line (including its trailing newline) is deleted
    wherever it occurs; all other text is returned unchanged.
    """
    boilerplate_lines = (
        "M a h ka m a h A g u n g R e p u blik In d o n esia\n",
        "Disclaimer\n",
        "Kepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\n",
        "pelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\n",
        "Dalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\n",
        "Email : kepaniteraan@mahkamahagung.go.id    Telp : 021-384 3348 (ext.318)\n",
    )
    cleaned = text
    for fragment in boilerplate_lines:
        cleaned = cleaned.replace(fragment, "")
    return cleaned


# Serialises CSV writes: run_scraper calls extract_data from several worker
# threads that all append to the same file.
_CSV_WRITE_LOCK = threading.Lock()

# (CSV column, label looked up in the detail table), in output column order.
_DETAIL_FIELDS = (
    ("nomor", "Nomor"),
    ("tingkat_proses", "Tingkat Proses"),
    ("klasifikasi", "Klasifikasi"),
    ("kata_kunci", "Kata Kunci"),
    ("tahun", "Tahun"),
    ("tanggal_register", "Tanggal Register"),
    ("lembaga_peradilan", "Lembaga Peradilan"),
    ("jenis_lembaga_peradilan", "Jenis Lembaga Peradilan"),
    ("hakim_ketua", "Hakim Ketua"),
    ("hakim_anggota", "Hakim Anggota"),
    ("panitera", "Panitera"),
    ("amar", "Amar"),
    ("amar_lainnya", "Amar Lainnya"),
    ("catatan_amar", "Catatan Amar"),
    ("tanggal_musyawarah", "Tanggal Musyawarah"),
    ("tanggal_dibacakan", "Tanggal Dibacakan"),
    ("kaidah", "Kaidah"),
    ("status", "Status"),
    ("abstrak", "Abstrak"),
)


def _fetch_pdf_text(soup, path_pdf):
    """Download the decision PDF linked from *soup* and return (link, file name, cleaned text).

    All three values are "" when the page has no PDF link, the download fails,
    or text extraction fails.
    """
    anchor = soup.find("a", href=re.compile(r"/pdf/"))
    if anchor is None:
        return "", "", ""
    link_pdf = anchor["href"]
    file_pdf, file_name_pdf = get_pdf(link_pdf, path_pdf)
    if file_pdf is None:
        return "", "", ""
    try:
        text_pdf = clean_text(extract_text(file_pdf))
    except Exception as exc:  # the PDF parser can fail in many ways on damaged files
        print(f"⚠️ Teks PDF tidak bisa diekstrak ({link_pdf}): {exc}")
        return "", "", ""
    return link_pdf, file_name_pdf, text_pdf


def extract_data(link, keyword_url, path_output, path_pdf, today):
    """Scrape one decision page and append it as a row to the day's CSV file.

    Args:
        link: URL of the decision detail page.
        keyword_url: Search keyword or full search URL; a keyword becomes part
            of the CSV file name, a URL is replaced by an empty tag.
        path_output: Folder that receives the CSV file.
        path_pdf: Folder that receives the downloaded PDF.
        today: Date string used as the CSV file name suffix.

    Decisions whose status contains "berkekuatan hukum tetap" are skipped, as
    are pages that cannot be opened or that lack the detail table.
    """
    soup = open_page(link)
    table = soup.find("table", {"class": "table"}) if soup is not None else None
    if table is None:
        print(f"⚠️ Dilewati, halaman detail tidak bisa dibuka: {link}")
        return

    heading = table.find("h2")
    judul = heading.text if heading else ""
    row = {"judul": judul}
    for column, label in _DETAIL_FIELDS:
        row[column] = get_detail(table, label)

    # Skip decisions that are already final and binding (inkracht).
    if "berkekuatan hukum tetap" in row["status"].lower():
        print(f"❌ Dilewati karena sudah inkracht: {judul}")
        return

    link_pdf, file_name_pdf, text_pdf = _fetch_pdf_text(soup, path_pdf)
    row["link"] = link
    row["link_pdf"] = link_pdf
    row["file_name_pdf"] = file_name_pdf
    row["text_pdf"] = text_pdf
    result = pd.DataFrame([row])

    file_tag = keyword_url.replace("/", " ")
    if file_tag.startswith("https"):
        file_tag = ""
    destination = f"{path_output}/putusan_ma_{file_tag}_{today}.csv"

    # The header check and the append must be one atomic step, otherwise two
    # threads can both write a header or interleave their rows.
    with _CSV_WRITE_LOCK:
        write_header = not os.path.isfile(destination)
        result.to_csv(destination, mode="a", header=write_header, index=False)


def run_process(keyword_url, page, sort_date, path_output, path_pdf, today):
    """Scrape every decision linked from one page of search results.

    Args:
        keyword_url: Search keyword, or a full search URL starting with "https".
        page: 1-based result page number.
        sort_date: When true, ask the site to sort by decision date, newest first.
        path_output, path_pdf, today: Passed through to extract_data.
    """
    if keyword_url.startswith("https"):
        page_url = f"{keyword_url}&page={page}"
    else:
        page_url = f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword_url}&page={page}"
    if sort_date:
        page_url = f"{page_url}&obf=TANGGAL_PUTUS&obm=desc"

    soup = open_page(page_url)
    if soup is None:
        print(f"⚠️ Skipping page {page}: could not open {page_url}")
        return

    anchors = soup.find_all("a", {"href": re.compile("/direktori/putusan")})
    # A result may be linked more than once on the page; keep the first
    # occurrence of each href so no decision is scraped twice.
    detail_links = list(dict.fromkeys(anchor["href"] for anchor in anchors))

    for detail_link in detail_links:
        try:
            extract_data(detail_link, keyword_url, path_output, path_pdf, today)
        except Exception as exc:
            # One broken decision page must not cost the rest of this page.
            print(f"⚠️ Failed to extract {detail_link}: {exc!r}")


def run_scraper(
    keyword=None,
    url=None,
    sort_date=True,
    download_pdf=True,
    path_output='/content/drive/MyDrive/UTS TKI/CSV',
    path_pdf='/content/drive/MyDrive/UTS TKI/PDF',
):
    """Scrape all result pages of a search and store the decisions as CSV plus PDFs.

    Args:
        keyword: Search keyword; ignored when *url* is given.
        url: Full search URL (takes precedence over *keyword*).
        sort_date: Sort results by decision date, newest first.
        download_pdf: Accepted for compatibility; currently unused, PDFs are
            always downloaded.
        path_output: Folder for the CSV file (created if missing).
        path_pdf: Folder for the downloaded PDFs (created if missing).
    """
    if not keyword and not url:
        print("Please provide a keyword or URL")
        return

    # The workers write into these folders and do not create them themselves.
    os.makedirs(path_output, exist_ok=True)
    os.makedirs(path_pdf, exist_ok=True)
    today = date.today().strftime("%Y-%m-%d")

    link = url if url else f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword}&page=1"
    soup = open_page(link)
    if soup is None:
        print(f"⚠️ Could not open the search page: {link}")
        return

    # A single page of results has no pagination links; default to one page.
    page_numbers = [
        int(number)
        for number in (
            anchor.get("data-ci-pagination-page")
            for anchor in soup.find_all("a", {"class": "page-link"})
        )
        if number and str(number).isdigit()
    ]
    last_page = max(page_numbers, default=1)

    if url:
        print(f"Scraping with url: {url} - {20 * last_page} data - {last_page} page")
    else:
        print(f"Scraping with keyword: {keyword} - {20 * last_page} data - {last_page} page")

    keyword_url = url if url else keyword

    futures = {}
    with ThreadPoolExecutor(max_workers=4) as executor:
        for page in range(1, last_page + 1):
            future = executor.submit(
                run_process, keyword_url, page, sort_date, path_output, path_pdf, today
            )
            futures[future] = page

    # Leaving the with-block waits for all workers; surface any failure
    # instead of dropping it with the future.
    for future, page in futures.items():
        error = future.exception()
        if error is not None:
            print(f"⚠️ Page {page} failed: {error!r}")

In [None]:
# Download the narcotics and psychotropics decisions (putusan) decided in 2024 at the Tangerang District Court
run_scraper(url="https://putusan3.mahkamahagung.go.id/search.html?q=&jenis_doc=putusan&cat=3c40e48bbab311301a21c445b3c7fe57&jd=&tp=&court=097598PN66|097598PN66%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20&t_put=2024&t_reg=&t_upl=&t_pr=")

Scraping with url: https://putusan3.mahkamahagung.go.id/search.html?q=&jenis_doc=putusan&cat=3c40e48bbab311301a21c445b3c7fe57&jd=&tp=&court=097598PN66|097598PN66%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20&t_put=2024&t_reg=&t_upl=&t_pr= - 300 data - 15 page


##TAHUN 2025

In [None]:
def create_path(folder_name):
    """Create *folder_name* under the current working directory and return its path.

    Intermediate directories are created as needed. Calling this for a folder
    that already exists is a no-op, so it is safe to call repeatedly or from
    several threads at once.
    """
    path = os.path.join(os.getcwd(), folder_name)
    # exist_ok avoids the race between "does it exist?" and "create it".
    os.makedirs(path, exist_ok=True)
    return path

def open_page(link):
    """Download *link* and parse the response body with the lxml parser.

    The request is attempted up to three times, pausing 5 seconds between
    attempts. Each attempt is bounded by a 30 second timeout so a stalled
    connection cannot hang a worker forever.

    Returns:
        A BeautifulSoup tree, or None when every attempt failed. Callers must
        handle the None case.
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(link, timeout=30)
        except requests.RequestException as exc:
            # Only network-level failures are retried; programming errors
            # and KeyboardInterrupt propagate.
            print(f"⚠️ Request failed ({attempt}/{max_attempts}) for {link}: {exc}")
            if attempt < max_attempts:
                time.sleep(5)
            continue
        return BeautifulSoup(response.text, "lxml")
    return None


def get_detail(soup, keyword):
    """Return the stripped text of the element following the first matching label cell.

    The label cell is the first ``<td>`` whose text contains *keyword*; the
    value is taken from the element returned by ``find_next()`` on that cell.

    Returns:
        The value text, or "" when *soup* is None, no label cell matches, or
        the label cell has no following element.

    NOTE(review): the match is a substring test, so a short label such as
    "Amar" also matches "Amar Lainnya"; the result depends on which cell comes
    first in the page — confirm against the live markup.
    """
    if soup is None:
        return ""
    label_cell = soup.find(lambda tag: tag.name == "td" and keyword in tag.text)
    if label_cell is None:
        return ""
    value_cell = label_cell.find_next()
    if value_cell is None:
        return ""
    return value_cell.get_text().strip()


def get_pdf(url, path_pdf):
    """Download the file at *url* into *path_pdf* and return it in memory.

    The file is stored under the last path component of *url*.

    Returns:
        ``(io.BytesIO, file_name)`` on success, ``(None, None)`` when the
        download or the write to disk fails.
    """
    file_name = os.path.basename(url)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(url, timeout=60) as response:
            file_content = response.read()
        with open(f"{path_pdf}/{file_name}", "wb") as out_file:
            out_file.write(file_content)
    except (OSError, ValueError) as exc:
        # OSError covers URLError/HTTPError, timeouts and disk errors;
        # ValueError is raised for malformed URLs.
        print(f"⚠️ Could not download {url}: {exc}")
        return None, None
    return io.BytesIO(file_content), file_name


def clean_text(text):
    """Remove the Mahkamah Agung watermark and disclaimer lines from extracted PDF text.

    Each known boilerplate line (including its trailing newline) is deleted
    wherever it occurs; all other text is returned unchanged.
    """
    boilerplate_lines = (
        "M a h ka m a h A g u n g R e p u blik In d o n esia\n",
        "Disclaimer\n",
        "Kepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\n",
        "pelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\n",
        "Dalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\n",
        "Email : kepaniteraan@mahkamahagung.go.id    Telp : 021-384 3348 (ext.318)\n",
    )
    cleaned = text
    for fragment in boilerplate_lines:
        cleaned = cleaned.replace(fragment, "")
    return cleaned


# Serialises CSV writes: run_scraper calls extract_data from several worker
# threads that all append to the same file.
_CSV_WRITE_LOCK = threading.Lock()

# (CSV column, label looked up in the detail table), in output column order.
_DETAIL_FIELDS = (
    ("nomor", "Nomor"),
    ("tingkat_proses", "Tingkat Proses"),
    ("klasifikasi", "Klasifikasi"),
    ("kata_kunci", "Kata Kunci"),
    ("tahun", "Tahun"),
    ("tanggal_register", "Tanggal Register"),
    ("lembaga_peradilan", "Lembaga Peradilan"),
    ("jenis_lembaga_peradilan", "Jenis Lembaga Peradilan"),
    ("hakim_ketua", "Hakim Ketua"),
    ("hakim_anggota", "Hakim Anggota"),
    ("panitera", "Panitera"),
    ("amar", "Amar"),
    ("amar_lainnya", "Amar Lainnya"),
    ("catatan_amar", "Catatan Amar"),
    ("tanggal_musyawarah", "Tanggal Musyawarah"),
    ("tanggal_dibacakan", "Tanggal Dibacakan"),
    ("kaidah", "Kaidah"),
    ("status", "Status"),
    ("abstrak", "Abstrak"),
)


def _fetch_pdf_text(soup, path_pdf):
    """Download the decision PDF linked from *soup* and return (link, file name, cleaned text).

    All three values are "" when the page has no PDF link, the download fails,
    or text extraction fails.
    """
    anchor = soup.find("a", href=re.compile(r"/pdf/"))
    if anchor is None:
        return "", "", ""
    link_pdf = anchor["href"]
    file_pdf, file_name_pdf = get_pdf(link_pdf, path_pdf)
    if file_pdf is None:
        return "", "", ""
    try:
        text_pdf = clean_text(extract_text(file_pdf))
    except Exception as exc:  # the PDF parser can fail in many ways on damaged files
        print(f"⚠️ Teks PDF tidak bisa diekstrak ({link_pdf}): {exc}")
        return "", "", ""
    return link_pdf, file_name_pdf, text_pdf


def extract_data(link, keyword_url, path_output, path_pdf, today):
    """Scrape one decision page and append it as a row to the day's CSV file.

    Args:
        link: URL of the decision detail page.
        keyword_url: Search keyword or full search URL; a keyword becomes part
            of the CSV file name, a URL is replaced by an empty tag.
        path_output: Folder that receives the CSV file.
        path_pdf: Folder that receives the downloaded PDF.
        today: Date string used as the CSV file name suffix.

    Decisions whose status contains "berkekuatan hukum tetap" are skipped, as
    are pages that cannot be opened or that lack the detail table.
    """
    soup = open_page(link)
    table = soup.find("table", {"class": "table"}) if soup is not None else None
    if table is None:
        print(f"⚠️ Dilewati, halaman detail tidak bisa dibuka: {link}")
        return

    heading = table.find("h2")
    judul = heading.text if heading else ""
    row = {"judul": judul}
    for column, label in _DETAIL_FIELDS:
        row[column] = get_detail(table, label)

    # Skip decisions that are already final and binding (inkracht).
    if "berkekuatan hukum tetap" in row["status"].lower():
        print(f"❌ Dilewati karena sudah inkracht: {judul}")
        return

    link_pdf, file_name_pdf, text_pdf = _fetch_pdf_text(soup, path_pdf)
    row["link"] = link
    row["link_pdf"] = link_pdf
    row["file_name_pdf"] = file_name_pdf
    row["text_pdf"] = text_pdf
    result = pd.DataFrame([row])

    file_tag = keyword_url.replace("/", " ")
    if file_tag.startswith("https"):
        file_tag = ""
    destination = f"{path_output}/putusan_ma_{file_tag}_{today}.csv"

    # The header check and the append must be one atomic step, otherwise two
    # threads can both write a header or interleave their rows.
    with _CSV_WRITE_LOCK:
        write_header = not os.path.isfile(destination)
        result.to_csv(destination, mode="a", header=write_header, index=False)


def run_process(keyword_url, page, sort_date, path_output, path_pdf, today):
    """Scrape every decision linked from one page of search results.

    Args:
        keyword_url: Search keyword, or a full search URL starting with "https".
        page: 1-based result page number.
        sort_date: When true, ask the site to sort by decision date, newest first.
        path_output, path_pdf, today: Passed through to extract_data.
    """
    if keyword_url.startswith("https"):
        page_url = f"{keyword_url}&page={page}"
    else:
        page_url = f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword_url}&page={page}"
    if sort_date:
        page_url = f"{page_url}&obf=TANGGAL_PUTUS&obm=desc"

    soup = open_page(page_url)
    if soup is None:
        print(f"⚠️ Skipping page {page}: could not open {page_url}")
        return

    anchors = soup.find_all("a", {"href": re.compile("/direktori/putusan")})
    # A result may be linked more than once on the page; keep the first
    # occurrence of each href so no decision is scraped twice.
    detail_links = list(dict.fromkeys(anchor["href"] for anchor in anchors))

    for detail_link in detail_links:
        try:
            extract_data(detail_link, keyword_url, path_output, path_pdf, today)
        except Exception as exc:
            # One broken decision page must not cost the rest of this page.
            print(f"⚠️ Failed to extract {detail_link}: {exc!r}")


def run_scraper(
    keyword=None,
    url=None,
    sort_date=True,
    download_pdf=True,
    path_output='/content/drive/MyDrive/UTS TKI/CSV 2025',
    path_pdf='/content/drive/MyDrive/UTS TKI/PDF 2025',
):
    """Scrape all result pages of a search and store the decisions as CSV plus PDFs.

    Args:
        keyword: Search keyword; ignored when *url* is given.
        url: Full search URL (takes precedence over *keyword*).
        sort_date: Sort results by decision date, newest first.
        download_pdf: Accepted for compatibility; currently unused, PDFs are
            always downloaded.
        path_output: Folder for the CSV file (created if missing).
        path_pdf: Folder for the downloaded PDFs (created if missing).
    """
    if not keyword and not url:
        print("Please provide a keyword or URL")
        return

    # The workers write into these folders and do not create them themselves.
    os.makedirs(path_output, exist_ok=True)
    os.makedirs(path_pdf, exist_ok=True)
    today = date.today().strftime("%Y-%m-%d")

    link = url if url else f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword}&page=1"
    soup = open_page(link)
    if soup is None:
        print(f"⚠️ Could not open the search page: {link}")
        return

    # A single page of results has no pagination links; default to one page.
    page_numbers = [
        int(number)
        for number in (
            anchor.get("data-ci-pagination-page")
            for anchor in soup.find_all("a", {"class": "page-link"})
        )
        if number and str(number).isdigit()
    ]
    last_page = max(page_numbers, default=1)

    if url:
        print(f"Scraping with url: {url} - {20 * last_page} data - {last_page} page")
    else:
        print(f"Scraping with keyword: {keyword} - {20 * last_page} data - {last_page} page")

    keyword_url = url if url else keyword

    futures = {}
    with ThreadPoolExecutor(max_workers=4) as executor:
        for page in range(1, last_page + 1):
            future = executor.submit(
                run_process, keyword_url, page, sort_date, path_output, path_pdf, today
            )
            futures[future] = page

    # Leaving the with-block waits for all workers; surface any failure
    # instead of dropping it with the future.
    for future, page in futures.items():
        error = future.exception()
        if error is not None:
            print(f"⚠️ Page {page} failed: {error!r}")

In [None]:
# Download the narcotics and psychotropics decisions (putusan) decided in 2025 at the Tangerang District Court
run_scraper(url="https://putusan3.mahkamahagung.go.id/search.html?q=&jenis_doc=putusan&cat=3c40e48bbab311301a21c445b3c7fe57&jd=&tp=&court=097598PN66|097598PN66%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20&t_put=2025&t_reg=&t_upl=&t_pr=")

Scraping with url: https://putusan3.mahkamahagung.go.id/search.html?q=&jenis_doc=putusan&cat=3c40e48bbab311301a21c445b3c7fe57&jd=&tp=&court=097598PN66|097598PN66%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20&t_put=2025&t_reg=&t_upl=&t_pr= - 40 data - 2 page


## Overview.csv

In [None]:
!pip install pdfplumber
!pip install PyPDF2

import pdfplumber
import os
import re
import pandas as pd
from PyPDF2 import PdfReader

# =======================
# FOLDER CONFIGURATION
# =======================
input_folder = '/content/drive/MyDrive/TUGAS 3 UTS TKI'  # Folder holding the 50 decision PDFs
output_csv = '/content/drive/MyDrive/TUGAS 2 UTS TKI/Overview.csv'
# Create the folder of the output CSV up front; a no-op when it already exists.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# =======================
# FUNGSI EKSTRAKSI TEKS
# =======================
def read_pdf_text(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, one page per line block.

    Pages without extractable text are skipped. If reading fails, a warning is
    printed and whatever text was collected before the failure is returned
    (possibly an empty string).
    """
    chunks = []
    try:
        for page in PdfReader(pdf_path).pages:
            extracted = page.extract_text()
            if extracted:
                chunks.append(extracted + "\n")
    except Exception as e:
        print(f"⚠️ Gagal membaca PDF {pdf_path}: {e}")
    return "".join(chunks)

def extract_metadata(text):
    """Extract the decision number, court name and evidence list from decision text.

    Returns:
        A tuple ``(no_putusan, lembaga, barang_bukti)``. Missing values are
        ``''``, except the court, which falls back to 'PN Tidak Diketahui'.
    """
    # Decision number. Prefer a full register number such as
    # "1942/Pid.Sus/2023/PN Tng", including the court code after the space.
    # The keyword is matched case-insensitively; the court code must start
    # with a capital so an ordinary following word is not swallowed.
    match_no = re.search(
        r'\b(?i:nomor|no)\b[\s:.]*'
        r'(\d+/[A-Za-z.\-]+/\d{4}/P[NT](?:[ .][A-Z][A-Za-z.]*)?)',
        text,
    )
    if match_no is None:
        # Fallback: first token after a standalone "nomor"/"no". The word
        # boundaries stop "no" from matching inside other words.
        match_no = re.search(r'\b(?:nomor|no)\b[\s:.]*([^\s;,]*)', text, re.IGNORECASE)
    no_putusan = match_no.group(1).strip().rstrip('.') if match_no else ''

    # Court. Capture only the place name (up to three words) and stop before
    # "yang", punctuation or a line break, so the rest of the sentence
    # ("... yang mengadili perkara ...") is not included.
    match_lembaga = re.search(
        r'pengadilan\s+negeri\s+((?!yang\b)[a-z]+(?:[ \t]+[a-z]+){0,2}?)'
        r'(?=\s+yang\b|\s*[,;.:\n]|\s*$)',
        text,
        re.IGNORECASE,
    )
    if match_lembaga is None:
        match_lembaga = re.search(
            r'pengadilan\s+negeri\s+((?!yang\b)[a-z]+)', text, re.IGNORECASE
        )
    if match_lembaga:
        place = ' '.join(match_lembaga.group(1).split()).title()
        lembaga = f'Pengadilan Negeri {place}'
    else:
        lembaga = 'PN Tidak Diketahui'

    # Evidence. Prefer the itemised list introduced by "barang bukti berupa";
    # the first bare mention of "barang bukti" is usually the boilerplate
    # "... barang bukti yang diajukan di persidangan".
    match_bb = re.search(
        r'barang\s+bukti\s+berupa\s*:?(.*?)(?:menimbang|mengadili)',
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if match_bb is None:
        match_bb = re.search(
            r'barang\s+bukti(.*?)(?:menimbang|mengadili)',
            text,
            re.DOTALL | re.IGNORECASE,
        )
    barang_bukti = match_bb.group(1).strip() if match_bb else ''

    return no_putusan, lembaga, barang_bukti

def extract_amar_putusan(text):
    """Extract the operative part (amar) of a decision.

    The amar starts after the "MENGADILI" heading and ends at the first
    closing phrase ('demikian diputuskan', 'ditetapkan di',
    'panitera pengganti', 'hakim ketua'), or at the end of the text.

    Returns:
        The stripped amar text, or '' when no "mengadili" is found.
    """
    # Prefer the heading proper: upper-case (optionally letter-spaced) and
    # alone on its line. The word also occurs earlier in running text
    # ("... yang mengadili perkara pidana ..."), which is not the amar.
    heading = re.search(
        r'^[ \t]*' + r'[ \t]*'.join('MENGADILI') + r'[ \t]*:?[ \t]*$',
        text,
        re.MULTILINE,
    )
    if heading is None:
        # Fallback: first occurrence in any case, with any spacing.
        heading = re.search(r'\s*'.join('mengadili') + r'\s*:?', text, re.IGNORECASE)
    if heading is None:
        return ''

    amar_text = text[heading.end():]
    closing = re.search(
        r'demikian diputuskan|ditetapkan di|panitera pengganti|hakim ketua',
        amar_text,
        re.IGNORECASE,
    )
    if closing:
        amar_text = amar_text[:closing.start()]
    return amar_text.strip()

# =======================
# EXTRACTION RUN
# =======================
rows = []
print("🚀 Memulai ekstraksi data dari PDF...")

# Keep only PDF files so the "No" column runs 1..N without gaps even when the
# folder also contains other files.
file_list = sorted(
    name for name in os.listdir(input_folder) if name.lower().endswith('.pdf')
)
if not file_list:
    print(f"⚠️ Folder input '{input_folder}' kosong.")

for filename in file_list:
    pdf_path = os.path.join(input_folder, filename)
    text = read_pdf_text(pdf_path)

    # An unreadable or image-only PDF yields no text; do not report it as a
    # success or add an empty row to the dataset.
    if not text.strip():
        print(f"⚠️ Dilewati, tidak ada teks yang bisa dibaca: {filename}")
        continue

    no_putusan, lembaga, barang_bukti = extract_metadata(text)
    amar_putusan = extract_amar_putusan(text)

    rows.append({
        'No': len(rows) + 1,
        'No Putusan': no_putusan,
        'Lembaga Peradilan': lembaga,
        'Barang Bukti': barang_bukti,
        'Amar Putusan': amar_putusan
    })
    print(f"✅ Berhasil ekstrak: {filename}")

# =======================
# SAVE TO CSV
# =======================
if rows:
    df = pd.DataFrame(rows)
    # utf-8-sig writes a byte-order mark at the start of the file.
    df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"\n🎯 Selesai! Data disimpan ke: {output_csv}")
else:
    print("\n⚠️ Tidak ada data yang berhasil diproses.")


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
🚀 Memulai ekstraksi data dari PDF...
✅ Berhasil ekstrak: zaeef323355aacbca849313530353033.pdf
✅ Berhasil ekstrak: zaeef3233622a06e9869313530353035.pdf
✅ Berhasil ekstrak: zaeef323384f470caeed313530353038.pdf
✅ Berhasil ekstrak: zaeef3235ad378e890d0313530363036.pdf
✅ Berhasil ekstrak: zaef00c44e399488b191323332303539.pdf
✅ Berhasil ekstrak: zaef02337c28b8728579313930393232(1).pdf
✅ Berhasil ekstrak: zaef02337c28b8728579313930393232.pdf
✅ Berhasil ekstrak: zaef02337dac3f0c9fba313930393234(1).pdf
✅ Berhasil ekstrak: zaef02337dac3f0c9fba313930393234.pdf
✅ Berhasil ekstrak: zaef02f6fd097670bb7e313832383530(1).pdf
✅ Berhasil ekstrak: zaef02f6fd097670bb7e313832383530

# TUGAS 3  Preprocessing & Indexing berdasarkan file Overview.csv yang sudah dihasilkan dari Tugas 2.

In [None]:

!pip install Sastrawi scikit-learn pandas nltk

# --- Import library ---
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# --- Load the dataset produced by Task 2 ---
file_path = '/content/drive/MyDrive/TUGAS 2 UTS TKI/Overview.csv'
df = pd.read_csv(file_path)

print("✅ Dataset berhasil dimuat!")
print(df.head())

# Stage 1: text preprocessing


# Merge the evidence and amar columns into one document per decision;
# missing values are treated as empty strings.
df['Teks_Gabungan'] = (
    df['Barang Bukti'].fillna('') + ' ' +
    df['Amar Putusan'].fillna('')
)

# Shared resources for preprocess_text: the Sastrawi stemmer and the NLTK
# Indonesian stop-word list (stored as a set).
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))

def preprocess_text(text):
    """Normalise *text* for indexing.

    Steps: lowercase, replace every character other than a-z and whitespace
    with a space, split on whitespace, drop stop words, stem the remaining
    words, and join them with single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    stemmed = []
    for word in letters_only.split():
        # Stop words are filtered on their unstemmed form.
        if word in stop_words:
            continue
        stemmed.append(stemmer.stem(word))
    return ' '.join(stemmed)

# Apply the preprocessing to every document
df['Preprocessed'] = df['Teks_Gabungan'].apply(preprocess_text)

print("\n🧹 Contoh hasil preprocessing:")
print(df[['No', 'Preprocessed']].head())


# Stage 2: indexing with TF-IDF

# Fit the vocabulary on the preprocessed documents; the fitted vectorizer is
# reused by search() to transform queries.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Preprocessed'])

print("\n📈 TF-IDF indexing selesai!")
print("Jumlah dokumen:", tfidf_matrix.shape[0])
print("Jumlah fitur (kata unik):", tfidf_matrix.shape[1])

# Stage 3: simple search function


def search(query, top_n=5):
    """Rank the indexed decisions against *query* by cosine similarity.

    The query goes through the same preprocessing as the documents and is
    projected with the fitted TF-IDF vectorizer.

    Returns:
        A DataFrame with the *top_n* highest-scoring rows and the columns
        No, No Putusan, Lembaga Peradilan, Similarity, Barang Bukti and
        Amar Putusan.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    query_matrix = vectorizer.transform([preprocess_text(query)])
    scores = cosine_similarity(query_matrix, tfidf_matrix).flatten()

    # Work on a copy so the module-level frame does not gain a Similarity column.
    ranked = df.copy()
    ranked['Similarity'] = scores
    ranked = ranked.sort_values(by='Similarity', ascending=False)

    output_columns = ['No', 'No Putusan', 'Lembaga Peradilan', 'Similarity', 'Barang Bukti', 'Amar Putusan']
    return ranked.head(top_n)[output_columns]

# 🧭 Example search run

query = "narkotika sabu-sabu"
hasil = search(query)

print(f"\n🔎 Hasil pencarian untuk query: '{query}'\n")
print(hasil)

# (Optional) Save the DataFrame, including the Teks_Gabungan and Preprocessed
# columns, to CSV. The TF-IDF matrix itself is not written to the file.
df.to_csv('/content/drive/MyDrive/TUGAS 3 UTS TKI/Preprocessed_Index.csv', index=False, encoding='utf-8-sig')
print("\n💾 File hasil preprocessing dan indexing disimpan ke: /content/drive/MyDrive/TUGAS 3 UTS TKI/Preprocessed_Index.csv")


Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


✅ Dataset berhasil dimuat!
   No            No Putusan  \
0   1  1942/Pid.Sus/2023/PN   
1   2  1941/Pid.Sus/2023/PN   
2   3  1940/Pid.Sus/2023/PN   
3   4   275/Pid.Sus/2024/PN   
4   5   264/Pid.Sus/2024/PN   

                                   Lembaga Peradilan  \
0  Pengadilan Negeri  Tangerang Yang Mengadili Pe...   
1  Pengadilan Negeri  Tangerang Yang Mengadili Pe...   
2  Pengadilan Negeri  Tangerang Yang Mengadili Pe...   
3  Pengadilan Negeri Tangerang Yang Mengadili Per...   
4  Pengadilan Negeri  Tangerang Yang Mengadili Pe...   

                                        Barang Bukti  \
0  yang diajukan di persidangan;\nSetelah  menden...   
1  yang diajukan di persidangan;\nSetelah  menden...   
2  yang diajukan di persidangan;\nSetelah  menden...   
3  yang diajukan di persidangan;\nSetelah  menden...   
4  yang diajukan di persidangan;\nSetelah  menden...   

                                        Amar Putusan  
0  perkara pidana dengan\nacara pemeriksaan biasa...  
1 