In [5]:
import pandas as pd
import pickle
import json
from rank_bm25 import BM25Okapi

def load_json_lines(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # (assumes the file was written as UTF-8 -- confirm against the exporter).
    with open(path, encoding='utf-8') as handle:
        # Blank lines (e.g. a trailing newline) are skipped rather than fatal.
        return [json.loads(line) for line in handle if line.strip()]


def rank_filtered_scores(scored_ids, allowed_ids):
    """Keep the scored documents whose id is allowed, best score first.

    Args:
        scored_ids: Iterable of ``(doc_id, score)`` pairs.
        allowed_ids: Iterable of hashable document ids to keep.

    Returns:
        A new list of ``(doc_id, score)`` pairs restricted to ``allowed_ids``
        and sorted by descending score. Pairs with equal scores keep the
        relative order they had in ``scored_ids`` (the sort is stable).
    """
    # Build the lookup set once so each membership test is O(1) instead of a
    # full scan of the id column for every scored document.
    allowed = set(allowed_ids)
    ranked = [(doc_id, score) for doc_id, score in scored_ids if doc_id in allowed]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked


# The guard keeps the helpers above importable without touching the data
# files. In a notebook or when run as a script __name__ is "__main__", so the
# statements below still execute and still bind module-level names.
if __name__ == "__main__":
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load pickles produced by this project, never untrusted ones.

    # Load BM25 model
    with open('../bm25_model.pkl', 'rb') as f:
        bm25 = pickle.load(f)

    # Load BERT embeddings (used below as a DataFrame with an 'id' column)
    with open('../bert_embeddings.pkl', 'rb') as f:
        df = pickle.load(f)

    # Load metadata
    meta_data = load_json_lines('../data/json/all_meta.json')
    meta_df = pd.DataFrame(meta_data)

    # Extract BM25 model and IDs
    bm25_model = bm25['bm25']
    bm25_id = bm25['id']

    # Filter metadata by 'verdict'
    meta_df_filtered = meta_df[meta_df['verdict'] == "guilty"]
    print(meta_df_filtered)

    # Tokenize query
    tokenized_query = "pembunuhan".split()

    # Get BM25 scores and pair each one with its document ID
    scores = list(zip(bm25_id, bm25_model.get_scores(tokenized_query)))

    # Keep only IDs present in the filtered metadata, best match first.
    # .tolist() yields plain Python values suitable for set membership.
    filtered_scores = rank_filtered_scores(scores, meta_df_filtered['id'].tolist())
    print(filtered_scores)

    # Check if any scores are left after filtering
    if not filtered_scores:
        print("No matching documents found after filtering.")
    else:
        for idx, (best_id, score) in enumerate(filtered_scores):
            # Join the embedding row(s) and the metadata row(s) for this ID.
            df_filtered = df[df['id'] == best_id]
            meta_df_best = meta_df_filtered[meta_df_filtered['id'] == best_id]

            best_doc = pd.merge(df_filtered, meta_df_best, on='id')
            # Nothing is printed per result yet; after the loop `best_doc`
            # holds the merge for the last (lowest-ranked) ID.

    verdict  indictment  lawyer    id  owner
0    guilty    subsider   False  2743  agree
1    guilty  alternatif   False  2744  agree
2    guilty     tunggal   False  2745  agree
3    guilty          NA   False  2746  agree
4    guilty     tunggal   False  2747  agree
..      ...         ...     ...   ...    ...
988  guilty     tunggal   False  2738  jafar
989  guilty     tunggal   False  2739  jafar
990  guilty          NA   False  2740  jafar
991  guilty  alternatif   False  2741  jafar
992  guilty     tunggal   False  2742  jafar

[954 rows x 5 columns]
[(3069, 10.816137455900497), (3197, 10.492215397103097), (2956, 7.794900408858748), (2478, 6.985690737020457), (2675, 4.2754324814415225), (2464, 3.188199098611991), (3271, 1.8740252950340333), (2743, 0.0), (2744, 0.0), (2745, 0.0), (2746, 0.0), (2747, 0.0), (2748, 0.0), (2749, 0.0), (2750, 0.0), (2751, 0.0), (2752, 0.0), (2753, 0.0), (2754, 0.0), (2755, 0.0), (2756, 0.0), (2757, 0.0), (2758, 0.0), (2759, 0.0), (2760, 0.0), (2761, 0