In [12]:
import pandas as pd
import pickle
import gzip

In [13]:
from collections import defaultdict
from modul.preprocessing import case_folding, tokenizing, stopword_removal, normalization, stemming
from modul.scoring import cosine_score
from modul.spelling_correction import generate_kgrams, jaccard_coefficient

# Input

In [14]:
# Read the raw free-text query from the user; the bare name on the last line
# makes the notebook display the value that was typed.
# NOTE(review): input() waits for keyboard entry, so an unattended
# "Restart Kernel -> Run All" will stop at this cell.
query = input("Input your query: ")
query

'indonsia majju'

# Preprocessing

In [15]:
def preprocessing(query: str) -> list:
  """Run the raw query through the text-preprocessing pipeline.

  The stages are applied strictly in this order, each one receiving the
  output of the previous one: case_folding, tokenizing, stopword_removal,
  normalization, stemming.

  Args:
    query: The raw query text.

  Returns:
    Whatever the final ``stemming`` stage returns (the notebook output shows
    a list of tokens).
  """
  pipeline = (case_folding, tokenizing, stopword_removal, normalization, stemming)
  processed = query
  for stage in pipeline:
    processed = stage(processed)
  return processed

In [16]:
# Replace the raw query string with its preprocessed form and display it.
# NOTE(review): `query` is rebound from the raw input to the pipeline output,
# so this cell is not idempotent - running it a second time feeds the already
# processed value back into preprocessing().
query = preprocessing(query)
query

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Adi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['indonsia', 'majju']

# Read Index

In [17]:
def find_document(query: list) -> pd.DataFrame:
  """Return the metadata rows of every document that contains a query term.

  Both the metadata frame (``index/metadata_index.pkl.gz``) and the inverted
  index (``index/inverted_index.pkl.gz``) are re-read from disk on each call.

  Args:
    query: Preprocessed query terms.

  Returns:
    The rows of the metadata frame whose ``id`` is posted under at least one
    of the query terms; terms missing from the index contribute nothing.
  """
  with gzip.open(filename='index/metadata_index.pkl.gz', mode='rb') as metadata_file:
    metadata: pd.DataFrame = pickle.load(metadata_file)
  with gzip.open(filename='index/inverted_index.pkl.gz', mode='rb') as postings_file:
    postings: defaultdict = pickle.load(postings_file)

  # Test membership before indexing so that looking up an unknown term never
  # inserts a new key into the defaultdict.
  known_postings = [postings[term] for term in query if term in postings]
  matched_ids = set().union(*known_postings)
  return metadata[metadata['id'].isin(matched_ids)]

# Spelling Correction

In [18]:
def spelling_correction(query: list) -> list:
  """Replace each query token with its closest dictionary term by k-gram overlap.

  For every token, the k-gram index is used to gather candidate term ids; each
  candidate term is scored against the token with ``jaccard_coefficient`` and
  the highest-scoring term replaces the token (the first one seen wins ties).
  A token for which no candidate scores above zero is kept unchanged.

  Args:
    query: Preprocessed query tokens.

  Returns:
    A new list with one entry per input token, in the same order.

  Raises:
    OSError: If an index file under ``index/`` cannot be opened.
  """
  # NOTE: pickle.load can execute arbitrary code; only load index files that
  # were produced by a trusted build step.
  with gzip.open(filename='index/dictionary_index.pkl.gz', mode='rb') as f:
    dictionary_index: dict = pickle.load(f)
  with gzip.open(filename='index/kgram_index.pkl.gz', mode='rb') as f:
    kgram_index: dict = pickle.load(f)

  corrected_query = []
  for word in query:
    query_kgrams = generate_kgrams(word)

    # Union of the term ids posted under any k-gram of the token.
    candidate_term_ids = set()
    for kgram in query_kgrams:
      if kgram in kgram_index:
        candidate_term_ids.update(kgram_index[kgram])

    # Reset the winner for every token. A bare annotation (`best_match: str`)
    # binds nothing, which used to raise UnboundLocalError for a token without
    # candidates, or silently reuse the previous token's correction.
    best_match = None
    best_score = 0
    for term_id in candidate_term_ids:
      if term_id not in dictionary_index:
        continue
      term = dictionary_index[term_id]
      score = jaccard_coefficient(query_kgrams, generate_kgrams(term))
      if score > best_score:
        best_match = term
        best_score = score

    corrected_query.append(best_match if best_match else word)
  return corrected_query

In [19]:
# Run spelling correction over the preprocessed query tokens and display the
# corrected token list.
correct_query = spelling_correction(query)
correct_query

['indonsia', 'maju']

# Process

In [20]:
# Echo the corrected query only when spelling correction actually changed it.
if correct_query != query:
  print(correct_query)
# When nothing was corrected the two lists are equal, so searching with
# correct_query covers both cases.
result = find_document(correct_query)
result

['indonsia', 'maju']


Unnamed: 0,id,source,title,url,content,date,text_preprocessed
4,87,tempo,Bamsoet Ajak Komunitas Otomotif Kembangkan Per...,https://nasional.tempo.co/read/1698536/bamsoet...,INFO NASIONAL - Ketua MPR RI sekaligus Ketua U...,2023-03-04 06:38:57+00,"[bamsoet, ajak, komunitas, otomotif, kembang, ..."
29,134,tempo,"Putusan Penundaan Pemilu 2024, Partai Ummat: J...",https://nasional.tempo.co/read/1698871/putusan...,"TEMPO.CO, Jakarta - Politikus Partai Ummat Mus...",2023-03-05 09:42:00+00,"[putus, tunda, milu, 2024, partai, ummat, bole..."
34,103,tempo,Banyak Politikus Pindah Parpol Menjelang Pemil...,https://nasional.tempo.co/read/1698659/banyak-...,"TEMPO.CO, Jakarta -Pengamat politik Adi Prayit...",2023-03-04 15:03:34+00,"[politikus, pindah, parpol, jelang, milu, amat..."
49,41,tempo,"Dapat Gelar Doktor Honoris Causa, Erick Thohir...",https://nasional.tempo.co/read/1698341/dapat-g...,"TEMPO.CO, Jakarta - Menteri Badan Usaha Milik ...",2023-03-03 14:30:00+00,"[gelar, doktor, honoris, causa, erick, thohir,..."
63,144,tempo,"Prabowo Subianto dan Surya Paloh Bertemu, Demo...",https://nasional.tempo.co/read/1698914/prabowo...,"TEMPO.CO, Jakarta - Koordinator Juru Bicara DP...",2023-03-05 13:05:39+00,"[prabowo, subianto, surya, paloh, bertemu, dem..."
...,...,...,...,...,...,...,...
32614,63441,okezone,Perindo Sulteng Gelar Buka Bersama dan Bagikan...,https://news.okezone.com/read/2023/04/11/340/2...,DONGGALA - Partai Perindo menggelar acara buk...,2023-04-11 13:08:44+00,"[perindo, sulteng, gelar, buka, bagi, gerobak,..."
32626,63467,kumparan,"Gelar Safari Ramadhan, Bank BJB Beri Santunan ...",https://kumparan.com/kumparanbisnis/gelar-safa...,PT Bank Pembangunan Daerah Jawa Barat dan Bant...,2023-04-11 13:32:59+00,"[gelar, safari, ramadhan, bank, bjb, santun, a..."
32665,63494,okezone,Puluhan Anak Yatim Piatu Yayasan Rasulullah SA...,https://megapolitan.okezone.com/read/2023/04/1...,JAKARTA - MNC Peduli dan MNC Bank menggelar ...,2023-04-11 14:09:42+00,"[puluh, anak, yatim, piatu, yayasan, rasululla..."
32710,63542,kumparan,Jalin Silaturahmi dan Bangun Kolaborasi Bisnis...,https://kumparan.com/TDA/jalin-silaturahmi-dan...,Jakarta Barat - Komunitas Tangan Di Atas (TDA)...,2023-04-11 17:23:43+00,"[jalin, silaturahmi, bangun, kolaborasi, bisni..."


# Scoring

In [21]:
def scoring_document(query: list, result: pd.DataFrame, top_k: int = 10):
  """Score candidate documents against the query via ``cosine_score``.

  The term dictionary (``index/dictionary_index.pkl.gz``) and the tf-idf index
  (``index/tfidf_index.pkl.gz``) are read from disk on every call and handed
  to ``cosine_score`` together with the arguments.

  Args:
    query: Query terms to score with.
    result: Candidate documents, passed through to ``cosine_score`` unchanged.
    top_k: Forwarded to ``cosine_score`` as its last argument.

  Returns:
    Whatever ``cosine_score`` returns (the notebook output shows a DataFrame
    with an added ``cosine_score`` column).
  """
  def _load_index(path: str):
    # Each index is stored as a gzip-compressed pickle.
    with gzip.open(filename=path, mode='rb') as handle:
      return pickle.load(handle)

  term_dictionary = _load_index('index/dictionary_index.pkl.gz')
  tfidf_weights = _load_index('index/tfidf_index.pkl.gz')
  return cosine_score(query, result, term_dictionary, tfidf_weights, top_k)

In [22]:
# Score the retrieved documents against the corrected query, asking for the
# top 10, and display the outcome.
# NOTE(review): this overwrites `result` from the retrieval cell, so running
# this cell a second time scores the already-scored output instead of the full
# candidate set.
result = scoring_document(correct_query, result, top_k=10)
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_results['cosine_score'] = pd.Series(top_results['id'].map(dict(top_docs)))


Unnamed: 0,id,source,title,url,content,date,text_preprocessed,cosine_score
13789,30652,kumparan,Menhub: Libur dan Cuti Lebaran Jadi 19-25 Apri...,https://kumparan.com/kumparanvideo/menhub-libu...,Menhub Budi Karya Sumadi mengumumkan cuti bers...,2023-03-24 12:51:32+00,"[menhub, libur, cuti, lebaran, 19, 25, april, ...",0.242024
18182,34946,okezone,Menhub Minta THR Karyawan Cair 18 April 2023,https://economy.okezone.com/read/2023/03/24/32...,JAKARTA - Menteri Perhubungan Budi Karya Sumad...,2023-03-24 10:54:54+00,"[menhub, thr, karyawan, cair, 18, april, 2023,...",0.191224
16438,32977,cnbcindonesia,Hore! Jokowi Tambah Cuti Bersama Lebaran 2023,https://www.cnbcindonesia.com/news/20230324155...,"Jakarta, CNBC Indonesia - Menteri Perhubungan ...",2023-03-24 08:53:10+00,"[hore, jokowi, cuti, lebaran, 2023, jakarta, c...",0.184944
12591,33000,cnbcindonesia,"Cuti Bersama Lebaran Ditambah dan Dimajukan, I...",https://www.cnbcindonesia.com/news/20230324160...,"Jakarta, CNBC Indonesia - Pemerintah telah mem...",2023-03-24 09:06:36+00,"[cuti, lebaran, tambah, maju, alas, jakarta, c...",0.178775
17984,34756,cnbcindonesia,Ini Lho! Alasan Jokowi Tambah Cuti Bersama Leb...,https://www.cnbcindonesia.com/news/20230326100...,"Jakarta, CNBC Indonesia - Presiden Joko Widodo...",2023-03-26 04:44:52+00,"[lho, alas, jokowi, cuti, lebaran, jakarta, cn...",0.174839
18510,35294,suara,Gebrakan Jokowi di Bulan Ramadhan: Larang Peja...,https://www.suara.com/news/2023/03/26/114500/g...,Suara.com - Presiden Joko Widodo mengeluarkan ...,2023-03-26 04:45:00+00,"[gebrakan, jokowi, ramadhan, larang, pejabat, ...",0.173747
32158,62971,kumparan,Catatan Calon Pendidik: Apakah Masa Depan Masi...,https://kumparan.com/nazwa-guseynova-kamila/ca...,Pendidikan yang bermutu dan berkualitas merupa...,2023-04-11 07:42:15+00,"[catat, calon, didik, butuh, guru, saya, pendi...",0.172458
12156,19452,cnnindonesia,Bawaslu Tegaskan Menteri Harus Cuti Jika Ingin...,https://www.cnnindonesia.com/nasional/20230318...,Badan Pengawas Pemilu ( Bawaslu ) menegaskan m...,2023-03-18 08:35:00+00,"[bawaslu, tegas, menteri, cuti, nyapres, badan...",0.171271
11606,30361,kumparan,Menhub Sebut Libur dan Cuti Bersama Lebaran 19...,https://kumparan.com/kumparannews/menhub-sebut...,Menhub Budi Karya Sumadi mengumumkan cuti bers...,2023-03-24 09:28:03+00,"[menhub, libur, cuti, lebaran, 19, 25, april, ...",0.161187
28576,57372,kumparan,Zulhas: Koalisi Besar di Bawah Orkestra Komand...,https://kumparan.com/kumparannews/zulhas-koali...,Ketum Zulkifli Hasan (Zulhas) tak menampik ren...,2023-04-08 11:26:15+00,"[zulhas, koalisi, orkestra, komando, jokowi, t...",0.160786
