In [84]:
import pandas as pd
import pickle
import gzip

In [85]:
from collections import defaultdict
from modul.preprocessing import case_folding, tokenizing, stopword_removal, normalization, stemming
from modul.scoring import cosine_score
from modul.spelling_correction import generate_kgrams, jaccard_coefficient

# Input

In [86]:
# Read the raw query string interactively; the bare `query` on the last line
# echoes it as the cell output. Because the value comes from input(), a
# Restart & Run All pauses here until something is typed.
query = input("Input your query: ")
query

'Harga minyak dunia hari ini'

# Preprocessing

In [87]:
def preprocessing(query: str) -> list:
  """Run a raw query through the preprocessing pipeline.

  The query is passed through case_folding, tokenizing, stopword_removal,
  normalization and stemming, in that order; each step receives the value
  returned by the previous one.

  Args:
    query: The raw query text.

  Returns:
    Whatever the final step (stemming) returns; annotated as a list.
  """
  # Order matters: every step consumes the output of the one before it.
  pipeline = (case_folding, tokenizing, stopword_removal, normalization, stemming)
  processed = query
  for step in pipeline:
    processed = step(processed)
  return processed

In [88]:
# Rebinds `query` from the raw string to the value returned by preprocessing().
# The cell is therefore not idempotent: re-running it without re-running the
# input cell feeds the already-processed value back into preprocessing().
query = preprocessing(query)
query

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Adi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['harga', 'minyak', 'dunia']

# Read Index

In [89]:
def find_document(query: list) -> pd.DataFrame:
  """Return the metadata rows of every document that contains any query term.

  Both the metadata frame and the inverted index are unpickled from the
  gzip files under ``index/`` on each call.

  Args:
    query: Terms to look up in the inverted index.

  Returns:
    The rows of the metadata frame whose ``id`` appears in the postings of
    at least one query term (terms missing from the index are ignored).
  """
  with gzip.open(filename='index/metadata_index.pkl.gz', mode='rb') as meta_file:
    documents: pd.DataFrame = pickle.load(meta_file)
  with gzip.open(filename='index/inverted_index.pkl.gz', mode='rb') as index_file:
    postings: defaultdict = pickle.load(index_file)
  # Membership is tested before indexing so unknown terms are skipped rather
  # than looked up; the union gives "any term matches" semantics.
  matched_ids = set().union(*(postings[term] for term in query if term in postings))
  return documents[documents['id'].isin(matched_ids)]

# Spelling Correction

In [90]:
def spelling_correction(query: list) -> list:
  """Replace each query term with the most similar dictionary term.

  For every word, its k-grams are looked up in the k-gram index to collect
  candidate term ids; each candidate term is then scored against the word
  with jaccard_coefficient over their k-grams, and the highest-scoring term
  wins. The first candidate reaching the best score is kept on ties.

  Args:
    query: The (preprocessed) query terms.

  Returns:
    A list of the same length as ``query``. A word is kept unchanged when no
    candidate scores above 0 (or the best candidate is an empty string).
  """
  # NOTE: pickle.load can execute arbitrary code; only load index files that
  # were produced by this project.
  with gzip.open(filename='index/dictionary_index.pkl.gz', mode='rb') as f:
    dictionary_index: dict = pickle.load(f)  # term id -> term
  with gzip.open(filename='index/kgram_index.pkl.gz', mode='rb') as f:
    kgram_index: dict = pickle.load(f)  # k-gram -> iterable of term ids

  corrected_query = []
  for word in query:
    query_kgrams = generate_kgrams(word)
    candidate_term_ids = set()
    for kgram in query_kgrams:
      if kgram in kgram_index:
        candidate_term_ids.update(kgram_index[kgram])

    # Reset per word. A bare `best_match: str` annotation binds nothing, so
    # without this an unmatched word either raises UnboundLocalError or
    # inherits the previous word's correction.
    best_match = None
    best_score = 0
    for term_id in candidate_term_ids:
      if term_id not in dictionary_index:
        continue
      term = dictionary_index[term_id]
      score = jaccard_coefficient(query_kgrams, generate_kgrams(term))
      if score > best_score:
        best_match = term
        best_score = score
    corrected_query.append(best_match if best_match else word)
  return corrected_query

In [91]:
# Spell-correct the preprocessed terms. The result is kept under its own name
# so it can be compared against `query` to detect whether anything changed.
correct_query = spelling_correction(query)
correct_query

['harga', 'minyak', 'dunia']

# Process

In [92]:
# Show the corrected terms only when spelling correction changed something.
if correct_query != query:
  print(correct_query)
# When nothing changed the two lists are equal, so a single lookup on the
# corrected terms covers both cases.
result = find_document(correct_query)
result

Unnamed: 0,id,source,title,url,content,date,text_preprocessed
0,83,tempo,"Depo Plumpang Terbakar, Anggota DPR Minta Pert...",https://nasional.tempo.co/read/1698528/depo-pl...,"TEMPO.CO, Jakarta - Anggota Komisi VII DPR RI ...",2023-03-04 06:18:13+00,"[depo, plumpang, bakar, anggota, dpr, pertamin..."
1,84,tempo,Jokowi Perintahkan Wapres Ma'ruf Amin Tinjau L...,https://nasional.tempo.co/read/1698522/jokowi-...,"TEMPO.CO, Jakarta - Presiden Joko Widodo atau ...",2023-03-04 06:04:38+00,"[jokowi, perintah, wapres, ma, ruf, amin, tinj..."
3,86,tempo,Tim Dokkes Polri Telah Terima 14 Kantong Jenaz...,https://nasional.tempo.co/read/1698540/tim-dok...,"TEMPO.CO, Jakarta - Tim Kedokteran dan Kesehat...",2023-03-04 06:44:10+00,"[tim, dokkes, polri, terima, 14, kantong, jena..."
4,87,tempo,Bamsoet Ajak Komunitas Otomotif Kembangkan Per...,https://nasional.tempo.co/read/1698536/bamsoet...,INFO NASIONAL - Ketua MPR RI sekaligus Ketua U...,2023-03-04 06:38:57+00,"[bamsoet, ajak, komunitas, otomotif, kembang, ..."
5,92,tempo,"Korban Tewas Kebakaran Depo Plumpang 17 Orang,...",https://nasional.tempo.co/read/1698585/korban-...,"TEMPO.CO, Jakarta - Wakil Presiden Ma'ruf Amin...",2023-03-04 09:40:00+00,"[korban, tewas, bakar, depo, plumpang, 17, ora..."
...,...,...,...,...,...,...,...
32713,63545,suara,Niat Zakat Fitrah untuk Diri Sendiri dan Kelua...,https://www.suara.com/news/2023/04/12/011643/n...,Suara.com - Zakat fitrah merupakan kewajiban b...,2023-04-11 18:16:43+00,"[niat, zakat, fitrah, keluarga, lengkap, suara..."
32718,63551,okezone,Mengenal Legiun Asing Pertama di Dunia dan Kip...,https://nasional.okezone.com/read/2023/04/11/3...,JAKARTA - Koninklijke Nedelandsch Indisch Lege...,2023-04-11 19:03:27+00,"[kenal, legiun, asing, dunia, kiprah, nusantar..."
32720,63553,okezone,Momen Menegangkan Paspamres Nyaris Baku Tembak...,https://nasional.okezone.com/read/2023/04/12/3...,PASPAMPRES adalah pasukan yang bertugas melaks...,2023-04-11 19:04:00+00,"[momen, tegang, paspamres, baku, tembak, agen,..."
32724,63557,cnnindonesia,Cara Menghitung Fidyah Bagi Muslim yang Tak Bi...,https://www.cnnindonesia.com/ekonomi/202304111...,Umat muslim wajib berpuasa saat Ramadan tiba. ...,2023-04-11 19:20:02+00,"[hitung, fidyah, muslim, puasa, umat, muslim, ..."


# Scoring

In [93]:
def scoring_document(query: list, result: pd.DataFrame, top_k: int = 10):
  """Load the dictionary and TF-IDF indexes and delegate ranking to cosine_score.

  Args:
    query: The query terms to score against.
    result: The candidate documents to score.
    top_k: Forwarded unchanged to cosine_score (default 10).

  Returns:
    Whatever cosine_score returns for
    ``(query, result, dictionary_index, tfidf_index, top_k)``.
  """
  # Loaded in this order on every call: dictionary first, then TF-IDF.
  index_paths = ('index/dictionary_index.pkl.gz', 'index/tfidf_index.pkl.gz')
  loaded = []
  for path in index_paths:
    with gzip.open(filename=path, mode='rb') as handle:
      loaded.append(pickle.load(handle))
  dictionary_index, tfidf_index = loaded
  return cosine_score(query, result, dictionary_index, tfidf_index, top_k)

In [94]:
# Score the retrieved documents against the spell-corrected terms (top_k=10).
# Rebinds `result`, so re-running this cell scores the already-scored frame;
# re-run the retrieval cell first.
# NOTE(review): the SettingWithCopyWarning in the output points at a
# `top_results[...] = ...` line that is not in this notebook, presumably inside
# cosine_score — confirm and fix it there.
result = scoring_document(correct_query, result, top_k=10)
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_results['cosine_score'] = pd.Series(top_results['id'].map(dict(top_docs)))


Unnamed: 0,id,source,title,url,content,date,text_preprocessed,cosine_score
15628,28749,cnbcindonesia,"Minyak Mentah Dunia 'Nyungsep', Harga BBM Apri...",https://www.cnbcindonesia.com/news/20230322103...,"Jakarta, CNBC Indonesia - Saat ini harga minya...",2023-03-22 05:10:00+00,"[minyak, mentah, dunia, nyungsep, harga, bbm, ...",0.523786
14978,31352,cnbcindonesia,"Minyak Dunia Ambles, Harga BBM Pertalite Bisa ...",https://www.cnbcindonesia.com/news/20230324093...,"Jakarta, CNBC Indonesia - Pemerintah dalam hal...",2023-03-24 03:25:00+00,"[minyak, dunia, ambles, harga, bbm, pertalite,...",0.501057
11663,25522,cnbcindonesia,"Minyak Amrbuk, Harga BBM Pertalite April Bisa ...",https://www.cnbcindonesia.com/news/20230321111...,"Jakarta, CNBC Indonesia - Pemerintah dalam hal...",2023-03-21 04:13:35+00,"[minyak, amrbuk, harga, bbm, pertalite, april,...",0.451683
30517,60887,okezone,Harga Pertalite Bisa Turun di Bawah Rp10.000/L...,https://economy.okezone.com/read/2023/04/10/32...,JAKARTA Harga BBM Pertalite bisa turun di baw...,2023-04-10 09:49:13+00,"[harga, pertalite, turun, rp10, 000, liter, sy...",0.431939
25668,51287,kumparan,"Harga Minyak Dunia Fluktuatif, Aspebindo Saran...",https://kumparan.com/kumparanbisnis/harga-miny...,mentah dunia terus berfluktuasi dalam beberapa...,2023-04-05 07:06:29+00,"[harga, minyak, dunia, fluktuatif, aspebindo, ...",0.415123
10254,17562,okezone,Peringkat Kredit Arab Saudi Naik Usai Reformas...,https://economy.okezone.com/read/2023/03/18/32...,JAKARTA - S&P Global menaikkan peringkat kredi...,2023-03-18 02:01:49+00,"[peringkat, kredit, arab, saudi, reformasi, se...",0.407341
30618,61078,tempo,Pemerintah Sebut Harga Pertalite Bisa Diturunk...,https://bisnis.tempo.co/read/1713428/pemerinta...,"TEMPO.CO, Jakarta - Direktur Jenderal Minyak d...",2023-04-10 12:32:26+00,"[perin, harga, pertalite, turun, syarat, tempo...",0.403239
18774,35566,suara,Pidato Menteri Energi AS Jungkalkan Harga Miny...,https://www.suara.com/bisnis/2023/03/24/091819...,Suara.com - Harga minyak dunia turun pada Kami...,2023-03-24 02:18:19+00,"[pidato, menteri, energi, as, jungkal, harga, ...",0.400374
29447,59334,cnnindonesia,Data Lowongan Kerja AS Tekan Harga Minyak,https://www.cnnindonesia.com/ekonomi/202304050...,Harga minyak melemah di awal perdagangan Kami...,2023-04-06 00:23:24+00,"[data, lowong, kerja, as, tekan, harga, minyak...",0.391505
25978,52078,cnnindonesia,"Puas Melesat, Harga Minyak Stabil Pagi Ini",https://www.cnnindonesia.com/ekonomi/202304040...,Harga minyak dunia stabil di awal perdagangan ...,2023-04-04 01:42:35+00,"[puas, melesat, harga, minyak, stabil, pagi, h...",0.388548
