In [18]:
import pandas as pd
import pickle
import gzip

In [19]:
from collections import defaultdict
from modul.preprocessing import case_folding, tokenizing, stopword_removal, normalization, stemming
from modul.scoring import cosine_score
from modul.spelling_correction import generate_kgrams, jaccard_coefficient

# Input

In [20]:
# Prompt for the raw search query; the bare name below echoes it as the cell output.
# NOTE(review): input() blocks until the user types, so an unattended "Run All" stops here.
query = input("Input your query: ")
query

'Berita terbaru politik Indonesia'

# Preprocessing

In [21]:
def preprocessing(query: str) -> list:
  """Run a raw query string through the text-preprocessing pipeline.

  The stages are applied strictly in this order, each one receiving the
  output of the previous one: case_folding, tokenizing, stopword_removal,
  normalization, stemming.

  Args:
    query: The raw query text.

  Returns:
    Whatever the final stemming stage returns (annotated as a list).
  """
  # Order matters: every stage consumes the previous stage's output.
  stages = (case_folding, tokenizing, stopword_removal, normalization, stemming)
  processed = query
  for stage in stages:
    processed = stage(processed)
  return processed

In [22]:
# Replace the raw query string with the value returned by preprocessing().
# NOTE(review): this rebinds `query` to the pipeline output, so re-running this cell
# on its own feeds that output back into preprocessing() instead of the raw string.
query = preprocessing(query)
query

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Adi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['berita', 'baru', 'politik', 'indonesia']

# Read Index

In [23]:
def find_document(query: list) -> pd.DataFrame:
  """Return the metadata rows of every document that contains a query term.

  Both index files are read from disk on every call. A document is selected
  when its 'id' appears in the inverted-index entry of at least one query
  term; terms missing from the inverted index are ignored.

  Args:
    query: Query terms to look up in the inverted index.

  Returns:
    The rows of the metadata frame whose 'id' is among the collected ids.
  """
  with gzip.open(filename='index/metadata_index.pkl.gz', mode='rb') as metadata_file:
    metadata: pd.DataFrame = pickle.load(metadata_file)
  with gzip.open(filename='index/inverted_index.pkl.gz', mode='rb') as postings_file:
    postings: defaultdict = pickle.load(postings_file)

  # Union of the id collections of all terms that are present in the index.
  matching_ids = set().union(
    *(postings[term] for term in query if term in postings)
  )
  id_mask = metadata['id'].isin(matching_ids)
  return metadata[id_mask]

# Spelling Correction

In [24]:
def spelling_correction(query: list) -> list:
  """Replace each query term with its closest dictionary term by k-gram overlap.

  For every word, the k-gram index is used to collect candidate term ids.
  Each candidate that exists in the dictionary index is scored against the
  word with jaccard_coefficient over their k-grams, and the highest-scoring
  candidate wins (the first one encountered wins ties).

  Args:
    query: Query terms to check, typically the output of preprocessing().

  Returns:
    A list with one entry per input term: the best-scoring dictionary term,
    or the original word when no candidate scores above zero.
  """
  # NOTE(review): pickle.load can execute arbitrary code; only load index
  # files that were produced by this project.
  with gzip.open(filename='index/dictionary_index.pkl.gz', mode='rb') as f:
    dictionary_index: dict = pickle.load(f)
  with gzip.open(filename='index/kgram_index.pkl.gz', mode='rb') as f:
    kgram_index: dict = pickle.load(f)

  corrected_query = []
  for word in query:
    query_kgrams = generate_kgrams(word)
    candidate_term_ids = set()
    for kgram in query_kgrams:
      if kgram in kgram_index:
        candidate_term_ids.update(kgram_index[kgram])

    # Reset the winner for every word. A bare `best_match: str` annotation
    # binds nothing, so a word without candidates would either raise
    # UnboundLocalError or inherit the previous word's correction.
    best_match = None
    best_score = 0
    for term_id in candidate_term_ids:
      if term_id not in dictionary_index:
        continue
      term = dictionary_index[term_id]
      score = jaccard_coefficient(query_kgrams, generate_kgrams(term))
      if score > best_score:
        best_match = term
        best_score = score

    # Keep the original word when nothing scored above zero.
    corrected_query.append(best_match if best_match else word)
  return corrected_query

In [25]:
# Run spelling correction on the preprocessed query and display the returned list.
correct_query = spelling_correction(query)
correct_query

['berita', 'baru', 'politik', 'indonesia']

# Process

In [26]:
# Show the corrected query only when spelling correction actually changed it.
if correct_query != query:
  print(correct_query)
# When nothing changed the two lists are equal, so one lookup covers both cases.
result = find_document(correct_query)
result

Unnamed: 0,id,source,title,url,content,date,text_preprocessed
0,83,tempo,"Depo Plumpang Terbakar, Anggota DPR Minta Pert...",https://nasional.tempo.co/read/1698528/depo-pl...,"TEMPO.CO, Jakarta - Anggota Komisi VII DPR RI ...",2023-03-04 06:18:13+00,"[depo, plumpang, bakar, anggota, dpr, pertamin..."
1,84,tempo,Jokowi Perintahkan Wapres Ma'ruf Amin Tinjau L...,https://nasional.tempo.co/read/1698522/jokowi-...,"TEMPO.CO, Jakarta - Presiden Joko Widodo atau ...",2023-03-04 06:04:38+00,"[jokowi, perintah, wapres, ma, ruf, amin, tinj..."
2,85,tempo,HNW Mendukung Jamaah Umroh First Travel Dapatk...,https://nasional.tempo.co/read/1698527/hnw-men...,INFO NASIONAL - Wakil Ketua MPR RI Dr. H. M. H...,2023-03-04 06:18:04+00,"[hnw, dukung, jamaah, umroh, first, travel, da..."
3,86,tempo,Tim Dokkes Polri Telah Terima 14 Kantong Jenaz...,https://nasional.tempo.co/read/1698540/tim-dok...,"TEMPO.CO, Jakarta - Tim Kedokteran dan Kesehat...",2023-03-04 06:44:10+00,"[tim, dokkes, polri, terima, 14, kantong, jena..."
4,87,tempo,Bamsoet Ajak Komunitas Otomotif Kembangkan Per...,https://nasional.tempo.co/read/1698536/bamsoet...,INFO NASIONAL - Ketua MPR RI sekaligus Ketua U...,2023-03-04 06:38:57+00,"[bamsoet, ajak, komunitas, otomotif, kembang, ..."
...,...,...,...,...,...,...,...
32720,63553,okezone,Momen Menegangkan Paspamres Nyaris Baku Tembak...,https://nasional.okezone.com/read/2023/04/12/3...,PASPAMPRES adalah pasukan yang bertugas melaks...,2023-04-11 19:04:00+00,"[momen, tegang, paspamres, baku, tembak, agen,..."
32725,63558,cnbcindonesia,Cek Saldo! Sri Mulyani Sudah Transfer THR ke 2...,https://www.cnbcindonesia.com/news/20230411194...,"Jakarta, CNBC Indonesia - Pemerintah melalui K...",2023-04-11 19:30:35+00,"[cek, saldo, sri, mulyani, transfer, thr, 2, 3..."
32728,63561,okezone,Humor Gus Dur: Puasa Rajab Ala Gus Dur yang Me...,https://nasional.okezone.com/read/2023/04/11/3...,JAKARTA Presiden ke-4 RI KH Abdurrahman Wahid...,2023-04-11 20:01:29+00,"[humor, gus, dur, puasa, rajab, ala, gus, dur,..."
32729,63562,okezone,Kisah Keji PKI Bantai Pelajar Sekaligus Atlet ...,https://nasional.okezone.com/read/2023/04/11/3...,PELAJAR serta atlet peraih emas Pekan Olahraga...,2023-04-11 20:04:13+00,"[kisah, keji, pki, bantai, pelajar, atlet, aih..."


# Scoring

In [13]:
def scoring_document(query: list, result: pd.DataFrame, top_k: int = 10):
  """Load the dictionary and TF-IDF indexes and delegate ranking to cosine_score.

  Both index files are read from disk on every call.

  Args:
    query: Query terms to score against.
    result: Candidate documents, passed to cosine_score unchanged.
    top_k: Passed to cosine_score unchanged; presumably the number of top
      documents to keep (TODO confirm against modul.scoring).

  Returns:
    Whatever cosine_score returns for these arguments.
  """
  loaded_indexes = []
  # Order matters: dictionary first, then TF-IDF, matching the call below.
  for index_path in ('index/dictionary_index.pkl.gz', 'index/tfidf_index.pkl.gz'):
    with gzip.open(filename=index_path, mode='rb') as index_file:
      loaded_indexes.append(pickle.load(index_file))
  term_dictionary, tfidf_weights = loaded_indexes
  return cosine_score(query, result, term_dictionary, tfidf_weights, top_k)

In [14]:
# Score the candidate documents against the corrected query and display the outcome.
# NOTE(review): this overwrites `result` with the return value of scoring_document(),
# so re-running the cell scores that returned value instead of the original candidates.
# The execution counts here (13, 14) also follow In [26], which suggests the cells
# were run out of order; verify with Restart Kernel -> Run All.
result = scoring_document(correct_query, result, top_k=10)
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_results['cosine_score'] = pd.Series(top_results['id'].map(dict(top_docs)))


Unnamed: 0,id,source,title,url,content,date,text_preprocessed,cosine_score
28934,58348,okezone,"Usai Bertemu Prabowo, Perindo Sambangi Golkar ...",https://nasional.okezone.com/read/2023/04/08/3...,JAKARTA - Usai bertemu Ketua Umum DPP Partai G...,2023-04-08 06:38:55+00,"[bertemu, prabowo, perindo, sambang, golkar, s...",0.28392
11114,25157,jawapos,Mahfud Tak Permasalahkan Rumah Ibadah Dijadika...,https://www.jawapos.com/nasional/politik/21/03...,"Menteri Koordinator Bidang Politik, Hukum dan...",2023-03-21 08:23:07+00,"[mahfud, masalah, rumah, ibadah, jadi, diskusi...",0.263272
34,103,tempo,Banyak Politikus Pindah Parpol Menjelang Pemil...,https://nasional.tempo.co/read/1698659/banyak-...,"TEMPO.CO, Jakarta -Pengamat politik Adi Prayit...",2023-03-04 15:03:34+00,"[politikus, pindah, parpol, jelang, milu, amat...",0.248923
23775,48010,cnnindonesia,AHY Sebut RI Dalam Bahaya Jika Ada Intervensi ...,https://www.cnnindonesia.com/nasional/20230403...,Ketua Umum Partai Demokrat Agus Harimurti Yudh...,2023-04-03 15:30:09+00,"[ahy, ri, bahaya, intervensi, politik, pk, moe...",0.24123
16737,39854,cnbcindonesia,Jokowi Pastikan Israel Tetap Ikut Piala Dunia ...,https://www.cnbcindonesia.com/news/20230328194...,"Jakarta, CNBC Indonesia - Presiden Joko Widodo...",2023-03-28 13:30:44+00,"[jokowi, pasti, israel, piala, dunia, u, 20, j...",0.233599
17505,34243,kumparan,Survei Indikator: Ganjar Unggul dalam Simulasi...,https://kumparan.com/kumparannews/survei-indik...,Nama Ganjar Pranowo masih berada di posisi ter...,2023-03-26 10:41:34+00,"[survei, indikator, ganjar, unggul, simulasi, ...",0.232858
17771,40476,suara,Survei Indikator: Ganjar Unggul Dalam Simulasi...,https://www.suara.com/bisnis/2023/03/27/095955...,Suara.com - Nama Ganjar Pranowo masih berada d...,2023-03-27 02:59:55+00,"[survei, indikator, ganjar, unggul, simulasi, ...",0.232554
18776,35560,okezone,Mahfud MD Bilang Tempat Ibadah Boleh Digunakan...,https://nasional.okezone.com/read/2023/03/26/3...,"JAKARTA - Menko Polhukam Mahfud MD mengatakan,...",2023-03-26 02:40:54+00,"[mahfud, md, bilang, ibadah, giat, politik, ta...",0.232311
29238,58930,kumparan,"Pesta Demokrasi Kian Dekat, Bagaimana dengan G...",https://kumparan.com/dani-fazli/pesta-demokras...,"Dalam berbagai berita yang beredar, seperti ya...",2023-04-09 12:03:28+00,"[pesta, demokrasi, kian, dekat, generasi, muda...",0.227385
9267,24848,kumparan,Memahami Sistem Kepartaian Indonesia Pada Masa...,https://kumparan.com/berita-terkini/memahami-s...,"Sebagaimana yang diketahui, merdeka pada tangg...",2023-03-21 13:25:29+00,"[paham, sistem, partai, indonesia, demokrasi, ...",0.226347
