In [None]:
# https://www.sbert.net/docs/training/overview.html
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/nli/training_nli_v2.py

# Package

In [None]:
# Install the third-party packages this notebook imports.
# Only sentence-transformers is pinned to a version; rank-bm25, sastrawi and
# swifter are installed without a version constraint.
!pip install sentence-transformers==2.7.0
!pip install rank-bm25

!pip install sastrawi
!pip install swifter

Collecting sentence-transformers==2.7.0
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m163.8/171.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers==2.7.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers==2.7.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers==2.7.0)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-

# Library

In [None]:
import pandas as pd

# Mount Google Drive; the dataset, model and pickle paths used in this notebook are under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# SBERT
from sentence_transformers import SentenceTransformer, InputExample, losses, datasets, util
from torch.utils.data import DataLoader

#BM25
from rank_bm25 import BM25Okapi

import pickle

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

import nltk
# Download the NLTK 'punkt' and 'stopwords' packages into the runtime
nltk.download('punkt')
nltk.download('stopwords')

# NOTE(review): word_tokenize is already imported a few lines above; this second import is redundant
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

import string
import re

# Do not truncate long cell values when a DataFrame is displayed
pd.set_option('display.max_colwidth', None)

Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Function Declaration

### RRF

In [None]:
def reciprocal_rank_fusion(result, k=60):
  """Return the Reciprocal Rank Fusion (RRF) score of a single document.

  Args:
    result: iterable with the document's rank in each ranking being fused.
    k: smoothing constant added to every rank before taking the reciprocal.
      Defaults to 60, the value that used to be hard-coded.

  Returns:
    The sum of 1 / (k + rank) over all ranks in `result` (0 for an empty input).
  """
  return sum(1.0 / (k + rank) for rank in result)

### Evaluation

In [None]:
def precision_calc(relevant_document, retrieved_document):
  """Precision: share of the retrieved documents that are relevant.

  Args:
    relevant_document: collection of relevant document ids.
    retrieved_document: sized collection of retrieved document ids.

  Returns:
    Number of distinct relevant ids found in the retrieved set divided by
    len(retrieved_document); 0.0 when nothing was retrieved (previously this
    case raised ZeroDivisionError).
  """
  if len(retrieved_document) == 0:
    return 0.0
  hits = set(relevant_document).intersection(retrieved_document)
  return len(hits) / len(retrieved_document)

def recall_calc(relevant_document, retrieved_document):
  """Recall: share of the relevant documents that were retrieved.

  Args:
    relevant_document: sized collection of relevant document ids.
    retrieved_document: collection of retrieved document ids.

  Returns:
    Number of distinct relevant ids found in the retrieved set divided by
    len(relevant_document); 0.0 when there are no relevant documents
    (previously this case raised ZeroDivisionError).
  """
  if len(relevant_document) == 0:
    return 0.0
  hits = set(relevant_document).intersection(retrieved_document)
  return len(hits) / len(relevant_document)

def f_measure_calc(precision, recall):
  """Combine precision and recall into the F-measure (harmonic mean).

  Returns 0 when precision + recall is 0, otherwise
  2 * precision * recall / (precision + recall).
  """
  denominator = precision + recall
  return 0 if denominator == 0 else (2 * precision * recall) / denominator

def evaluation(df, column_index, key_terms, k):
  """Score the top-k rows of one ranking against the relevant documents.

  Args:
    df: DataFrame holding the rank column and an 'index_corpus' column.
    column_index: name of the rank column; rows are sorted ascending by it.
    key_terms: ids of the relevant documents.
    k: number of top-ranked rows to evaluate.

  Returns:
    Tuple (precision, recall, f_measure) for the first k 'index_corpus' values.
  """
  ranked = df.sort_values(by=column_index)
  top_k_documents = list(ranked['index_corpus'].head(k))
  precision_score = precision_calc(key_terms, top_k_documents)
  recall_score = recall_calc(key_terms, top_k_documents)
  return precision_score, recall_score, f_measure_calc(precision_score, recall_score)

def search_query(query, model, embeddings, column_name):
  """Score `query` against `embeddings` and store the scores on the global dfAns.

  The query is encoded with `model`, compared to `embeddings` with cosine
  similarity, and the resulting scores are written to dfAns[column_name].
  Nothing is returned.
  """
  encoded_query = model.encode(query)
  # First row of the similarity result: the scores of this single query
  query_scores = util.cos_sim(encoded_query, embeddings)[0]
  dfAns[column_name] = [score.item() for score in query_scores]

### Others

In [None]:
def text_prepros(text):
    """Normalize raw text before tokenization.

    Steps, in order: lowercase, drop literal escape sequences, turn
    punctuation into spaces, turn digit runs into spaces, collapse whitespace
    runs to a single space, and strip leading/trailing whitespace.

    Whitespace is collapsed and stripped last. Previously digits were replaced
    with spaces after that step, so the result could still contain double or
    trailing spaces.

    Args:
        text: input string.

    Returns:
        The cleaned string.
    """
    # converting to lowercase
    text = text.lower()
    # remove literal "\t", "\n" and "\u" sequences (a backslash followed by the
    # letter, not real control characters), then any remaining backslash
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # replace every punctuation character with a space
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    # replace digit runs with a space
    text = re.sub(r'\d+', ' ', text)
    # collapse whitespace runs (real tabs and newlines included) into one space
    text = re.sub(r'\s+', ' ', text)
    # remove leading and trailing whitespace
    return text.strip()

def word_tokenize_wrapper(text):
    """Tokenize `text` with NLTK's word_tokenize and return the result."""
    tokens = word_tokenize(text)
    return tokens

# stem function bag of word
# Holds the single stemmer instance shared by all stemmed_wrapper calls.
_STEMMER_CACHE = {}

def stemmed_wrapper(term):
    """Stem `term` with the Sastrawi stemmer.

    The stemmer is created on the first call and reused afterwards; previously
    a new StemmerFactory and stemmer were built on every call.

    Args:
        term: text to stem.

    Returns:
        The value returned by the stemmer's stem() for `term`.
    """
    stemmer = _STEMMER_CACHE.get('stemmer')
    if stemmer is None:
        stemmer = StemmerFactory().create_stemmer()
        _STEMMER_CACHE['stemmer'] = stemmer
    return stemmer.stem(term)


def set_sumber(state, pasal, ayat, angka, huruf, paragraf):
  """Build the source label of a corpus row.

  The label is '<state> - Psl<pasal>' followed by '/Ayt<ayat>', '/Ank<angka>',
  '/Hrf<huruf>' and '/Par<paragraf>' for every level whose value is not the
  string '0'. All arguments are expected to be strings.
  """
  parts = [state, ' - Psl' + pasal]
  optional_levels = (('/Ayt', ayat), ('/Ank', angka), ('/Hrf', huruf), ('/Par', paragraf))
  for prefix, value in optional_levels:
    if value != '0':
      parts.append(prefix + value)
  return ''.join(parts)

def search_term(seris, term):
  """Return the positions of the items in `seris` that contain `term`."""
  return [position for position, seri in enumerate(seris) if term in seri]

def search_term_list(seris, terms):
  """Return, term by term, the positions of the items in `seris` containing that term.

  Positions are grouped in the order of `terms`; a position is repeated when
  several terms match the same item.
  """
  return [
    position
    for term in terms
    for position, seri in enumerate(seris)
    if term in seri
  ]

# Fine Tuning SBERT

## Prepare Dataset

# BEGIN AUTO RUN

In [None]:
# Name of the fine-tuned model; it is part of the model folder and embedding pickle file names in this notebook
model_name = 'fine_tuned_model_kup_all_a2'
# Experiment notes:
# fine_tuned_model_kup_all_a2 and a3_dan_KUP1 succeeded
# fine_tuned_model_kup_all_a3
# fine_tuned_model_kup_all_a1 -> BEST RESULT
# sheetname = 'Materi_KUP_GPT' #a3_dan_KUP1
# Excel sheet that holds the sentence pairs used for fine-tuning
sheetname = 'Materi_KUP_GPT_Copy'
# Number of training epochs ("epok" is Indonesian for "epoch")
epok = 5
print(model_name)

fine_tuned_model_kup_all_a2


In [None]:
# Tax abbreviation / alias -> full phrase, used by normalization_query.
# normalization_query walks this dict in insertion order and applies one
# substitution pass per entry, so an earlier expansion can be picked up by a
# later key (e.g. 'ms pjk' is listed before 'pjk').
tax_term_dict = {
    'ar': 'account representatives',
    'bpn': 'bukti penerimaan negara',
    'bukper': 'bukti permulaan',
    'but': 'bentuk usaha tetap',
    'dirjen': 'direktur jenderal',
    'ditjen': 'direktorat jenderal',
    'djbc': 'direktorat jenderal bea cukai',
    'djp': 'direktorat jenderal pajak',
    'dpp': 'dasar pengenaan pajak',
    'fpp': 'fungsional pemeriksa pajak',
    'nomor identitas wajib pajak': 'nomor pokok wajib pajak',
    'jo': 'joint operation',  # NOTE(review): "jo" is also a common legal abbreviation (juncto) — confirm this expansion is wanted for every standalone "jo"
    'kkp': 'kertas kerja pemeriksaan',
    'kmk': 'keputusan menteri keuangan',
    'kpp': 'kantor pelayanan pajak',
    'kppn': 'kantor pelayanan perbendaharaan negara',
    'kso': 'kerja sama operasi',
    'ktp': 'kartu tanda penduduk',
    'kup': 'ketentuan umum dan tatacara perpajakan',
    'lhp': 'laporan hasil pemeriksaan',
    'ms pjk': 'masa pajak',
    'nik': 'nomor induk kependudukan',
    'nomor pajak': 'nomor pokok wajib pajak',
    'nppn': 'norma penghitungan penghasilan neto',
    'npwp': 'nomor pokok wajib pajak',
    'ntpn': 'nomor transaksi penerimaan pajak',  # NOTE(review): NTPN is usually expanded as "nomor transaksi penerimaan negara" — confirm
    'op': 'orang pribadi',
    'pbb': 'pajak bumi dan bangunan',
    'phkp': 'penghasilan kena pajak',
    'pjk': 'pajak',
    'pkp': 'pengusaha kena pajak',
    'pmk': 'peraturan menteri keuangan',
    'pp': 'peraturan pemerintah',
    'pph': 'pajak penghasilan',
    'ppn': 'pajak pertambahan nilai',
    'ppnbm': 'pajak penjualan atas barang mewah',
    'ptkp': 'penghasilan tidak kena pajak',
    'sdsn': 'susunan dalam satu naskah',
    'sk': 'surat keputusan',
    'skp': 'surat ketetapan pajak',
    'skpib': 'surat keputusan pemberian imbalan bunga',
    'skpkb': 'surat ketetapan pajak kurang bayar',
    'skpkbt': 'surat ketetapan pajak kurang bayar tambahan',
    'skpkpp': 'surat keputusan penghitungan kelebihan pembayaran pajak',
    'skplb': 'surat ketetapan pajak lebih bayar',
    'skpn': 'surat ketetapan pajak nihil',
    'skppkp': 'surat keputusan pengembalian pendahuluan kelebihan pajak',
    'sp': 'subjek pajak',
    'sp2': 'surat pemberitahuan pemeriksaan',  # NOTE(review): SP2 is usually expanded as "surat perintah pemeriksaan" — confirm
    'spdn': 'subjek pajak dalam negeri',
    'sphp': 'surat pemberitahuan hasil pemeriksaan',
    'spln': 'subjek pajak luar negeri',
    'spm': 'surat perintah membayar',
    'spmp': 'surat perintah melaksanakan penyitaan',
    'sppt': 'surat pemberitahuan pajak terutang',
    'spt': 'surat pemberitahuan',
    'ssp': 'surat setoran pajak',
    'stp': 'surat tagihan pajak',
    'wbt': 'warisan belum terbagi',
    'wna': 'warga negara asing',
    'wni': 'warga negara indonesia',
    'wp': 'wajib pajak'
}

def normalization_query(query, term_dict=None):
  """Expand tax abbreviations in `query` into their full phrases.

  Every key of the term dictionary is replaced by its value when it appears
  as a standalone word, written either exactly as the key or fully
  upper-cased. Keys are processed in dictionary order, one pass per key, so
  text produced by an earlier expansion can be matched by a later key.

  Compared with the previous chain of eight substitutions per term:
    * word boundaries are checked with lookarounds, so the surrounding
      characters are not consumed; back-to-back abbreviations ("wp wp") are
      now all expanded and adjacent punctuation is kept rather than
      overwritten with spaces;
    * terms next to punctuation at the start or end of the string are handled;
    * terms are escaped, and the invalid "\\W" / "\\Z" escapes that sat in
      non-raw string literals are gone.

  NOTE(review): this notebook defines normalization_query twice; the later
  definition replaces this one once its cell has run.

  Args:
    query: text to normalize.
    term_dict: optional mapping of abbreviation -> expansion. Defaults to the
      module-level tax_term_dict.

  Returns:
    The query with all matching abbreviations expanded.
  """
  if term_dict is None:
    term_dict = tax_term_dict
  for tax_term, expansion in term_dict.items():
    pattern = (
      r"(?<!\w)(?:" + re.escape(tax_term) + "|" + re.escape(tax_term.upper()) + r")(?!\w)"
    )
    # A callable replacement inserts the expansion verbatim (no backslash processing)
    query = re.sub(pattern, lambda _match: expansion, query)
  return query

In [None]:
# Load the sentence pairs and clean both sentence columns with the same steps:
# text_prepros -> tokenize -> re-join with single spaces -> expand abbreviations
dfSub2 = pd.read_excel('/content/drive/MyDrive/Thesis/Teori_KUP.xlsx', sheet_name=sheetname)
# dfSub2 = dfSub2.head(11)
for sentence_column in ('Sentence_1', 'Sentence_2'):
  cleaned = dfSub2[sentence_column].apply(text_prepros)
  tokenized = cleaned.apply(word_tokenize_wrapper)
  rejoined = tokenized.apply(lambda tokens: ' '.join(tokens))
  dfSub2[sentence_column] = rejoined.apply(lambda sentence: normalization_query(sentence))

# Row-wise minimum of the three similarity scores
score_columns = ['Score_kemiripan_1', 'Score_kemiripan_2', 'Score_kemiripan_semantik']
dfSub2['Score_min'] = dfSub2.apply(lambda row: min(row[column] for column in score_columns), axis=1)
print(dfSub2.columns)

# One training example per row, labelled with Score_kemiripan_semantik
# (other label columns noted here: Score_20240707, Score_kemiripan_1, Score_kemiripan_kata; fixed label=0.8)
train_examples_KUP_ALL = [
  InputExample(texts=[query, isi], label=skor)
  for query, isi, skor in zip(dfSub2['Sentence_1'], dfSub2['Sentence_2'], dfSub2['Score_kemiripan_semantik'])
]
print(len(train_examples_KUP_ALL))

Index(['Sentence_1', 'Sentence_2', 'Step', 'Score_kemiripan_1',
       'Score_kemiripan_2', 'Score_kemiripan_semantik', 'Score_kemiripan_kata',
       'No', 'Prompt', 'Score_min'],
      dtype='object')
303


In [None]:
# Summary statistics of every score column
score_summary_columns = ['Score_kemiripan_1', 'Score_kemiripan_2', 'Score_kemiripan_kata', 'Score_kemiripan_semantik', 'Score_min']
display(dfSub2[score_summary_columns].describe())
# Number of fully duplicated rows (shown as the cell result)
dfSub2.duplicated().sum()

Unnamed: 0,Score_kemiripan_1,Score_kemiripan_2,Score_kemiripan_kata,Score_kemiripan_semantik,Score_min
count,303.0,303.0,303.0,303.0,303.0
mean,0.913795,0.884403,0.783795,0.799175,0.795931
std,0.035933,0.058716,0.035933,0.03758,0.037667
min,0.8,0.707,0.67,0.69,0.69
25%,0.89,0.848,0.76,0.77,0.77
50%,0.92,0.887,0.79,0.8,0.8
75%,0.94,0.927,0.81,0.83,0.82
max,1.0,0.997,0.87,0.9,0.9


0

### Dataset Pasal vs Penjelasan

In [None]:
# dfAns = pd.read_excel('/content/drive/MyDrive/Thesis/Dataset KUP.xlsx', sheet_name='Cleaned_RAW', dtype={'Pasal':'str', 'Ayat': 'str', 'Angka':'str', 'Huruf':'str', 'Paragraf':'str'})
# dfAns['Keys'] = dfAns.apply(lambda x: x['Pasal']+"/"+x['Ayat']+"/"+x['Angka']+"/"+x['Huruf'], axis=1)
# dfAns['Isi_cleaned'] = dfAns['Isi_cleaned'].apply(text_prepros)
# dfAns['Isi_cleaned'] = dfAns['Isi_cleaned'].apply(word_tokenize_wrapper)
# dfAns['Isi_cleaned'] = dfAns['Isi_cleaned'].apply(lambda x: ' '.join(x))
# dfAns['Isi_cleaned'] = dfAns['Isi_cleaned'].apply(lambda x: normalization_query(x))

# dfPasal = dfAns[dfAns['State'] == 'Pasal']
# dfPenjelasan = dfAns[~(dfAns['State'] == 'Pasal')]
# dfMerge = dfPasal.merge(dfPenjelasan, on='Keys', how='inner')
# print(dfMerge.shape)
# print(dfMerge.columns)

# train_examples = []
# for query, isi in zip(dfMerge['Isi_cleaned_x'], dfMerge['Isi_cleaned_y']):
#   train_examples.append(InputExample(texts=[query, isi], label=1.0))
# print(len(train_examples))

# train_examples_KUP_ALL = train_examples_KUP_ALL + train_examples
# print(len(train_examples_KUP_ALL))

## Fine Tuning Pretrained SBERT Model

https://www.sbert.net/docs/package_reference/sentence_transformer/SentenceTransformer.html#sentence_transformers.SentenceTransformer.fit

fit(train_objectives: Iterable[Tuple[torch.utils.data.dataloader.DataLoader, torch.nn.modules.module.Module]], evaluator: Optional[sentence_transformers.evaluation.SentenceEvaluator.SentenceEvaluator] = None, epochs: int = 1, steps_per_epoch=None, scheduler: str = 'WarmupLinear', warmup_steps: int = 10000, optimizer_class: Type[torch.optim.optimizer.Optimizer] = <class 'torch.optim.adamw.AdamW'>, optimizer_params: Dict[str, object] = {'lr': 2e-05}, weight_decay: float = 0.01, evaluation_steps: int = 0, output_path: Optional[str] = None, save_best_model: bool = True, max_grad_norm: float = 1, use_amp: bool = False, callback: Optional[Callable[[float, int, int], None]] = None, show_progress_bar: bool = True, checkpoint_path: Optional[str] = None, checkpoint_save_steps: int = 500, checkpoint_save_total_limit: int = 0) → None[source]

In [None]:
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
train_dataloader = DataLoader(train_examples_KUP_ALL, shuffle=True, batch_size=16)

train_loss = losses.CosineSimilarityLoss(model)

# Warm up over the first 10% of the optimizer steps.
# The previous fixed warmup_steps=100 was longer than the whole run logged for
# this notebook (19 iterations per epoch x 5 epochs = 95 steps), so the
# learning rate never finished its warm-up ramp and never decayed.
total_training_steps = len(train_dataloader) * epok
warmup_steps = max(1, int(0.1 * total_training_steps))

# Tune the model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=epok, warmup_steps=warmup_steps)

# The saved model is the one fine-tuned with train_examples_KUP_ALL
model.save("/content/drive/MyDrive/Thesis_Finetuned/"+model_name)
print(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/19 [00:00<?, ?it/s]

Iteration:   0%|          | 0/19 [00:00<?, ?it/s]

Iteration:   0%|          | 0/19 [00:00<?, ?it/s]

Iteration:   0%|          | 0/19 [00:00<?, ?it/s]

Iteration:   0%|          | 0/19 [00:00<?, ?it/s]

fine_tuned_model_kup_all_a2


## Load Fine Tuned SBERT Model

In [None]:
# Load the fine-tuned model back from its Drive folder
finetuned_model_dir = "/content/drive/MyDrive/Thesis_Finetuned/" + model_name
model_load = SentenceTransformer(finetuned_model_dir)
print(model_name)

fine_tuned_model_kup_all_a2


### Test SBERT Embeddings

In [None]:
# Sanity check: score one query/passage pair with both models
s1 = "pemberian hukuman untuk pegawai pajak"
s2 = "pegawai pajak yang karena kelalaiannya atau dengan sengaja menghitung atau menetapkan pajak tidak sesuai dengan ketentuan undang-undang perpajakan dikenai sanksi sesuai dengan ketentuan peraturan perundang-undangan"
sentence_pair = [s1, s2]

# Fine-tuned model
model_load_embeddings = model_load.encode(sentence_pair)
query_vector, passage_vector = model_load_embeddings[0], model_load_embeddings[1]
similarity = util.cos_sim(query_vector, passage_vector)
print(similarity)

# Pretrained model, same pair
model_multimpnet = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
multilingual_mpnet_embeddings = model_multimpnet.encode(sentence_pair)
query_vector, passage_vector = multilingual_mpnet_embeddings[0], multilingual_mpnet_embeddings[1]
similarity = util.cos_sim(query_vector, passage_vector)
print(similarity)

tensor([[0.6617]])
tensor([[0.8511]])


## Create SBERT Embeddings

In [None]:
# Load the preprocessed corpus (semicolon-separated); 'Angka' and 'Paragraf' are read as strings
dfAns = pd.read_csv(
    '/content/drive/MyDrive/Thesis/Dataset KUP Preprocessed.csv',
    sep=';',
    dtype={'Angka':'str','Paragraf':'str'},
)

### Save Embeddings Pickle

In [None]:
# From here on model_multimpnet refers to the fine-tuned model
model_multimpnet = model_load

# Lowercased text
multilingual_mpnet_embeddings = model_multimpnet.encode(dfAns['Isi_lower'])
pickle_path = "/content/drive/MyDrive/Thesis_Finetuned/multilingual_mpnet_embeddings_"+model_name+".pkl"
payload = {'multilingual_mpnet_embeddings': multilingual_mpnet_embeddings}
with open(pickle_path, "wb") as fOut:
    pickle.dump(payload, fOut)

# Tokenized and re-joined text
multilingual_mpnet_embeddings_joined = model_multimpnet.encode(dfAns['Isi_Tokenized_Joined'])
pickle_path = "/content/drive/MyDrive/Thesis_Finetuned/multilingual_mpnet_embeddings_joined_"+model_name+".pkl"
payload = {'multilingual_mpnet_embeddings_joined': multilingual_mpnet_embeddings_joined}
with open(pickle_path, "wb") as fOut:
    pickle.dump(payload, fOut)

# Stemmed text
multilingual_mpnet_embeddings_stemmed = model_multimpnet.encode(dfAns['Isi_Stemmed'])
pickle_path = "/content/drive/MyDrive/Thesis_Finetuned/multilingual_mpnet_embeddings_stemmed_"+model_name+".pkl"
payload = {'multilingual_mpnet_embeddings_stemmed': multilingual_mpnet_embeddings_stemmed}
with open(pickle_path, "wb") as fOut:
    pickle.dump(payload, fOut)

print(model_name)

fine_tuned_model_kup_all_a2


### Load Embeddings Pickles

In [None]:
# Read back the three embedding pickles written for this model_name.
# NOTE(review): pickle.load executes whatever the file contains — only load files you created yourself.
pickle_path = "/content/drive/MyDrive/Thesis_Finetuned/multilingual_mpnet_embeddings_"+model_name+".pkl"
with open(pickle_path, "rb") as fIn:
  cache_data = pickle.load(fIn)
multilingual_mpnet_embeddings = cache_data['multilingual_mpnet_embeddings']

pickle_path = "/content/drive/MyDrive/Thesis_Finetuned/multilingual_mpnet_embeddings_joined_"+model_name+".pkl"
with open(pickle_path, "rb") as fIn:
  cache_data = pickle.load(fIn)
multilingual_mpnet_embeddings_joined = cache_data['multilingual_mpnet_embeddings_joined']

pickle_path = "/content/drive/MyDrive/Thesis_Finetuned/multilingual_mpnet_embeddings_stemmed_"+model_name+".pkl"
with open(pickle_path, "rb") as fIn:
  cache_data = pickle.load(fIn)
multilingual_mpnet_embeddings_stemmed = cache_data['multilingual_mpnet_embeddings_stemmed']

print(model_name)

fine_tuned_model_kup_all_a2


# BM25

## BM25 Encodings

In [None]:
def _split_on_spaces(documents):
  """Tokenize each document by splitting it on single space characters."""
  return [document.split(" ") for document in documents]

# One BM25 index per text variant of the corpus
corpus = dfAns['Isi_lower']
tokenized_corpus = _split_on_spaces(corpus)
bm25 = BM25Okapi(tokenized_corpus)

corpus = dfAns['Isi_Tokenized_Joined']
tokenized_corpus = _split_on_spaces(corpus)
bm25_joined = BM25Okapi(tokenized_corpus)

corpus = dfAns['Isi_Stemmed']
tokenized_corpus = _split_on_spaces(corpus)
bm25_stemmed = BM25Okapi(tokenized_corpus)

# BEGIN

In [None]:
# Pretrained (not fine-tuned) model, used as the comparison baseline
model_multimpnet_raw = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Kept for reference: the code that produces the three "raw" pickles loaded below.
# multilingual_mpnet_embeddings_raw = model_multimpnet_raw.encode(dfAns['Isi_lower'])
# with open("/content/drive/MyDrive/Thesis/multilingual_mpnet_embeddings_raw.pkl", "wb") as fOut:
#     pickle.dump({'multilingual_mpnet_embeddings_raw': multilingual_mpnet_embeddings_raw},fOut)

# multilingual_mpnet_embeddings_joined_raw = model_multimpnet_raw.encode(dfAns['Isi_Tokenized_Joined'])
# with open("/content/drive/MyDrive/Thesis/multilingual_mpnet_embeddings_joined_raw.pkl", "wb") as fOut:
#     pickle.dump({'multilingual_mpnet_embeddings_joined_raw': multilingual_mpnet_embeddings_joined_raw},fOut)

# multilingual_mpnet_embeddings_stemmed_raw = model_multimpnet_raw.encode(dfAns['Isi_Stemmed'])
# with open("/content/drive/MyDrive/Thesis/multilingual_mpnet_embeddings_stemmed_raw.pkl", "wb") as fOut:
#     pickle.dump({'multilingual_mpnet_embeddings_stemmed_raw': multilingual_mpnet_embeddings_stemmed_raw},fOut)

# NOTE(review): pickle.load executes whatever the file contains — only load files you created yourself.
pickle_path = "/content/drive/MyDrive/Thesis/multilingual_mpnet_embeddings_raw.pkl"
with open(pickle_path, "rb") as fIn:
  cache_data = pickle.load(fIn)
multilingual_mpnet_embeddings_raw = cache_data['multilingual_mpnet_embeddings_raw']

pickle_path = "/content/drive/MyDrive/Thesis/multilingual_mpnet_embeddings_joined_raw.pkl"
with open(pickle_path, "rb") as fIn:
  cache_data = pickle.load(fIn)
multilingual_mpnet_embeddings_joined_raw = cache_data['multilingual_mpnet_embeddings_joined_raw']

pickle_path = "/content/drive/MyDrive/Thesis/multilingual_mpnet_embeddings_stemmed_raw.pkl"
with open(pickle_path, "rb") as fIn:
  cache_data = pickle.load(fIn)
multilingual_mpnet_embeddings_stemmed_raw = cache_data['multilingual_mpnet_embeddings_stemmed_raw']

# Evaluation

## Prepare References

In [None]:
# Reference table used to resolve key answers to corpus row indexes
reference_columns = ['Bab', 'State', 'Pasal', 'Ayat', 'Angka', 'Huruf', 'Paragraf', 'Isi']
dfSearch = dfAns[reference_columns].copy()
dfSearch['Isi_lower'] = dfSearch['Isi'].apply(lambda isi: isi.lower())
dfSearch['Sumber'] = dfSearch.apply(
  lambda row: set_sumber(
    row['State'], str(row['Pasal']), str(row['Ayat']), str(row['Angka']), str(row['Huruf']), str(row['Paragraf'])
  ),
  axis=1,
)

# Lowercase the three components of the key, then join them as 'pasal/ayat/angka'
for key_column in ('Pasal', 'Ayat', 'Angka'):
  dfSearch[key_column] = [value.lower() for value in dfSearch[key_column]]
dfSearch['key_answer'] = dfSearch.apply(lambda row: row['Pasal'] + '/' + row['Ayat'] + '/' + row['Angka'], axis=1)

In [None]:
# Evaluation set: one (query text, relevant key answers) tuple per query.
# Each key answer is a 'Pasal/Ayat/Angka' string; evaluate_query lowercases it
# and matches it against dfSearch['key_answer'].
queries = [
  ('Pemberian hukuman untuk pegawai pajak', ['36a/1/0', '36a/2/0', '36a/3/0', '36a/4/0', '36a/5/0']),
  ('sanksi yang diberikan atas pelanggaran - pelanggaran yang dilakukan pegawai pajak', ['36a/1/0', '36a/2/0', '36a/3/0', '36a/4/0', '36a/5/0']),
  ('Jelaskan tentang etika yang mengatur pegawai DJP', ['36b/1/0', '36b/2/0', '36b/3/0']),
  ('kode etik bagi seluruh pegawai DJP', ['36b/1/0', '36b/2/0', '36b/3/0']),
  ('Jelaskan tentang Nomor identitas WP', ['1/0/6', '2/1/0', '2/1a/0', '2/4/0', '2/4a/0', '2/6/0', '2/7/0', '2/10/0', '37a/2/0', '44e/2/0']),
  ('Hak Mendahului Negara di dalam penagihan pajak', ['21/1/0', '21/2/0', '21/3/0', '21/4/0', '21/5/0']),
  ('Negara memiliki Hak Mendahului dalam menagih pajak yang terutang', ['21/1/0', '21/2/0', '21/3/0', '21/4/0', '21/5/0']),
  ('Pemberian Sanksi dalam hal SPT tidak disampaikan', ['3/5a/0', '7/1/0', '13/1/0','38a/0/0','39/1/0']),
  ('Bagaimana melakukan pembetulan SPT tahunan', ['8/1/0', '8/1a/0', '8/2/0', '8/2a/0', '8/6/0']),
  ('kriteria menghapus NPWP', ['2/6/0', '2/7/0', '44e/2/0']),
  ('kriteria WP yang dapat dihapus NPWP nya beserta jangka waktu nya', ['2/6/0', '2/7/0', '44e/2/0']),
  ('dalam hal SPT dianggap tidak disampaikan', ['3/7/0', '3/7a/0', '4/4b/0']),
  ('SP dapat mengakui ketidakbenaran dalam pengisian SPT', ['8/3/0', '8/3a/0', '8/4/0', '8/5/0', '17b/5/0', '44a/0/0']),
  ('penggunaan SSP sebagai alat untuk membayar pajak', ['1/0/14','3/5/0','10/1/0','10/1a/0','10/2/0']),
  ('bagaimana jika WP kelebihan membayar pajak', ['1/0/19','11/1/0','11/1a/0','11/2/0','11/3/0','11/4/0','17/3/0','17b/1/0','17b/2/0','17c/1/0','17c/6/0','17d/1/0','17d/2/0','27b/1/0','27b/2/0','27b/3/0']),
  ('SKPN diterbitkan apabila', ['17A/1/0', '17A/2/0', '1/0/18']),
  ('apa yang menjadi dasar penagihan atas pajak yang masih terutang', ['18/1/0','20a/8/0']),
  ('wakil WP dalam memenuhi kewajiban perpajakannya', ['32/1/0', '32/2/0']),
  ('kewajiban bagi pihak lain untuk memberikan data dan informasi yang berkaitan dengan perpajakan', ['35a/1/0', '35a/2/0', '2/10/0', '41c/3/0', '41c/4/0']),
  ('bonus pegawai DJP atas capaian kinerjanya', ['36d/1/0', '36d/2/0', '36d/3/0']),
  ('perbuatan tindak pidana perpajakan', ['1/0/26','1/0/27','1/0/31','1/0/32','13/4/0','17b/1a/0','17b/4/0','17b/5/0','17c/2/0','17c/6/0','22/2/0','34/4/0','34/5/0','35/1/0','35/2/0','36a/3/0','36a/4/0','36a/5/0','38/0/0','39/1/0','39/2/0','39/3/0','39a/0/0','40/0/0','41/1/0','41/2/0','41/3/0','41a/0/0','41b/0/0','41c/1/0','41c/2/0','41c/3/0','41c/4/0','43/1/0','43/2/0','43a/1/0','43a/2/0','43a/3/0','43a/4/0','44/1/0','44/2/0','44/3/0','44a/0/0','44b/1/0','44b/2/0','44b/2a/0','44b/2b/0','44b/2c/0','44c/1/0','44c/2/0','44c/3/0','44d/1/0','44e/2/0']),
  ('peran penyidik pajak', ['1/0/31', '1/0/32', '43a/1a/0', '44/1/0', '44/2/0', '44/3/0', '44/4/0', '44a/0/0']),
  ('pemberlakukan NIK sebagai nomor identitas wajib pajak', ['2/1a/0', '44e/2/0'])
]

import re

def normalization_query(query, term_dict=None):
  """Expand tax abbreviations in `query` into their full phrases.

  Every key of the term dictionary is replaced by its value when it appears
  as a standalone word, written either exactly as the key or fully
  upper-cased. Keys are processed in dictionary order, one pass per key, so
  text produced by an earlier expansion can be matched by a later key.

  Compared with the previous chain of eight substitutions per term:
    * word boundaries are checked with lookarounds, so the surrounding
      characters are not consumed; back-to-back abbreviations ("wp wp") are
      now all expanded and adjacent punctuation is kept rather than
      overwritten with spaces;
    * terms next to punctuation at the start or end of the string are handled;
    * terms are escaped, and the invalid "\\W" / "\\Z" escapes that sat in
      non-raw string literals are gone.

  NOTE(review): this notebook defines normalization_query twice; this later
  definition is the one in effect for the query normalization that follows.

  Args:
    query: text to normalize.
    term_dict: optional mapping of abbreviation -> expansion. Defaults to the
      module-level tax_term_dict.

  Returns:
    The query with all matching abbreviations expanded.
  """
  if term_dict is None:
    term_dict = tax_term_dict
  for tax_term, expansion in term_dict.items():
    pattern = (
      r"(?<!\w)(?:" + re.escape(tax_term) + "|" + re.escape(tax_term.upper()) + r")(?!\w)"
    )
    # A callable replacement inserts the expansion verbatim (no backslash processing)
    query = re.sub(pattern, lambda _match: expansion, query)
  return query

# Expand the abbreviations in every query text; the rest of each tuple is kept as is
for position, entry in enumerate(queries):
  queries[position] = (normalization_query(entry[0]), *entry[1:])

for query_entry in queries:
  print(query_entry)

('Pemberian hukuman untuk pegawai pajak', ['36a/1/0', '36a/2/0', '36a/3/0', '36a/4/0', '36a/5/0'])
('sanksi yang diberikan atas pelanggaran - pelanggaran yang dilakukan pegawai pajak', ['36a/1/0', '36a/2/0', '36a/3/0', '36a/4/0', '36a/5/0'])
('Jelaskan tentang etika yang mengatur pegawai direktorat jenderal pajak', ['36b/1/0', '36b/2/0', '36b/3/0'])
('kode etik bagi seluruh pegawai direktorat jenderal pajak', ['36b/1/0', '36b/2/0', '36b/3/0'])
('Jelaskan tentang Nomor identitas wajib pajak', ['1/0/6', '2/1/0', '2/1a/0', '2/4/0', '2/4a/0', '2/6/0', '2/7/0', '2/10/0', '37a/2/0', '44e/2/0'])
('Hak Mendahului Negara di dalam penagihan pajak', ['21/1/0', '21/2/0', '21/3/0', '21/4/0', '21/5/0'])
('Negara memiliki Hak Mendahului dalam menagih pajak yang terutang', ['21/1/0', '21/2/0', '21/3/0', '21/4/0', '21/5/0'])
('Pemberian Sanksi dalam hal surat pemberitahuan tidak disampaikan', ['3/5a/0', '7/1/0', '13/1/0', '38a/0/0', '39/1/0'])
('Bagaimana melakukan pembetulan surat pemberitahuan tahuna

In [None]:
from statistics import mean

In [None]:
def evaluate_queries(queries, k=0):
  """Evaluate every (query, key answers) entry with evaluate_query.

  Args:
    queries: sequence of entries whose first item is the query text and whose
      second item is the list of relevant key answers.
    k: cut-off forwarded to evaluate_query (default 0).

  Returns:
    List of [query text, evaluate_query result], in input order.
  """
  return [
    [entry[0], evaluate_query(entry[0], entry[1], k)]
    for entry in queries
  ]

def _rank_descending(df, score_column, rank_column):
  """Sort `df` by `score_column` (highest first) and store the 1-based position in `rank_column`."""
  df = df.sort_values(by=score_column, ascending=False)
  df.reset_index(drop=True, inplace=True)
  df[rank_column] = df.index + 1
  return df

def _fuse_ranks(df, rank_columns, rrf_column, rank_column):
  """Store the RRF score of `rank_columns` in `rrf_column`, then rank the rows by it into `rank_column`."""
  df[rrf_column] = df.apply(lambda row: reciprocal_rank_fusion([row[column] for column in rank_columns]), axis=1)
  return _rank_descending(df, rrf_column, rank_column)

def evaluate_query(query, key_answer, k):
  """Run one query through every retrieval method and score each ranking.

  Reads the notebook-level objects dfSearch, dfAns, model_multimpnet,
  model_multimpnet_raw, the four embedding matrices, bm25 and bm25_joined.
  dfAns is modified in place: its columns from position 14 onward are dropped
  and the score columns for this query are written to it.

  Changes from the previous version:
    * an empty `key_answer` now raises ValueError up front; before, key_terms
      was never assigned in that case and the function failed later with an
      unbound-variable error;
    * the stemmed query is no longer computed: it was never used, but cost a
      stemmer call per query;
    * the repeated sort / reset index / assign rank steps live in helpers.
      The order of operations is unchanged.

  Args:
    query: query text.
    key_answer: non-empty list of relevant 'pasal/ayat/angka' keys.
    k: number of top results to score; when k <= 0 the number of matching
      relevant rows is used instead.

  Returns:
    List of [rank column name, (precision, recall, f_measure)], one entry per
    ranking, in the order the rank columns were created.

  Raises:
    ValueError: if `key_answer` is empty.
  """
  if len(key_answer) == 0:
    raise ValueError('key_answer must contain at least one reference key')

  query = query.lower()
  query_tokenized = word_tokenize_wrapper(text_prepros(query))
  query_tokenized_joined = ' '.join(query_tokenized)

  # Corpus row indexes of the relevant documents
  key_answer = [answer.lower() for answer in key_answer]
  key_terms = dfSearch[dfSearch['key_answer'].isin(key_answer)].index

  if k <= 0:
    k = len(key_terms)

  # Drop every column from position 14 onward, i.e. the score columns a
  # previous call added. NOTE(review): assumes the loaded corpus has exactly
  # 14 columns — TODO confirm against the CSV.
  dfAns.drop(dfAns.columns[14:], axis=1, inplace=True)

  # Dense scores: fine-tuned model, then the pretrained ("raw") model
  search_query(query = query, model = model_multimpnet, embeddings = multilingual_mpnet_embeddings, column_name = 'multi_mpnet')
  search_query(query = query_tokenized_joined, model = model_multimpnet, embeddings = multilingual_mpnet_embeddings_joined, column_name = 'mpnet_joined')

  search_query(query = query, model = model_multimpnet_raw, embeddings = multilingual_mpnet_embeddings_raw, column_name = 'multi_mpnet_raw')
  search_query(query = query_tokenized_joined, model = model_multimpnet_raw, embeddings = multilingual_mpnet_embeddings_joined_raw, column_name = 'mpnet_joined_raw')

  # Lexical scores
  dfAns['BM25'] = bm25.get_scores(query.split(" "))
  dfAns['BM25_joined'] = bm25_joined.get_scores(query_tokenized_joined.split(" "))

  dfAns2 = dfAns.copy()
  dfAns2['index_corpus'] = dfAns2.index

  # Lower Preprocessing
  dfAns2 = _rank_descending(dfAns2, 'multi_mpnet_raw', 'index_multi_mpnet_raw')
  dfAns2 = _rank_descending(dfAns2, 'multi_mpnet', 'index_multi_mpnet')
  dfAns2 = _rank_descending(dfAns2, 'BM25', 'index_BM25')
  dfAns2 = _fuse_ranks(dfAns2, ['index_multi_mpnet_raw', 'index_BM25'], 'RRF_raw', 'index_RRF_raw')
  dfAns2 = _fuse_ranks(dfAns2, ['index_multi_mpnet', 'index_BM25'], 'RRF', 'index_RRF')

  # Lower Preprocessing + Cleaning
  dfAns2 = _rank_descending(dfAns2, 'mpnet_joined_raw', 'index_mpnet_joined_raw')
  dfAns2 = _rank_descending(dfAns2, 'mpnet_joined', 'index_mpnet_joined')
  dfAns2 = _rank_descending(dfAns2, 'BM25_joined', 'index_BM25_joined')
  dfAns2 = _fuse_ranks(dfAns2, ['index_mpnet_joined_raw', 'index_BM25_joined'], 'RRF_joined_raw', 'index_RRF_joined_raw')
  dfAns2 = _fuse_ranks(dfAns2, ['index_mpnet_joined', 'index_BM25_joined'], 'RRF_joined', 'index_RRF_joined')

  # Every rank column except the corpus index itself
  index_columns = [column for column in dfAns2.columns if 'index' in column]
  index_columns.remove('index_corpus')
  return [
    [index_key, evaluation(dfAns2, index_key, key_terms, k)]
    for index_key in index_columns
  ]

# Average precision / recall / F-measure per method, for every cut-off in ks
ks = [0, 5, 10]
dfEvaluate = {'k':[], 'Method': [], 'Avg_Precision': [], 'Avg_Recall': [], 'Avg_FMeasure': []}
for k in ks:
  search_evaluations = evaluate_queries(queries, k)

  # method name -> list of (precision, recall, f-measure), one entry per query
  index_all = {}
  for search_evaluation in search_evaluations:
    method_evaluations = search_evaluation[1]
    for index_evaluation, metrix_evaluations in method_evaluations:
      index_all.setdefault(index_evaluation, []).append(metrix_evaluations)

  print(f'k yang digunakan: {k} \n')
  for index_method, method_scores in index_all.items():
    precision = [scores[0] for scores in method_scores]
    recall = [scores[1] for scores in method_scores]
    fmeasure = [scores[2] for scores in method_scores]

    dfEvaluate['k'].append(k)
    dfEvaluate['Method'].append(index_method)
    dfEvaluate['Avg_Precision'].append(mean(precision))
    dfEvaluate['Avg_Recall'].append(mean(recall))
    dfEvaluate['Avg_FMeasure'].append(mean(fmeasure))
  print('')
dfEvaluate = pd.DataFrame.from_dict(dfEvaluate)

k yang digunakan: 0 


k yang digunakan: 5 


k yang digunakan: 10 




## Matrix Evaluation

In [None]:
# Difference (fine-tuned minus raw) in the three averaged metrics.
# Per the dfEvaluate table, each k contributes 10 rows and within a group the
# row pairs are: 1 vs 0 (multi_mpnet), 4 vs 3 (RRF), 6 vs 5 (mpnet_joined),
# 9 vs 8 (RRF_joined). The offset n selects the group for k = 0, 5, 10.
print('Positif lebih baik')
metric_positions = [2, 3, 4]
row_pairs = ((1, 0), (4, 3), (6, 5), (9, 8))
for n in (0, 10, 20):
  if n != 0:
    print(n)
  for tuned_row, raw_row in row_pairs:
    print(list(dfEvaluate.iloc[tuned_row+n, metric_positions] - dfEvaluate.iloc[raw_row+n, metric_positions]))

Positif lebih baik
[0.019575471698113134, 0.019575471698113134, 0.019575471698113245]
[0.06831761006289305, 0.06831761006289305, 0.06831761006289305]
[0.0531104730653541, 0.0531104730653541, 0.05311047306535399]
[0.05644654088050305, 0.05644654088050305, 0.05644654088050316]
10
[0.017391304347826098, 0.04338255400601587, 0.02864886238199582]
[0.034782608695652084, 0.03423913043478266, 0.03532409619366139]
[0.05217391304347829, 0.03886382280557843, 0.03558111054362928]
[0.0434782608695653, 0.03206521739130441, 0.03710781971651533]
20
[0.004347826086956497, 0.022101449275362284, 0.010075914423740506]
[0.03478260869565214, 0.06386382280557834, 0.03995859213250519]
[0.017391304347826098, 0.03768115942028982, 0.022556670382757327]
[0.034782608695652195, 0.05652173913043479, 0.042512077294686035]


In [None]:
# Print the model name, then display the table of averaged metrics per k and method
print(model_name)
dfEvaluate

fine_tuned_model_kup_all_a2


Unnamed: 0,k,Method,Avg_Precision,Avg_Recall,Avg_FMeasure
0,0,index_multi_mpnet_raw,0.538026,0.538026,0.538026
1,0,index_multi_mpnet,0.557602,0.557602,0.557602
2,0,index_BM25,0.438515,0.438515,0.438515
3,0,index_RRF_raw,0.596049,0.596049,0.596049
4,0,index_RRF,0.664366,0.664366,0.664366
5,0,index_mpnet_joined_raw,0.518994,0.518994,0.518994
6,0,index_mpnet_joined,0.572105,0.572105,0.572105
7,0,index_BM25_joined,0.382387,0.382387,0.382387
8,0,index_RRF_joined_raw,0.584273,0.584273,0.584273
9,0,index_RRF_joined,0.64072,0.64072,0.64072


# END RESULT