In [1]:
import pandas as pd
import numpy as np
from get_candidates import get_candidates
import sys
sys.path.append('../')
import indexing.preprocess_reference_data

# Evaluasi Model String Distance - Levenshtein Distance

## Load the test data

In [2]:
df_tr = pd.read_csv('../indexing/data/splitted_data/v1/train.csv')
df_val = pd.read_csv('../indexing/data/splitted_data/v1/val.csv')
df_test = pd.read_csv('../indexing/data/splitted_data/v1/test.csv')

In [3]:
df = pd.concat([df_tr, df_val, df_test])

In [4]:
df.head()

Unnamed: 0,instansi,reference,status
0,"Kementerian Koordinator Bidang Politik, Hukum,...",Kemenko Polhukam,yes
1,"Kementerian Koordinator Bidang Politik, Hukum,...",Polhukam,yes
2,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Politik,no
3,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Hukum,no
4,Kementerian Luar Negeri,Kemlu,yes


In [5]:
df = df.reset_index()
df = df.drop('index', axis=1)
df.head()

Unnamed: 0,instansi,reference,status
0,"Kementerian Koordinator Bidang Politik, Hukum,...",Kemenko Polhukam,yes
1,"Kementerian Koordinator Bidang Politik, Hukum,...",Polhukam,yes
2,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Politik,no
3,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Hukum,no
4,Kementerian Luar Negeri,Kemlu,yes


Clean combined

In [6]:
df2 = df.copy()
df2['cln reference'] = df2['reference'].str.replace('/', ' ')
df2['cln reference'] = df2['cln reference'].str.replace('[^\w\s]','',regex=True)

In [9]:
prd = indexing.preprocess_reference_data.PreprocessReferenceData()
ref_data = prd.create_reference_data_from_labeled(df2)
ref_data = prd.clean_reference_data(ref_data)

In [10]:
ref_data

Unnamed: 0,reference
0,Kemenko Polhukam
1,Polhukam
2,Kemlu
3,Kemlu RI
4,Kementerian BUMN
...,...
932,Indofarma Global Medika
933,Indofarma Global Medika Cabang
934,PT IGM
935,Indofarma


In [11]:
for index, row in ref_data.iterrows():
    print(f"Index: {index}, row: {row[['reference']].values[0]}")
    break

Index: 0, row: Kemenko Polhukam


In [12]:
candidates_all = []

for index, row in ref_data.iterrows():
    instansi = row[['reference']].values[0]
    instansi = instansi.lower()
    candidates = get_candidates(instansi)
    candidates_all.append(candidates)

In [13]:
eval_df = ref_data.copy()
eval_df['candidate(s) with edit dist.'] = candidates_all

In [14]:
eval_df.head()

Unnamed: 0,reference,candidate(s) with edit dist.
0,Kemenko Polhukam,"{'kemenko polhukam': 0, 'polhukam': 8}"
1,Polhukam,"{'polhukam': 0, 'kemenko polhukam': 8}"
2,Kemlu,"{'kemlu': 0, 'kemlu ri': 3}"
3,Kemlu RI,"{'kemlu ri': 0, 'kemlu': 3}"
4,Kementerian BUMN,"{'kementerian bumn': 0, 'kemen bumn': 6, 'keme..."


In [15]:
candidates_all_without_edit_dist = []

In [16]:
for index, row in eval_df.iterrows():
    candidates = row[['candidate(s) with edit dist.']].values[0]
    candidates = [i for i in candidates.keys()]
    candidates_all_without_edit_dist.append(candidates)

In [17]:
eval_df['candidate(s)'] = candidates_all_without_edit_dist

In [18]:
eval_df

Unnamed: 0,reference,candidate(s) with edit dist.,candidate(s)
0,Kemenko Polhukam,"{'kemenko polhukam': 0, 'polhukam': 8}","[kemenko polhukam, polhukam]"
1,Polhukam,"{'polhukam': 0, 'kemenko polhukam': 8}","[polhukam, kemenko polhukam]"
2,Kemlu,"{'kemlu': 0, 'kemlu ri': 3}","[kemlu, kemlu ri]"
3,Kemlu RI,"{'kemlu ri': 0, 'kemlu': 3}","[kemlu ri, kemlu]"
4,Kementerian BUMN,"{'kementerian bumn': 0, 'kemen bumn': 6, 'keme...","[kementerian bumn, kemen bumn, kementerian bad..."
...,...,...,...
932,Indofarma Global Medika,"{'indofarma global medika': 0, 'indofarma glob...","[indofarma global medika, indofarma global med..."
933,Indofarma Global Medika Cabang,"{'indofarma global medika cabang': 0, 'indofar...","[indofarma global medika cabang, indofarma glo..."
934,PT IGM,"{'pt igm': 0, 'pt pupuk indonesia holding comp...","[pt igm, pt pupuk indonesia holding company, p..."
935,Indofarma,"{'indofarma': 0, 'pt indofarma': 3, 'indofarma...","[indofarma, pt indofarma, indofarma tbk, indof..."


In [19]:
eval_df.to_csv('./eval/performance_v1_1.csv')

## Get True References

In [63]:
import re

In [20]:
df2.head()

Unnamed: 0,instansi,reference,status,cln reference
0,"Kementerian Koordinator Bidang Politik, Hukum,...",Kemenko Polhukam,yes,Kemenko Polhukam
1,"Kementerian Koordinator Bidang Politik, Hukum,...",Polhukam,yes,Polhukam
2,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Politik,no,Koordinator Politik
3,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Hukum,no,Koordinator Hukum
4,Kementerian Luar Negeri,Kemlu,yes,Kemlu


In [21]:
eval_df.head()

Unnamed: 0,reference,candidate(s) with edit dist.,candidate(s)
0,Kemenko Polhukam,"{'kemenko polhukam': 0, 'polhukam': 8}","[kemenko polhukam, polhukam]"
1,Polhukam,"{'polhukam': 0, 'kemenko polhukam': 8}","[polhukam, kemenko polhukam]"
2,Kemlu,"{'kemlu': 0, 'kemlu ri': 3}","[kemlu, kemlu ri]"
3,Kemlu RI,"{'kemlu ri': 0, 'kemlu': 3}","[kemlu ri, kemlu]"
4,Kementerian BUMN,"{'kementerian bumn': 0, 'kemen bumn': 6, 'keme...","[kementerian bumn, kemen bumn, kementerian bad..."


In [66]:
true_refs_all = []

In [67]:
for index, row in eval_df.iterrows():
    reference = row[['reference']].values[0]
    instansi_resmi = df2[(df2['cln reference']==reference)]['instansi'].values[0]
    true_refs = df2[(df2['instansi']==instansi_resmi) 
                   & (df['status']=='yes')]['reference'].values
    true_refs_ = []
    for ref in true_refs:
        ref = ref.lower()
        ref = ref.replace('/', ' ')
        ref = re.sub(r'[^\w\s]','', ref)
        true_refs_.append(ref)
    instansi_resmi = instansi_resmi.lower()
    instansi_resmi = instansi_resmi.replace('/', ' ')
    instansi_resmi = re.sub(r'[^\w\s]','', instansi_resmi)
    true_refs_.append(instansi_resmi)
    true_refs_all.append(true_refs_)

In [68]:
true_refs_all

[['kemenko polhukam',
  'polhukam',
  'kementerian koordinator bidang politik hukum dan keamanan'],
 ['kemenko polhukam',
  'polhukam',
  'kementerian koordinator bidang politik hukum dan keamanan'],
 ['kemlu', 'kemlu ri', 'kementerian luar negeri'],
 ['kemlu', 'kemlu ri', 'kementerian luar negeri'],
 ['kementerian bumn', 'kemen bumn', 'kementerian badan usaha milik negara'],
 ['kementerian bumn', 'kemen bumn', 'kementerian badan usaha milik negara'],
 ['kemenparekraf',
  'kemenparekraf ri',
  'kementerian pariwisata dan ekonomi kreatif'],
 ['kemenparekraf',
  'kemenparekraf ri',
  'kementerian pariwisata dan ekonomi kreatif'],
 ['kemen pupr',
  'kemen pupr ri',
  'kementerian pekerjaan umum dan perumahan rakyat pupr'],
 ['kemen pupr',
  'kemen pupr ri',
  'kementerian pekerjaan umum dan perumahan rakyat pupr'],
 ['kemenpora', 'kemenpora ri', 'kementerian pemuda dan olahraga'],
 ['kemenpora', 'kemenpora ri', 'kementerian pemuda dan olahraga'],
 ['kemendikbudristek',
  'kemdikbud',
  'k

In [69]:
eval_df['true candidates'] = true_refs_all

In [70]:
eval_df

Unnamed: 0,reference,candidate(s) with edit dist.,candidate(s),true candidates
0,Kemenko Polhukam,"{'kemenko polhukam': 0, 'polhukam': 8}","[kemenko polhukam, polhukam]","[kemenko polhukam, polhukam, kementerian koord..."
1,Polhukam,"{'polhukam': 0, 'kemenko polhukam': 8}","[polhukam, kemenko polhukam]","[kemenko polhukam, polhukam, kementerian koord..."
2,Kemlu,"{'kemlu': 0, 'kemlu ri': 3}","[kemlu, kemlu ri]","[kemlu, kemlu ri, kementerian luar negeri]"
3,Kemlu RI,"{'kemlu ri': 0, 'kemlu': 3}","[kemlu ri, kemlu]","[kemlu, kemlu ri, kementerian luar negeri]"
4,Kementerian BUMN,"{'kementerian bumn': 0, 'kemen bumn': 6, 'keme...","[kementerian bumn, kemen bumn, kementerian bad...","[kementerian bumn, kemen bumn, kementerian bad..."
...,...,...,...,...
932,Indofarma Global Medika,"{'indofarma global medika': 0, 'indofarma glob...","[indofarma global medika, indofarma global med...","[indofarma tbk, indofarma global medika, indof..."
933,Indofarma Global Medika Cabang,"{'indofarma global medika cabang': 0, 'indofar...","[indofarma global medika cabang, indofarma glo...","[indofarma tbk, indofarma global medika, indof..."
934,PT IGM,"{'pt igm': 0, 'pt pupuk indonesia holding comp...","[pt igm, pt pupuk indonesia holding company, p...","[indofarma tbk, indofarma global medika, indof..."
935,Indofarma,"{'indofarma': 0, 'pt indofarma': 3, 'indofarma...","[indofarma, pt indofarma, indofarma tbk, indof...","[indofarma tbk, indofarma global medika, indof..."


## Check whether the candidates are in true candidates

In [77]:
checks = []

In [78]:
for index, row in eval_df.iterrows():
    candidates = row[['candidate(s)']].values[0]
    true_candidates = row[['true candidates']].values[0]
    check = []
    for i in candidates:
        if i in true_candidates:
            check.append(1)
        else:
            check.append(0)
    print(check)
    checks.append(check)

[1, 1]
[1, 1]
[1, 1]
[1, 1]
[1, 1, 1, 0, 0]
[1, 1]
[1, 1]
[1, 1]
[1, 1, 1]
[1, 1]
[1, 1]
[1, 1]
[1]
[1]
[1, 1]
[1, 1]
[1, 1]
[1, 1]
[1, 1]
[1, 1]
[1]
[1]
[1, 1]
[1, 1]
[1]
[1]
[1, 1]
[1, 1]
[1, 1]
[1, 0]
[1, 0, 1]
[1, 1]
[1, 1]
[1, 1, 1]
[1, 1]
[1, 1]
[0, 0]
[1, 1]
[1, 1]
[1, 1]
[1, 1]
[1, 1]
[1]
[1]
[1, 1]
[1, 1]
[1, 1]
[1, 1]
[1, 1, 1]
[1, 1]
[1, 1]
[1, 1, 0]
[1, 1]
[1]
[1]
[1]
[1, 1]
[1, 1]
[1, 1]
[1, 1]
[1]
[1, 1, 0, 0, 0]
[0, 0, 0]
[1, 0, 1]
[1, 1, 1]
[1, 1, 0]
[1, 1]
[1, 1]
[1, 1]
[1, 1]
[0, 0]
[1, 1]
[1, 1]
[1, 1]
[1]
[1]
[0, 0, 0, 0, 0]
[1]
[1]
[1, 0]
[1, 1, 0]
[1, 1]
[1, 1]
[1, 1, 0, 0]
[1, 1]
[1]
[1, 1]
[1, 0]
[1, 1]
[1, 1]
[0, 0, 0]
[1, 1, 1]
[1, 1]
[1, 1]
[1, 1]
[1]
[1, 1]
[1, 1]
[1]
[1]
[1, 0, 0, 1, 0]
[1, 1]
[1, 1]
[1, 1, 0]
[1, 1]
[1, 1]
[1, 0]
[1]
[1, 1]
[1]
[1]
[1]
[1, 1, 1]
[1, 1, 1]
[1, 1, 1]
[1, 1]
[1, 1, 1]
[1, 1, 0, 0, 0]
[1, 1]
[1, 0]
[1, 1]
[1]
[1, 1]
[1]
[1]
[1, 1]
[1, 1, 0, 0, 1]
[1, 1, 1]
[1, 1, 1]
[1, 1, 1, 0]
[1, 1, 1]
[1, 1, 1, 0]
[1, 1, 1]
[1, 1, 1, 0]
[1

In [81]:
eval_df['checks'] = checks

In [82]:
eval_df

Unnamed: 0,reference,candidate(s) with edit dist.,candidate(s),true candidates,checks
0,Kemenko Polhukam,"{'kemenko polhukam': 0, 'polhukam': 8}","[kemenko polhukam, polhukam]","[kemenko polhukam, polhukam, kementerian koord...","[1, 1]"
1,Polhukam,"{'polhukam': 0, 'kemenko polhukam': 8}","[polhukam, kemenko polhukam]","[kemenko polhukam, polhukam, kementerian koord...","[1, 1]"
2,Kemlu,"{'kemlu': 0, 'kemlu ri': 3}","[kemlu, kemlu ri]","[kemlu, kemlu ri, kementerian luar negeri]","[1, 1]"
3,Kemlu RI,"{'kemlu ri': 0, 'kemlu': 3}","[kemlu ri, kemlu]","[kemlu, kemlu ri, kementerian luar negeri]","[1, 1]"
4,Kementerian BUMN,"{'kementerian bumn': 0, 'kemen bumn': 6, 'keme...","[kementerian bumn, kemen bumn, kementerian bad...","[kementerian bumn, kemen bumn, kementerian bad...","[1, 1, 1, 0, 0]"
...,...,...,...,...,...
932,Indofarma Global Medika,"{'indofarma global medika': 0, 'indofarma glob...","[indofarma global medika, indofarma global med...","[indofarma tbk, indofarma global medika, indof...","[1, 1, 1]"
933,Indofarma Global Medika Cabang,"{'indofarma global medika cabang': 0, 'indofar...","[indofarma global medika cabang, indofarma glo...","[indofarma tbk, indofarma global medika, indof...","[1, 1, 1]"
934,PT IGM,"{'pt igm': 0, 'pt pupuk indonesia holding comp...","[pt igm, pt pupuk indonesia holding company, p...","[indofarma tbk, indofarma global medika, indof...","[1, 0, 0, 0]"
935,Indofarma,"{'indofarma': 0, 'pt indofarma': 3, 'indofarma...","[indofarma, pt indofarma, indofarma tbk, indof...","[indofarma tbk, indofarma global medika, indof...","[1, 1, 1, 1, 0]"


In [83]:
accuracy_pcts = []

In [85]:
for index,row in eval_df.iterrows():
    check = row[['checks']].values[0]
    accuracy_pct = sum(check)/len(check)
    accuracy_pcts.append(accuracy_pct)
#     print(accuracy_pct)
#     break

In [86]:
eval_df['accuracy pct'] = accuracy_pcts

In [87]:
eval_df

Unnamed: 0,reference,candidate(s) with edit dist.,candidate(s),true candidates,checks,accuracy pct
0,Kemenko Polhukam,"{'kemenko polhukam': 0, 'polhukam': 8}","[kemenko polhukam, polhukam]","[kemenko polhukam, polhukam, kementerian koord...","[1, 1]",1.00
1,Polhukam,"{'polhukam': 0, 'kemenko polhukam': 8}","[polhukam, kemenko polhukam]","[kemenko polhukam, polhukam, kementerian koord...","[1, 1]",1.00
2,Kemlu,"{'kemlu': 0, 'kemlu ri': 3}","[kemlu, kemlu ri]","[kemlu, kemlu ri, kementerian luar negeri]","[1, 1]",1.00
3,Kemlu RI,"{'kemlu ri': 0, 'kemlu': 3}","[kemlu ri, kemlu]","[kemlu, kemlu ri, kementerian luar negeri]","[1, 1]",1.00
4,Kementerian BUMN,"{'kementerian bumn': 0, 'kemen bumn': 6, 'keme...","[kementerian bumn, kemen bumn, kementerian bad...","[kementerian bumn, kemen bumn, kementerian bad...","[1, 1, 1, 0, 0]",0.60
...,...,...,...,...,...,...
932,Indofarma Global Medika,"{'indofarma global medika': 0, 'indofarma glob...","[indofarma global medika, indofarma global med...","[indofarma tbk, indofarma global medika, indof...","[1, 1, 1]",1.00
933,Indofarma Global Medika Cabang,"{'indofarma global medika cabang': 0, 'indofar...","[indofarma global medika cabang, indofarma glo...","[indofarma tbk, indofarma global medika, indof...","[1, 1, 1]",1.00
934,PT IGM,"{'pt igm': 0, 'pt pupuk indonesia holding comp...","[pt igm, pt pupuk indonesia holding company, p...","[indofarma tbk, indofarma global medika, indof...","[1, 0, 0, 0]",0.25
935,Indofarma,"{'indofarma': 0, 'pt indofarma': 3, 'indofarma...","[indofarma, pt indofarma, indofarma tbk, indof...","[indofarma tbk, indofarma global medika, indof...","[1, 1, 1, 1, 0]",0.80


In [90]:
eval_df['candidate(s)'][935]

['indofarma',
 'pt indofarma',
 'indofarma tbk',
 'indofarma global medika',
 'pt indofarma persero tbk']

In [91]:
eval_df['true candidates'][935]

['indofarma tbk',
 'indofarma global medika',
 'indofarma global medika cabang',
 'pt igm',
 'indofarma',
 'pt indofarma']

In [93]:
eval_df.groupby('accuracy pct').count()[['candidate(s)']]

Unnamed: 0_level_0,candidate(s)
accuracy pct,Unnamed: 1_level_1
0.0,77
0.2,20
0.25,3
0.333333,11
0.4,62
0.5,40
0.6,31
0.666667,79
0.75,40
0.8,7


In [94]:
eval_df[eval_df['accuracy pct'] == 0.00]

Unnamed: 0,reference,candidate(s) with edit dist.,candidate(s),true candidates,checks,accuracy pct
36,Mahkamah Konstitusi,"{'mahkamah konstitusi': 0, 'mahkamah konstitus...","[mahkamah konstitusi, mahkamah konstitusi mk]","[mahkamah agung, ma, mahkamah agung ma]","[0, 0]",0.0
62,BPKP,"{'bpkp': 0, 'perwakilan bpkp provinsi sulawesi...","[bpkp, perwakilan bpkp provinsi sulawesi barat...","[bpk, badan pemeriksa keuangan, badan pemeriks...","[0, 0, 0]",0.0
70,Badan Tenaga Nuklir Nasional,"{'badan tenaga nuklir nasional': 0, 'badan ten...","[badan tenaga nuklir nasional, badan tenaga nu...","[badan informasi geospasial, big, badan inform...","[0, 0]",0.0
76,Kepolisian Negara Republik Indonesia Resor Kot...,{'kepolisian negara republik indonesia resor k...,[kepolisian negara republik indonesia resor ko...,"[kejaksaan negeri kota kediri, kejaksaaan ting...","[0, 0, 0, 0, 0]",0.0
90,Ombudsman Republik Indonesia,"{'ombudsman republik indonesia': 0, 'ombudsman...","[ombudsman republik indonesia, ombudsman repub...",[kepolisian negara republik indonesia resor ko...,"[0, 0, 0]",0.0
...,...,...,...,...,...,...
863,Pemkab Tanah Laut,"{'pemkab tanah laut': 0, 'pemerintah kabupaten...","[pemkab tanah laut, pemerintah kabupaten tanah...","[pemkab tana tidung, pemerintah ktt, pemerinta...","[0, 0]",0.0
867,Pemkab Timor Tengah Selatan,"{'pemkab timor tengah selatan': 0, 'pemkab tim...","[pemkab timor tengah selatan, pemkab timteng s...","[pemerintah kabupaten timteng utara, pemkab ti...","[0, 0, 0, 0, 0]",0.0
870,Pemkot Banda Aceh,"{'pemkot banda aceh': 0, 'pemerintah kota band...","[pemkot banda aceh, pemerintah kota banda aceh]","[pemkab aceh barat, pemerintah kabupaten aceh ...","[0, 0]",0.0
877,Pemprov Kalimantan Selatan,"{'pemprov kalimantan selatan': 0, 'pemprov kal...","[pemprov kalimantan selatan, pemprov kalsel, p...","[pemprov kalteng, pemerintah provinsi kalteng,...","[0, 0, 0, 0]",0.0
