In [1]:
import pandas as pd
import numpy as np
from get_candidates import get_candidates
import sys
sys.path.append('../')
import indexing.preprocess_reference_data

# Evaluasi Model String Distance - Levenshtein Distance

## Load the test data

In [2]:
df_tr = pd.read_csv('../indexing/data/splitted_data/v1/train.csv')
df_val = pd.read_csv('../indexing/data/splitted_data/v1/val.csv')
df_test = pd.read_csv('../indexing/data/splitted_data/v1/test.csv')

In [3]:
df = pd.concat([df_tr, df_val, df_test])

In [4]:
df.head()

Unnamed: 0,instansi,reference,status
0,"Kementerian Koordinator Bidang Politik, Hukum,...",Kemenko Polhukam,yes
1,"Kementerian Koordinator Bidang Politik, Hukum,...",Polhukam,yes
2,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Politik,no
3,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Hukum,no
4,Kementerian Luar Negeri,Kemlu,yes


In [6]:
prd = indexing.preprocess_reference_data.PreprocessReferenceData()
ref_data = prd.create_reference_data_from_labeled(df)
ref_data = prd.clean_reference_data(ref_data)

In [7]:
ref_data

Unnamed: 0,reference
0,Kemenko Polhukam
1,Polhukam
2,Kemlu
3,Kemlu RI
4,Kementerian BUMN
...,...
932,Indofarma Global Medika
933,Indofarma Global Medika Cabang
934,PT IGM
935,Indofarma


In [8]:
for index, row in ref_data.iterrows():
    print(f"Index: {index}, row: {row[['reference']].values[0]}")

Index: 0, row: Kemenko Polhukam
Index: 1, row: Polhukam
Index: 2, row: Kemlu
Index: 3, row: Kemlu RI
Index: 4, row: Kementerian BUMN
Index: 5, row: Kemen BUMN
Index: 6, row: Kemenparekraf
Index: 7, row: Kemenparekraf RI
Index: 8, row: Kemen PUPR
Index: 9, row: Kemen PUPR RI
Index: 10, row: Kemenpora
Index: 11, row: Kemenpora RI
Index: 12, row: Kemendikbudristek
Index: 13, row: Kemdikbud
Index: 14, row: Kemendag
Index: 15, row: Kemendag RI
Index: 16, row: Kemenhub
Index: 17, row: Kemenhub RI
Index: 18, row: Kemenperin
Index: 19, row: Kemenperin RI
Index: 20, row: Kemhan
Index: 21, row: Departemen Pertahanan
Index: 22, row: Kementan
Index: 23, row: Kementan RI
Index: 24, row: Kemensetneg
Index: 25, row: Setneg RI
Index: 26, row: BPK
Index: 27, row: Badan Pemeriksa Keuangan
Index: 28, row: DPD
Index: 29, row: Anggota Dewan Perwakilan Daerah
Index: 30, row: Anggota DPR
Index: 31, row: Anggota Dewan Perwakilan Rakyat
Index: 32, row: KY
Index: 33, row: Komisi Yudisial KY
Index: 34, row: Mahk

In [9]:
candidates_all = []

for index, row in ref_data.iterrows():
    instansi = row[['reference']].values[0]
    instansi = instansi.lower()
    candidates = get_candidates(instansi)
    candidates_all.append(candidates)

In [13]:
eval_df = ref_data.copy()
eval_df['candidate(s) with edit dist.'] = candidates_all

In [14]:
eval_df.head()

Unnamed: 0,reference,candidate(s) with edit dist.
0,Kemenko Polhukam,"{'kemenko polhukam': 0, 'polhukam': 8}"
1,Polhukam,"{'polhukam': 0, 'kemenko polhukam': 8}"
2,Kemlu,"{'kemlu': 0, 'kemlu ri': 3}"
3,Kemlu RI,"{'kemlu ri': 0, 'kemlu': 3}"
4,Kementerian BUMN,"{'kementerian bumn': 0, 'kemen bumn': 6, 'keme..."


In [40]:
candidates_all_without_edit_dist = []

In [41]:
for index, row in eval_df.iterrows():
    candidates = row[['candidate(s) with edit dist.']].values[0]
    candidates = [i for i in candidates.keys()]
    candidates_all_without_edit_dist.append(candidates)

In [43]:
eval_df['candidate(s)'] = candidates_all_without_edit_dist

In [44]:
eval_df

Unnamed: 0,reference,candidate(s) with edit dist.,candidate(s)
0,Kemenko Polhukam,"{'kemenko polhukam': 0, 'polhukam': 8}","[kemenko polhukam, polhukam]"
1,Polhukam,"{'polhukam': 0, 'kemenko polhukam': 8}","[polhukam, kemenko polhukam]"
2,Kemlu,"{'kemlu': 0, 'kemlu ri': 3}","[kemlu, kemlu ri]"
3,Kemlu RI,"{'kemlu ri': 0, 'kemlu': 3}","[kemlu ri, kemlu]"
4,Kementerian BUMN,"{'kementerian bumn': 0, 'kemen bumn': 6, 'keme...","[kementerian bumn, kemen bumn, kementerian bad..."
...,...,...,...
932,Indofarma Global Medika,"{'indofarma global medika': 0, 'indofarma glob...","[indofarma global medika, indofarma global med..."
933,Indofarma Global Medika Cabang,"{'indofarma global medika cabang': 0, 'indofar...","[indofarma global medika cabang, indofarma glo..."
934,PT IGM,"{'pt igm': 0, 'pt pupuk indonesia holding comp...","[pt igm, pt pupuk indonesia holding company, p..."
935,Indofarma,"{'indofarma': 0, 'pt indofarma': 3, 'indofarma...","[indofarma, pt indofarma, indofarma tbk, indof..."


In [45]:
eval_df.to_csv('./eval/performance_v1.csv')

## Get True References

In [100]:
df2 = df.copy()
df2['cleaned_instansi'] = df2['instansi'].str.replace('/', ' ')
df2['cleaned_instansi'] = df2['cleaned_instansi'].str.replace('[^\w\s]','',regex=True)

In [101]:
df2.head()

Unnamed: 0,instansi,reference,status,cleaned_instansi
0,"Kementerian Koordinator Bidang Politik, Hukum,...",Kemenko Polhukam,yes,Kementerian Koordinator Bidang Politik Hukum d...
1,"Kementerian Koordinator Bidang Politik, Hukum,...",Polhukam,yes,Kementerian Koordinator Bidang Politik Hukum d...
2,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Politik,no,Kementerian Koordinator Bidang Politik Hukum d...
3,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Hukum,no,Kementerian Koordinator Bidang Politik Hukum d...
4,Kementerian Luar Negeri,Kemlu,yes,Kementerian Luar Negeri


In [48]:
df.head(2)

Unnamed: 0,instansi,reference,status
0,"Kementerian Koordinator Bidang Politik, Hukum,...",Kemenko Polhukam,yes
1,"Kementerian Koordinator Bidang Politik, Hukum,...",Polhukam,yes


In [49]:
eval_df.head(2)

Unnamed: 0,reference,candidate(s) with edit dist.,candidate(s)
0,Kemenko Polhukam,"{'kemenko polhukam': 0, 'polhukam': 8}","[kemenko polhukam, polhukam]"
1,Polhukam,"{'polhukam': 0, 'kemenko polhukam': 8}","[polhukam, kemenko polhukam]"


In [99]:
df['instansi'][33]

33               Kementerian Perhubungan
33         Pemerintah Kabupaten Merangin
33    Komisi Pemberantasan Korupsi (KPK)
Name: instansi, dtype: object

In [102]:
true_refs_all = []

In [98]:
for index, row in eval_df.iterrows():
    try:
        reference = row[['reference']].values[0]
#         print(reference)
        instansi_resmi = df2[(df2['reference']==reference)]['instansi'].values[0]
#         print(instansi_resmi)
    except:
        print(index)
        print(row[['reference']].values[0])
        continue
    true_refs = df2[(df2['instansi']==instansi_resmi) 
                   & (df['status']=='yes')]['reference'].values
    true_refs = [x.lower() for x in true_refs]
#     print(true_refs)
    true_refs_all.append(true_refs)
    

33
Komisi Yudisial KY
50
Kementerian Investasi BKPM
84
Lembaga Kebijakan Pengadaan Barang Jasa Pemerintah
117
Bank Indonesia BI
127
Pemerintah Kab Kepulauan Seribu
427
Pemerintah Kabupaten SabuRaijua
428
Pemkab SabuRaijua
538
PT Asabri Persero
548
Indonesia Financial Group IFG
569
PT Biro Klasifikasi Indonesia Persero Cabang Cirebon
571
PT Boma Bisma Indra Persero Cabang Jakarta
579
PT Garam Persero
585
PT Indonesia Asahan Aluminium Persero
588
PT IKI Persero
590
PT INUKI Persero
592
PT ISN Persero
594
PT INTI Persero
625
PT Perinus Persero
626
PT Perkebunan Nusantara III Persero
629
PT PNM Persero
641
PT Pupuk Indonesia Persero
663
SMFIndonesia
671
Kementerian PPN Bappenas
726
PT Taspen Persero
750
Kementerian PP  PA
753
Badan Pengusahaan Batam BP Batam
757
Badan SAR Nasional Basarnas
758
Badan Nasional Pencarian dan Pertolongan BNPP
889
Perum Bulog Kanwil Kaltim  Kaltara
898
Kantor Pusat PT Brantas Abipraya Persero
908
PT Geo Dipa Energi Persero


In [96]:
len(true_refs_all)

906

In [92]:
eval_df['reference'][936]

'PT Telkom Indonesia'

In [93]:
true_refs_all[936]

['dahana', 'pt dahana subang']

In [91]:
len(candidates_all)

937