In [1]:
import pandas as pd
import numpy as np
from get_similar_entity import get_similar_entity
from evaluation import clean_test_df, get_similar_entities_lev_v2

## Prep test data
Test data: distinct nama_instansi penerima dan pemberi from Tabel ai2022

In [3]:
test_data = pd.read_csv('./eval/test_data/distinct_instansi_ai_table.csv')

In [4]:
test_data.head()

Unnamed: 0,nama_instansi
0,Kementerian Kominfo
1,tidak tahu
2,PT Maybank Indonesia Tbk
3,Rekanan
4,Kemko Marves


In [5]:
test_data = clean_test_df(test_data)

In [6]:
test_data.head()

Unnamed: 0,nama_instansi
0,kementerian kominfo
1,tidak tahu
2,pt maybank indonesia tbk
3,rekanan
4,kemko marves


## Predictions

In [8]:
predictions = get_similar_entities_lev_v2(test_data, reference_version='v2')

In [9]:
predictions.to_csv('./eval/automatically_labeled.csv')

In [10]:
predictions.head()

Unnamed: 0,nama_instansi,candidates
0,kementerian kominfo,kominfo
1,tidak tahu,"Bukan instansi BUMN, Kementerian, Pemerintah"
2,pt maybank indonesia tbk,pt bni
3,rekanan,"Bukan instansi BUMN, Kementerian, Pemerintah"
4,kemko marves,"Bukan instansi BUMN, Kementerian, Pemerintah"


In [11]:
predictions.groupby('candidates').count()

Unnamed: 0_level_0,nama_instansi
candidates,Unnamed: 1_level_1
"Bukan instansi BUMN, Kementerian, Pemerintah",1181
anggota dpr,5
anri,1
arsip nasional republik indonesia anri,1
asdp indonesia,1
...,...
skk migas,7
superintending company of indonesia,1
telkom,1
tni angkatan udara,2


## Clean labeled data

In [6]:
train = pd.read_csv('../indexing/data/splitted_data/v2/train.csv')
val = pd.read_csv('../indexing/data/splitted_data/v2/val.csv')
test = pd.read_csv('../indexing/data/splitted_data/v2/test.csv')

In [7]:
df_all = pd.concat([train,val,test])

In [8]:
df_all

Unnamed: 0,instansi,status,reference
0,lembaga pembiayaan ekspor indonesia,yes,lembaga pembiayaan ekspor
1,lembaga pembiayaan ekspor indonesia,yes,lpei
2,lembaga pembiayaan ekspor indonesia,yes,lembaga ekspor indonesia
3,lembaga pembiayaan ekspor indonesia,yes,pembiayaan ekspor indonesia
4,lembaga pembiayaan ekspor indonesia,no,pembiayaan indonesia
...,...,...,...
907,pt indofarma,yes,indofarma
908,pt indofarma,no,igm
909,pt kereta api indonesia (persero) (kai),no,kereta api
910,pt dok dan perkapalan surabaya (persero),no,pt perkapalan


Clean the data

In [9]:
df_all = df_all.reset_index(drop=True)

In [11]:
df_all['cln_instansi'] = df_all['instansi'].str.replace('/', ' ')
df_all['cln_instansi'] = df_all['cln_instansi'].str.replace('[^\w\s]','',regex=True)

In [13]:
df_all['cln_reference'] = df_all['reference'].str.replace('/', ' ')
df_all['cln_reference'] = df_all['cln_reference'].str.replace('[^\w\s]','',regex=True)

In [14]:
df_all = df_all.drop(['reference', 'instansi'], axis=1)

In [16]:
df_all = df_all.rename(columns={'cln_reference':'reference', 'cln_instansi':'instansi'})

In [17]:
df_all

Unnamed: 0,status,instansi,reference
0,yes,lembaga pembiayaan ekspor indonesia,lembaga pembiayaan ekspor
1,yes,lembaga pembiayaan ekspor indonesia,lpei
2,yes,lembaga pembiayaan ekspor indonesia,lembaga ekspor indonesia
3,yes,lembaga pembiayaan ekspor indonesia,pembiayaan ekspor indonesia
4,no,lembaga pembiayaan ekspor indonesia,pembiayaan indonesia
...,...,...,...
4340,yes,pt indofarma,indofarma
4341,no,pt indofarma,igm
4342,no,pt kereta api indonesia persero kai,kereta api
4343,no,pt dok dan perkapalan surabaya persero,pt perkapalan


In [18]:
def get_true_refs(nama_instansi):
    """
    Function to get true references of given nama_instansi
    """
    try:
        official_instansi = df_all[(df_all['reference']==nama_instansi) & 
                                   (df_all['status']=='yes')]['instansi'].values[0]
    except:
        official_instansi = df_all[df_all['instansi']==nama_instansi]['instansi'].values[0]
    
    true_references = df_all[(df_all['instansi']==official_instansi)
                        & (df_all['status']=='yes')]['reference'].values
    
    true_refs = []
    for i in true_references:
        true_refs.append(i)

    true_refs.append(official_instansi)

    return true_refs

In [24]:
get_true_refs('kominfo')

['kementerian komunikasi',
 'kementerian informatika',
 'kominfo',
 'kementerian komunikasi dan informatika']

## Analysis

In [4]:
predictions = pd.read_csv('./eval/automatically_labeled.csv')

In [5]:
predictions

Unnamed: 0.1,Unnamed: 0,nama_instansi,candidates
0,0,kementerian kominfo,kominfo
1,1,tidak tahu,"Bukan instansi BUMN, Kementerian, Pemerintah"
2,2,pt maybank indonesia tbk,pt bni
3,3,rekanan,"Bukan instansi BUMN, Kementerian, Pemerintah"
4,4,kemko marves,"Bukan instansi BUMN, Kementerian, Pemerintah"
...,...,...,...
2155,2156,pt cahaya maha pertiwi,pt pii
2156,2157,pemkab sintang,pemkab sintang
2157,2158,desa ginuk,"Bukan instansi BUMN, Kementerian, Pemerintah"
2158,2159,direktur utama pt bni,direktur utama pt bni


In [25]:
true_refs = []
status = []

In [26]:
for index, row in predictions.iterrows():
    nama_inst = row[['candidates']].values[0]
    if nama_inst == 'Bukan instansi BUMN, Kementerian, Pemerintah':
        true_refs.append([])
        status.append('yes')
    else:
        true_ref = get_true_refs(nama_inst)
        true_refs.append(true_ref)
        test_instansi = row[['nama_instansi']].values[0]
        if test_instansi in true_ref:
            status.append('yes')
        else:
            status.append('no')

In [27]:
predictions['true_references'] = true_refs
predictions['status'] = status

In [28]:
predictions

Unnamed: 0.1,Unnamed: 0,nama_instansi,candidates,true_references,status
0,0,kementerian kominfo,kominfo,"[kementerian komunikasi, kementerian informati...",no
1,1,tidak tahu,"Bukan instansi BUMN, Kementerian, Pemerintah",[],yes
2,2,pt maybank indonesia tbk,pt bni,"[pt bni, bni, bank negara indonesia, bank bni,...",no
3,3,rekanan,"Bukan instansi BUMN, Kementerian, Pemerintah",[],yes
4,4,kemko marves,"Bukan instansi BUMN, Kementerian, Pemerintah",[],yes
...,...,...,...,...,...
2155,2156,pt cahaya maha pertiwi,pt pii,"[pt penjaminan infrastruktur indonesia, penjam...",no
2156,2157,pemkab sintang,pemkab sintang,"[pemkab sintang, pemerintah sintang, pemerinta...",yes
2157,2158,desa ginuk,"Bukan instansi BUMN, Kementerian, Pemerintah",[],yes
2158,2159,direktur utama pt bni,direktur utama pt bni,"[pt bni, bni, bank negara indonesia, bank bni,...",yes


In [30]:
predictions.groupby('status').count()

Unnamed: 0_level_0,Unnamed: 0,nama_instansi,candidates,true_references
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,601,601,601,601
yes,1559,1558,1559,1559


In [31]:
predictions.isnull().sum()

Unnamed: 0         0
nama_instansi      1
candidates         0
true_references    0
status             0
dtype: int64

In [34]:
test_data = pd.read_csv('./eval/test_data/distinct_instansi_ai_table.csv')

In [35]:
test_data.isnull().sum()

nama_instansi    1
dtype: int64