In [1]:
import pandas as pd
import index
import search
import preprocess_reference_data

In [2]:
# Load the train / validation / test splits of the v1 dataset, in that order.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/v1/train.csv',
        './data/v1/val.csv',
        './data/v1/test.csv',
    )
)

In [3]:
# Stack the three splits row-wise into one frame; each split keeps its own
# row labels here, so labels repeat across splits.
dfAll = pd.concat((df1, df2, df3), axis=0)

In [4]:
dfAll

Unnamed: 0,instansi,reference,status
0,"Kementerian Koordinator Bidang Politik, Hukum,...",Kemenko Polhukam,yes
1,"Kementerian Koordinator Bidang Politik, Hukum,...",Polhukam,yes
2,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Politik,no
3,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Hukum,no
4,Kementerian Luar Negeri,Kemlu,yes
...,...,...,...
584,"PT Telkom Indonesia (Persero), Tbk.",Telkom University,no
585,"PT Telkom Indonesia (Persero), Tbk.",Telkom,no
586,PT Semen Kupang,Kupang,no
587,PT Semen Kupang,Pemkot Kupang,no


In [5]:
# Replace the repeated per-split row labels with a fresh 0..n-1 index, keeping
# the old labels as an `index` column (drop=False is the pandas default).
dfAll = dfAll.reset_index(drop=False)

In [6]:
dfAll

Unnamed: 0,index,instansi,reference,status
0,0,"Kementerian Koordinator Bidang Politik, Hukum,...",Kemenko Polhukam,yes
1,1,"Kementerian Koordinator Bidang Politik, Hukum,...",Polhukam,yes
2,2,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Politik,no
3,3,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Hukum,no
4,4,Kementerian Luar Negeri,Kemlu,yes
...,...,...,...,...
2478,584,"PT Telkom Indonesia (Persero), Tbk.",Telkom University,no
2479,585,"PT Telkom Indonesia (Persero), Tbk.",Telkom,no
2480,586,PT Semen Kupang,Kupang,no
2481,587,PT Semen Kupang,Pemkot Kupang,no


# Create a New DataFrame of Distinct Values from the `instansi` Column

In [8]:
# One row per distinct `instansi` value, in order of first appearance.
# The single column is labelled 0 at this point.
df_dist = pd.DataFrame(pd.unique(dfAll['instansi']))

In [9]:
df_dist

Unnamed: 0,0
0,"Kementerian Koordinator Bidang Politik, Hukum,..."
1,Kementerian Luar Negeri
2,Kementerian Badan Usaha Milik Negara
3,Kementerian Pariwisata dan Ekonomi Kreatif
4,Kementerian Pekerjaan Umum dan Perumahan Rakya...
...,...
532,"PT Taman Wisata Candi Borobudur, Prambanan, Da..."
533,PT Varuna Tirta Prakasya (Persero)
534,PT Indofarma
535,"PT Telkom Indonesia (Persero), Tbk."


In [10]:
# Give the single column the same name (`reference`) used by the reference data,
# so the two frames can be concatenated into one column.
df_dist = df_dist.rename({0: 'reference'}, axis='columns')

In [11]:
df_dist

Unnamed: 0,reference
0,"Kementerian Koordinator Bidang Politik, Hukum,..."
1,Kementerian Luar Negeri
2,Kementerian Badan Usaha Milik Negara
3,Kementerian Pariwisata dan Ekonomi Kreatif
4,Kementerian Pekerjaan Umum dan Perumahan Rakya...
...,...
532,"PT Taman Wisata Candi Borobudur, Prambanan, Da..."
533,PT Varuna Tirta Prakasya (Persero)
534,PT Indofarma
535,"PT Telkom Indonesia (Persero), Tbk."


# Preprocess Reference Data

In [12]:
# Wrap the combined frame in the project-local preprocessor
# (PreprocessReferenceData lives in preprocess_reference_data, not in this notebook).
prd = preprocess_reference_data.PreprocessReferenceData(dfAll)

In [13]:
# Fetch the preprocessed frame; as displayed it has a single `reference` column.
# NOTE(review): the displayed rows look like only references whose status is
# 'yes' are kept — confirm against preprocess_reference_data.
reference_data = prd.get_preprocessed_df()

In [14]:
reference_data

Unnamed: 0,reference
0,Kemenko Polhukam
1,Polhukam
2,Kemlu
3,Kemlu RI
4,Kementerian BUMN
...,...
933,Indofarma Global Medika
934,Indofarma Global Medika Cabang
935,PT IGM
936,Indofarma


In [15]:
# Append the distinct `instansi` names to the preprocessed references.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each input frame's own row labels (which would produce duplicates).
reference_data = pd.concat([reference_data, df_dist], ignore_index=True)

In [16]:
# De-duplicate the combined names, keeping first-appearance order.
# The rebuilt frame has a single column labelled 0.
reference_data = pd.DataFrame(pd.unique(reference_data['reference']))

In [17]:
# Materialise the row labels as an `index` column (drop=False is the pandas
# default); that column is removed again in a later cell.
reference_data = reference_data.reset_index(drop=False)

In [18]:
reference_data

Unnamed: 0,index,0
0,0,Kemenko Polhukam
1,1,Polhukam
2,2,Kemlu
3,3,Kemlu RI
4,4,Kementerian BUMN
...,...,...
1466,1466,"PT Taman Wisata Candi Borobudur, Prambanan, Da..."
1467,1467,PT Varuna Tirta Prakasya (Persero)
1468,1468,PT Indofarma
1469,1469,"PT Telkom Indonesia (Persero), Tbk."


In [19]:
# Remove the helper `index` column added by the earlier reset_index call.
reference_data = reference_data.drop(columns=['index'])

In [20]:
# Restore the `reference` column name lost when the frame was rebuilt from
# the array of unique values.
reference_data = reference_data.rename({0: 'reference'}, axis='columns')

In [21]:
reference_data

Unnamed: 0,reference
0,Kemenko Polhukam
1,Polhukam
2,Kemlu
3,Kemlu RI
4,Kementerian BUMN
...,...
1466,"PT Taman Wisata Candi Borobudur, Prambanan, Da..."
1467,PT Varuna Tirta Prakasya (Persero)
1468,PT Indofarma
1469,"PT Telkom Indonesia (Persero), Tbk."


In [22]:
# Persist the reference names. The row index is written as an unnamed first
# column (index=True is the default), which the reload step reads back via
# index_col='Unnamed: 0'.
reference_data.to_csv('./data/v2/reference_data.csv', index=True)

# Create Indexing Table

In [23]:
# Build the project-local indexer over the de-duplicated reference names
# (Index is defined in the local `index` module, not in this notebook).
idx = index.Index(reference_data)

In [24]:
# Retrieve the index table. As displayed it has a `Kata` (word) column and an
# `Index` column listing the reference_data row positions containing that word.
# NOTE(review): displayed tokens keep attached punctuation (e.g. 'Borobudur,'),
# which suggests plain whitespace splitting — confirm in the index module.
index_table = idx.get_index_table()

In [25]:
index_table

Unnamed: 0,Kata,Index
0,Kemenko,[0]
1,Polhukam,"[0, 1]"
2,Kemlu,"[2, 3]"
3,RI,"[3, 7, 9, 11, 15, 17, 19, 23, 25, 80, 81, 85, ..."
4,Kementerian,"[4, 50, 51, 671, 745, 750, 937, 938, 939, 940,..."
...,...,...
955,(Pelni),[1463]
956,"Borobudur,",[1466]
957,"Prambanan,",[1466]
958,Ratu,[1466]


In [26]:
# Persist the index table. The row index is written as an unnamed first column
# (index=True is the default), which the reload step reads back via
# index_col='Unnamed: 0'.
index_table.to_csv('./data/v2/index_table.csv', index=True)

# Perform Search

In [27]:
# Reload both artefacts from disk, using the unnamed first CSV column as the
# row index.
rd = pd.read_csv('./data/v2/reference_data.csv', index_col='Unnamed: 0')
# NOTE(review): after a CSV round trip the `Index` column holds text such as
# "[0, 1]" rather than list objects — presumably search.Search parses it; confirm.
it = pd.read_csv('./data/v2/index_table.csv', index_col='Unnamed: 0')

In [28]:
rd.head(1)

Unnamed: 0,reference
0,Kemenko Polhukam


In [29]:
it.head(1)

Unnamed: 0,Kata,Index
0,Kemenko,[0]


In [30]:
# Instantiate the project-local searcher with the reloaded reference data (rd)
# and index table (it). Search is defined in the local `search` module.
search_ = search.Search(rd, it)

In [31]:
# Example query; `nama_instansi` is the agency name to look up.
# In the recorded (truncated) output the result is a dict whose visible key is
# the query word 'kementerian', mapped to a list of matching reference names.
nama_instansi = "kementerian pertanian"
search_.search(nama_instansi)

{'kementerian': ['Kementerian BUMN',
  'Kementerian Investasi/BKPM',
  'Kementerian Investasi',
  'Kementerian PPN/Bappenas',
  'Kementerian Koperasi dan UKM',
  'Kementerian PP & PA',
  'Kementerian Koordinator Bidang Politik, Hukum, dan Keamanan',
  'Kementerian Luar Negeri',
  'Kementerian Badan Usaha Milik Negara',
  'Kementerian Pariwisata dan Ekonomi Kreatif',
  'Kementerian Pekerjaan Umum dan Perumahan Rakyat (PUPR)',
  'Kementerian Pemuda dan Olahraga',
  'Kementerian Pendidikan, Kebudayaan, Riset, dan Teknologi',
  'Kementerian Perdagangan',
  'Kementerian Perhubungan',
  'Kementerian Perindustrian',
  'Kementerian Pertahanan',
  'Kementerian Pertanian',
  'Kementerian Sekretariat Negara',
  'Kementerian Perencanaan Pembangunan Nasional (PPN)/Badan Perencanaan Pembangunan Nasional (Bappenas)',
  'Kementerian Sosial',
  'Kementerian Koperasi dan Usaha Kecil dan Menengah',
  'Kementerian Pendayagunaan Aparatur Negara dan Reformasi Birokrasi (Kem PANRB)',
  'Kementerian Pemberday

In [34]:
# Single-word query; the recorded result maps 'kereta' to four matching names.
search_.search("kereta")

{'kereta': ['PT Industri Kereta Api',
  'PT Kereta Api Indonesia ',
  'PT Industri Kereta Api (Persero) (INKA)',
  'PT Kereta Api Indonesia (Persero) (KAI)']}

In [61]:
# Two-word query; the recorded result only has a 'bri' key — 'custody'
# produced no entry.
search_.search("bri custody")

{'bri': ['Bank BRI Unit Prambanan']}