In [1]:
!pip install pandarallel
!pip install pyahocorasick
!pip install --upgrade pip
!pip install spacy==3.0.*

Collecting pandarallel
  Downloading pandarallel-1.5.2.tar.gz (16 kB)
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l- \ done
[?25h  Created wheel for pandarallel: filename=pandarallel-1.5.2-py3-none-any.whl size=18384 sha256=600218c777e44e24875ea22adb81da82a0e834e2389dca6deb5ffb02753c5b27
  Stored in directory: /root/.cache/pip/wheels/b5/6d/51/9ece2eaf007ea3f7fb0ce053c5773b2eb0d308887da3af12c1
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.5.2
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.1.tar.gz (321 kB)
[K     |████████████████████████████████| 321 kB 1.2 MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l- \ | / done
[?25h  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.1-cp37-cp37m-linux_x86_64.whl size=102846 sha256=dfb2f6e898c79effea38

In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

from pandarallel import pandarallel
pandarallel.initialize()

import spacy
from spacy.training import Example
import random

import ahocorasick

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
df = pd.read_csv("../input/scl-2021-ds/train.csv")
df.set_index("id", inplace=True)
df['POI'] = np.nan
df['street'] = np.nan

def extract_entities(row):
    extracted = row['POI/street'].split("/")
    
    if len(extracted) == 2:
        poi, street = extracted
        if poi.strip() != '':
            row['POI'] = poi
        
        if street.strip() != '':
            row['street'] = street
        
    return row

df = df.parallel_apply(extract_entities, axis=1)
nlp = spacy.blank('id')  # create blank Language class

In [4]:
from copy import deepcopy

def _build_aho(words):
    aho = ahocorasick.Automaton()
    for idx, key in enumerate(words):
        aho.add_word(key, (idx, key))

    return aho

def format_data(text, poi, street):
    entities = []
    _text = deepcopy(text)

    if isinstance(poi, str):
        aho = _build_aho([poi])
        aho.make_automaton()
        latest_char_idx = 0
        
        for end, (_, word) in aho.iter(_text):
            start = end - len(word) + 1
            if start < latest_char_idx:
                continue

            entities.append((start, end + 1, 'POI'))
            _text = _text.replace(word, " " * len(word))
            latest_char_idx = end + 1
        
    if isinstance(street, str):
        aho = _build_aho([street])
        aho.make_automaton()
        latest_char_idx = 0

        for end, (_, word) in aho.iter(_text):
            start = end - len(word) + 1
            if start < latest_char_idx:
                continue

            entities.append((start, end + 1, 'STREET'))
            latest_char_idx = end + 1
    
    return Example.from_dict(nlp.make_doc(text), {"entities": entities})

In [5]:
print("Preparing Spacy examples...")

examples = []
for idx in df.index:
    try:
        row = df.loc[idx]
        example = format_data(row['raw_address'], row['POI'], row['street'])
        examples.append(example)
    except Exception as e:
        print(idx)
        print("-" * 50)
        print(e)
        break

Preparing Spacy examples...


In [6]:
def train_spacy(nlp, examples, iterations):
    TRAIN_DATA = examples
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            for example in examples:
                nlp.update(
                    [example],
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [7]:
train = examples[:1000]

In [8]:
%%time
ner_nlp = train_spacy(nlp, train, 20)

[2021-03-14 12:16:48,501] [INFO] Created vocabulary
[2021-03-14 12:16:48,503] [INFO] Finished initializing nlp object


Starting iteration 0
{'ner': 1593.752131479031}
Starting iteration 1
{'ner': 1356.2414852303602}
Starting iteration 2
{'ner': 1039.1322869061582}
Starting iteration 3
{'ner': 803.4610263547089}
Starting iteration 4
{'ner': 686.0507712694739}
Starting iteration 5
{'ner': 597.6766044558012}
Starting iteration 6
{'ner': 544.4457384367031}
Starting iteration 7
{'ner': 417.3591129597886}
Starting iteration 8
{'ner': 383.72910013047886}
Starting iteration 9
{'ner': 283.704569734301}
Starting iteration 10
{'ner': 285.1407638495554}
Starting iteration 11
{'ner': 290.40882803727123}
Starting iteration 12
{'ner': 252.32356877679442}
Starting iteration 13
{'ner': 244.90817650918942}
Starting iteration 14
{'ner': 237.22882166935443}
Starting iteration 15
{'ner': 203.74835436724933}
Starting iteration 16
{'ner': 211.71486276331456}
Starting iteration 17
{'ner': 174.35693278933218}
Starting iteration 18
{'ner': 165.54111778170073}
Starting iteration 19
{'ner': 111.69934397446634}
CPU times: user 6mi

In [9]:
for idx, row in df.iloc[100:110].iterrows():
    print(f"address: {row['raw_address']}")
    print(f"expected poi: {row['POI']}")
    print(f"expected street: {row['street']}")
    print()
    
    doc = ner_nlp(row['raw_address'])
    for ent in doc.ents:
        print(ent.text, "-", ent.label_)

    print("-" * 50)

address: kedai tenun jep senn, kota bumi, kebon melati
expected poi: kedai tenun jepara sennaart
expected street: kota bumi

kota bumi - STREET
--------------------------------------------------
address: wadungasri dalam waru raya wad asri, 24 sidoarjo
expected poi: dalam waru
expected street: raya wad asri

dalam waru - POI
raya wad asri - STREET
--------------------------------------------------
address: bulusan tim barat iii, no 35 3 tembalang
expected poi: nan
expected street: tim barat iii

tim barat iii - STREET
--------------------------------------------------
address: bakti jaya bukit perm vii 8 15315 setu
expected poi: nan
expected street: bukit perm vii

bukit perm vii - STREET
--------------------------------------------------
address: jl terusan buah batu no 185. samping indomaret. bandung.
expected poi: samping indomaret
expected street: jl terusan buah batu

jl terusan buah batu - STREET
samping indomaret - POI
--------------------------------------------------
address: 

In [10]:
df_test = pd.read_csv("../input/scl-2021-ds/test.csv")
df_test.set_index("id", inplace=True)

In [11]:
submission = []
for idx, row in df_test.iloc[:].iterrows():
    doc = ner_nlp(row['raw_address'])
    tmp = {'id': idx}
    for ent in doc.ents:
        tmp[ent.label_] = ent.text
    submission.append(tmp)
    


In [12]:
submission

[{'id': 0, 'STREET': 's. par'},
 {'id': 1, 'STREET': 'angg per'},
 {'id': 2, 'STREET': 'mand imog'},
 {'id': 3, 'STREET': 'raya nga sri'},
 {'id': 4, 'POI': 'cut mutia'},
 {'id': 5},
 {'id': 6},
 {'id': 7},
 {'id': 8},
 {'id': 9, 'STREET': 'raya won'},
 {'id': 10},
 {'id': 11, 'STREET': 'kemur viii'},
 {'id': 12, 'STREET': 'kimia farma'},
 {'id': 13},
 {'id': 14, 'POI': 'toko teddy raya pan jakat'},
 {'id': 15, 'STREET': 'raya sawungg'},
 {'id': 16, 'STREET': 'jl tanjung ii'},
 {'id': 17, 'STREET': 'batik ayu 3'},
 {'id': 18},
 {'id': 19},
 {'id': 20, 'STREET': 'jatikarya gg. damai 1'},
 {'id': 21, 'STREET': 'tpa amara rachma'},
 {'id': 22, 'POI': 'pendowo gg. pan'},
 {'id': 23, 'STREET': 'tebet barat vi'},
 {'id': 24, 'STREET': 'gad kir timur vi'},
 {'id': 25, 'STREET': 'kp taman'},
 {'id': 26, 'STREET': 'raya bant'},
 {'id': 27, 'STREET': 'pulo mas barat vi'},
 {'id': 28, 'STREET': 'jembatan merah putih ambon'},
 {'id': 29},
 {'id': 30, 'POI': 'belajar gane opera ahmad'},
 {'id': 31,

In [13]:
pd.DataFrame(submission)

Unnamed: 0,id,STREET,POI
0,0,s. par,
1,1,angg per,
2,2,mand imog,
3,3,raya nga sri,
4,4,,cut mutia
...,...,...,...
49995,49995,,toko mbak farid semboro semboro
49996,49996,vete,3 cari
49997,49997,nasio,
49998,49998,,graha indah pamulang


In [14]:
submission = pd.DataFrame(submission)
submission['POI/street'] = submission['POI'] + '/' + submission['STREET'] 

In [15]:
pd.DataFrame({'id':submission['id'],'POI/street':submission['POI/street']}).to_csv('submission.csv', header=True, index=False)