In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from time import time

In [2]:
# import training dataset

train = pd.read_csv("train/train.csv")
train.head()


Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru


In [3]:
# split the cleaned poi and street into respective cols

trainsplit = train['POI/street'].str.split('/', 1, expand = True)
trainsplit

Unnamed: 0,0,1
0,,jl kapuk timur delta sili iii lippo cika
1,,
2,,siung
3,toko dita,
4,,jl. orde baru
...,...,...
299995,,jend ahmad yani
299996,,raya cila kko
299997,,
299998,taman asri,


In [4]:
# add the poi and street into the training dataframe

train['POI/street'].replace('/', np.nan, inplace=True)
train['POI'] = trainsplit[0]
train['street'] = trainsplit[1]
df = train.copy()

In [5]:
# compare the cleaned street and poi against the raw address
# return the index of the first word that requires cleaning in the raw address

def data_cleaning_index(raw, clean):
    cleanlen = len(clean)
    rawlen = len(raw)
    
    if cleanlen < rawlen:
        
        # find overlap words
        overlap = list(set(raw) & set(clean))
        
        if len(overlap) > 0:
            firstoverlap = overlap[0]
            rawindex_first = raw.index(firstoverlap)     #get the index of the first overlap in the raw_address
            cleanindex_first = clean.index(firstoverlap) #get the index of the first overlap in the cleaned address
            start = rawindex_first - cleanindex_first    #find the start of the cleaned address
            return start
        else :
            return 0
        
    else :
        return 0           
            

In [6]:
# testing on row 998

num = 998
raw1 = df.iloc[num].raw_address
street1 = df.iloc[num].street
poi = df.iloc[num].POI

clean = word_tokenize(raw1)
streettkn = word_tokenize(street1)
poitkn = word_tokenize(poi)

first_index = data_cleaning_index(clean, streettkn)
clean[first_index: first_index+len(streettkn)] = streettkn

first_index = data_cleaning_index(clean, poitkn)
clean[first_index: first_index+len(poitkn)] = poitkn



In [7]:
# clean all the words from the raw_address based on the data_cleaning_index function

def data_clean_all(row):
       
    street = word_tokenize(row.street)
    rawadd = word_tokenize(row.raw_address)
    poi = word_tokenize(row.POI)
    
    startindex = data_cleaning_index(rawadd, street)
    rawadd[startindex: startindex + len(street)] = street

    startindex = data_cleaning_index(rawadd, poi)
    rawadd[startindex: startindex + len(poi)] = poi
    
    cleaned = ' '.join(rawadd)
    cleaned = cleaned.replace(' .', '.')
    cleaned = cleaned.replace(' ,', ',')
    cleaned = cleaned.replace(' )', ')')
    cleaned = cleaned.replace('( ', '(')
    return cleaned

In [8]:
#using the data_clean_all function to clean all raw_address to the correct phrasing and assign to a new column

start = time()
df['cleaned_add'] = df.apply(data_clean_all, axis = 1)
end = time()
total_time = round((end - start)/60, 3)
print("Total time takent to execute : " + format(total_time))

Total time takent to execute : 1.216


In [9]:
df.head()

Unnamed: 0,id,raw_address,POI/street,POI,street,cleaned_add
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,jl kapuk timur delta sili iii lippo cika 11 a ...
1,1,"aye, jati sampurna",,,,"aye, jati sampurna"
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,setu siung 119 rt 5 1 13880 cipayung
3,3,"toko dita, kertosono",toko dita/,toko dita,,"toko dita, kertosono"
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru,jl. orde baru


In [10]:
# drop the col poi/street
df = df.drop(["POI/street"], axis = 1)


In [11]:
# create a tuple of unclean and clean words for street

def map_street(row):
    
    street = word_tokenize(row.street)
    rawadd = word_tokenize(row.raw_address)
    
    startindex = data_cleaning_index(rawadd, street)
    old = ' '.join(rawadd[startindex: startindex + len(street)])
    old = old.replace(' ,', ',')
    old = old.replace(' .', '.')
    old = old.replace(' )', ')')
    old = old.replace('( ', '(')
    new = row.street
    
    if old == new:
        return None
    
    return old, new
    

In [12]:
# create a tuple of unclean and clean words for poi

def map_poi(row):

    rawadd = word_tokenize(row.raw_address)
    poi = word_tokenize(row.POI)

    startindex = data_cleaning_index(rawadd, poi)
    old = ' '.join(rawadd[startindex: startindex + len(poi)])
    old = old.replace(' ,', ',')
    old = old.replace(' .', '.')
    old = old.replace(' )', ')')
    old = old.replace('( ', '(')
    new = row.POI
    if old == new:
        return None
    return old, new
    

In [13]:
# assign the tuple of unclean and clean words into new columns for both street and poi

start = time()
df['street_dic'] = df.apply(map_street, axis = 1)
df['poi_dic'] = df.apply(map_poi, axis = 1)
end = time()
totaltime = round((end - start)/60, 3)
print("Total time takent to execute : " + format(totaltime))

Total time takent to execute : 1.789


In [14]:
# check the new tuples for row 998

df.loc[[998], :]

Unnamed: 0,id,raw_address,POI,street,cleaned_add,street_dic,poi_dic
998,998,"yaya pelayanan halieluyah, tebet raya, 30d rw ...",yayasan pelayanan halieluyah,tebet raya,"yayasan pelayanan halieluyah, tebet raya, 30d ...",,"(yaya pelayanan halieluyah, yayasan pelayanan ..."


In [15]:
# remove all rows that do not have any tuples under street_dic to lessen the runtime

allstreet = df[df['street_dic'].notnull()]
dict_street = dict()

# create a dictionary with key as the unclean street and value as cleaned street
for index, row in allstreet.iterrows():
    
    # remove those tuple with only 1 word as it will not be accurate
    # do not process those that are already inside the dictionary
    if len(row['street_dic'][0].split()) > 1 and row['street_dic'][0] not in dict_street:
        if row['street_dic'][1][-1] == " ":
            curitem = row['street_dic'][1][0:-1]
            dict_street[row['street_dic'][0]] = curitem
        else:
            dict_street[row['street_dic'][0]] = row['street_dic'][1]
    
    
# remove all rows that do not have any tuples under poi_dic to lessen the runtime
allpoi = df[df['poi_dic'].notnull()]
dict_poi = dict()

# create a dictionary with key as the unclean poi and value as cleaned poi
for index, row in allpoi.iterrows():
    
    # remove those tuple with only 1 word as it will not be accurate
    # do not process those that are already inside the dictionary
    if len(row['poi_dic'][0].split()) > 1 and row['poi_dic'][0] not in dict_poi:
        if row['poi_dic'][1][-1] == " ":
            curitem = row['poi_dic'][1][0:-1]
            dict_street[row['poi_dic'][0]] = curitem
        else:
            dict_poi[row['poi_dic'][0]] = row['poi_dic'][1]
    
print("Length of street dictionary: " + format(len(dict_street)))
print("Length of poi dictionary: " + format(len(dict_poi)))

Length of street dictionary: 10457
Length of poi dictionary: 42426


In [16]:
# function to return the cleaned street base on the dictionary

def get_street(row):
    newstreet = ""
    raw = row.raw_address
    for k, v in dict_street.items():
        if (k in raw):
            raw = raw.replace(k,v)
            newstreet += v + " "
    
    if newstreet:
        newstreet = newstreet[0:-1] 
    
    return newstreet


In [17]:
# function to return the cleaned poi base on the dictionary

def get_poi(row):
    newpoi = ""
    raw = row.raw_address
    
    for k, v in dict_poi.items():
        if (k in raw):
            raw = raw.replace(k,v)
            newpoi += v + " "
    if newpoi:
        newpoi = newpoi[0:-1] 
    
    return newpoi

In [18]:
# apply the functions to retrieve the clean street and poi, storing them in new cols

start = time()
test = pd.read_csv("test/test.csv")
test['street'] = test.apply(get_street, axis = 1)
test['poi'] = test.apply(get_poi, axis = 1)
end = time()
totaltime = round((end - start)/60, 3)
print("Total time taken : " + format(totaltime))


Total time taken : 5.701


In [19]:
# create a new col street/POI to concat both cleaned street and cleaned poi
test["street/POI"] = test["street"] + "/" + test["poi"]
test

Unnamed: 0,id,raw_address,street,poi,street/POI
0,0,s. par 53 sidanegara 4 cilacap tengah,s. parman manyar 5,,s. parman manyar 5/
1,1,"angg per, baloi indah kel. lubuk baja",baloi pers anggrek per,,baloi pers anggrek per/
2,2,"asma laun, mand imog,",", mangun",,", mangun/"
3,3,"ud agung rej, raya nga sri wedari karanganyar",raya ngawi- raya ngawi-m,ud agung rejeki,raya ngawi- raya ngawi-m/ud agung rejeki
4,4,"cut mutia, 35 baiturrahman",cut meu cut meutia,,cut meu cut meutia/
...,...,...,...,...,...
49995,49995,toko mbak farid semboro semboro,,,/
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi",,,/
49997,49997,"mart dan roti bakar malabar, nasio,",,,/
49998,49998,graha indah pamulang jl. mujair raya bambu apu...,bambu hub jl. pln raya bamban,,bambu hub jl. pln raya bamban/


In [20]:
# drop irrelevant cols and assinged to final_result

final_result = test.drop(["street","poi", "raw_address"], axis = 1)
final_result.head()
len(final_result)

50000

In [21]:
# export result to csv file

final_result.to_csv('Cleaned address_test.csv', index = False)

In [23]:
final_result.head()

Unnamed: 0,id,street/POI
0,0,s. parman manyar 5/
1,1,baloi pers anggrek per/
2,2,", mangun/"
3,3,raya ngawi- raya ngawi-m/ud agung rejeki
4,4,cut meu cut meutia/
