In [None]:
import os
import pickle
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from time import time

#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Load the labelled training set and break the combined target column into
# its two parts.
train_df = pd.read_csv('train.csv')

# Each target is encoded as '<POI>/<street>'. Partition on the FIRST '/' only,
# so a street that itself contains '/' is kept whole instead of being cut at
# its second slash (which is what `split('/')[1]` did). `str.partition` always
# yields three columns (before, separator, after), so a label without any '/'
# gets an empty street instead of raising IndexError.
poi_street = train_df['POI/street'].str.partition('/')
train_df['POI'] = poi_street[0]
train_df['street'] = poi_street[2]

# The combined column is no longer needed once both parts are extracted.
del train_df['POI/street']
train_df.head()

Unnamed: 0,id,raw_address,POI,street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,,jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",,
2,2,setu siung 119 rt 5 1 13880 cipayung,,siung
3,3,"toko dita, kertosono",toko dita,
4,4,jl. orde baru,,jl. orde baru


In [None]:
# Unlabelled addresses to predict on; preview the first rows as a sanity check.
test_df = pd.read_csv(
    'test.csv',
)
test_df.head(n=5)

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per, baloi indah kel. lubuk baja"
2,2,"asma laun, mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar"
4,4,"cut mutia, 35 baiturrahman"


In [None]:
# Targets: one label column per sub-task (POI and street are modelled separately).
y_train_poi, y_train_street = train_df['POI'], train_df['street']

# Features: bag-of-words token counts over the raw address strings. The fitted
# vectorizer `cv` is reused later to encode the test addresses with the same
# vocabulary.
X = train_df['raw_address']
cv = CountVectorizer()
X_train = cv.fit_transform(X)

n_rows_and_cols = X_train.shape
print(f'X shape: {n_rows_and_cols}')

X shape: (300000, 81117)


In [None]:
# Train one Multinomial Naive Bayes classifier per target (POI, street) in
# mini-batches, then predict on the test set and write the submission file.
start = time()

total = len(train_df)
size = 3000
# Start offset of every mini-batch. Stepping up to `total` (instead of
# `total - size + 1`) keeps the final short batch when `total` is not a
# multiple of `size`; slices that run past the end are clamped automatically.
batch = list(range(0, total, size))
n_iter = 20

# MultinomialNB only accumulates per-class feature counts, so feeding the same
# data `n_iter` times multiplies every count by `n_iter`. With smoothing
# alpha=1 that is mathematically identical to a single pass with
# alpha = 1 / n_iter: (n*c + 1) / (n*N + V) == (c + 1/n) / (N + V/n), and the
# class priors are scale-invariant. One pass therefore yields the same model
# at 1/n_iter of the training cost.
alpha = 1 / n_iter
mnb_poi = MultinomialNB(alpha=alpha, fit_prior=True)
mnb_street = MultinomialNB(alpha=alpha, fit_prior=True)

# Same shuffle as the first epoch of the original schedule, so the first batch
# (which determines the classes each model knows about) is unchanged.
X, y_poi, y_street = shuffle(X_train, y_train_poi, y_train_street, random_state=0)

# NOTE(review): `fit` on the first batch fixes `classes_` to the labels present
# in that batch, and the later `partial_fit` calls are made without `classes=`,
# so labels that first appear in later batches presumably never become
# predictable classes -- confirm against the scikit-learn docs. Passing every
# unique label up front would make the (n_classes x n_features) count matrix
# very large, so this is flagged rather than changed here.
for b, lo in enumerate(batch):
    hi = lo + size
    X_b = X[lo:hi]
    # .iloc makes the positional intent explicit: the shuffled Series keep
    # their original (now out-of-order) integer index labels.
    y_poi_b = y_poi.iloc[lo:hi]
    y_street_b = y_street.iloc[lo:hi]
    if b == 0:
        mnb_poi.fit(X_b, y_poi_b)
        mnb_street.fit(X_b, y_street_b)
    else:
        mnb_poi.partial_fit(X_b, y_poi_b)
        mnb_street.partial_fit(X_b, y_street_b)

print('Training complete!')
print(f'Time elapsed: {round((time() - start)/60, 2)} minutes.')
print()

# Encode the test addresses once and reuse the matrix for both models.
X_test = cv.transform(test_df['raw_address'])
poi_pred = mnb_poi.predict(X_test)
street_pred = mnb_street.predict(X_test)

# Submission format: one 'POI/street' string per test id.
answer_df = pd.DataFrame({
    'id': test_df['id'],
    'POI': poi_pred,
    'street': street_pred,
})
answer_df['POI/street'] = answer_df['POI'] + '/' + answer_df['street']
answer_df = answer_df[['id', 'POI/street']]
answer_df.to_csv('submit08.csv', header=True, index=False)
print('Saving complete!')

Epoch: 1/20
Epoch: 2/20
Epoch: 3/20
Epoch: 4/20
Epoch: 5/20
Epoch: 6/20
Epoch: 7/20
Epoch: 8/20
Epoch: 9/20
Epoch: 10/20
Epoch: 11/20
Epoch: 12/20
Epoch: 13/20
Epoch: 14/20
Epoch: 15/20
Epoch: 16/20
Epoch: 17/20
Epoch: 18/20
Epoch: 19/20
Epoch: 20/20
Training complete!
Time elapsed: 423.37 minutes.

Saving complete!
