# XGBoost

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb

from IPython.display import display



In [2]:
import os, time, re, zipfile
import pickle
import gc
from PIL import Image
from shutil import copy2

### Run name

In [3]:
project_name = 'Text_Normalization_Challenge_En'
step_name = 'XGBoost'
date_str = time.strftime("%Y%m%d", time.localtime())
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = project_name + '_' + step_name + '_' + time_str
print('run_name: ' + run_name)

run_name: Text_Normalization_Challenge_En_XGBoost_20171104_155452


### Basic folders

In [4]:
cwd = os.getcwd()
input_path = os.path.join(cwd, 'input')
log_path = os.path.join(cwd, 'log')
model_path = os.path.join(cwd, 'model')
output_path = os.path.join(cwd, 'output')
print('input_path: ' + input_path)
print('log_path: ' + log_path)
print('model_path: ' + model_path)
print('output_path: ' + output_path)

input_path: D:\Kaggle\Text_Normalization_Challenge_English_Language\input
log_path: D:\Kaggle\Text_Normalization_Challenge_English_Language\log
model_path: D:\Kaggle\Text_Normalization_Challenge_English_Language\model
output_path: D:\Kaggle\Text_Normalization_Challenge_English_Language\output


### Unzip

In [5]:
def Unzip(data_path, zip_name):
    extract_name = zip_name[0:-4]
    extract_path = os.path.join(data_path, extract_name)
    zip_path = os.path.join(data_path, zip_name)
    if not (os.path.isdir(extract_path) or os.path.isfile(extract_path)):
        with zipfile.ZipFile(zip_path) as file:
            for name in file.namelist():
                file.extract(name, data_path)

In [6]:
Unzip(input_path, os.path.join(input_path, 'en_sample_submission.csv.zip'))
Unzip(input_path, os.path.join(input_path, 'en_test.csv.zip'))
Unzip(input_path, os.path.join(input_path, 'en_train.csv.zip'))

### Load data

In [14]:
%%time

df_sample_submission = pd.read_csv(os.path.join(input_path, 'en_sample_submission.csv'))
df_test = pd.read_csv(os.path.join(input_path, 'en_test.csv'))
df_train = pd.read_csv(os.path.join(input_path, 'en_train.csv'))

Wall time: 13 s


In [15]:
%%time

print('df_sample_submission amount: %d' %len(df_sample_submission))
display(df_sample_submission.head(2))

print('df_test amount: %d' %len(df_test))
display(df_test.head(2))

print('df_train amount: %d' %len(df_train))
display(df_train.head(20))

df_sample_submission amount: 1088564


Unnamed: 0,id,after
0,0_0,Another
1,0_1,religious


df_test amount: 1088564


Unnamed: 0,sentence_id,token_id,before
0,0,0,Another
1,0,1,religious


df_train amount: 9918441


Unnamed: 0,sentence_id,token_id,class,before,after
0,0,0,PLAIN,Brillantaisia,Brillantaisia
1,0,1,PLAIN,is,is
2,0,2,PLAIN,a,a
3,0,3,PLAIN,genus,genus
4,0,4,PLAIN,of,of
5,0,5,PLAIN,plant,plant
6,0,6,PLAIN,in,in
7,0,7,PLAIN,family,family
8,0,8,PLAIN,Acanthaceae,Acanthaceae
9,0,9,PUNCT,.,.


Wall time: 99 ms


### Convert word to letters

In [56]:
df = df_train

x_data = []
y_data0 =  pd.factorize(df['class'])
labels = y_data0[1]
y_data = y_data0[0]
print(y_data0)
print(type(y_data0))
print(y_data0)
print(labels)
print(y_data)
# display(df_train['class'].head())
# display(type(pd.factorize(df_train['class'])))
# display(pd.factorize(df_train['class']))

(array([0, 0, 0, ..., 9, 0, 1], dtype=int64), Index(['PLAIN', 'PUNCT', 'DATE', 'LETTERS', 'CARDINAL', 'VERBATIM', 'DECIMAL',
       'MEASURE', 'MONEY', 'ORDINAL', 'TIME', 'ELECTRONIC', 'DIGIT',
       'FRACTION', 'TELEPHONE', 'ADDRESS'],
      dtype='object'))
<class 'tuple'>
(array([0, 0, 0, ..., 9, 0, 1], dtype=int64), Index(['PLAIN', 'PUNCT', 'DATE', 'LETTERS', 'CARDINAL', 'VERBATIM', 'DECIMAL',
       'MEASURE', 'MONEY', 'ORDINAL', 'TIME', 'ELECTRONIC', 'DIGIT',
       'FRACTION', 'TELEPHONE', 'ADDRESS'],
      dtype='object'))
Index(['PLAIN', 'PUNCT', 'DATE', 'LETTERS', 'CARDINAL', 'VERBATIM', 'DECIMAL',
       'MEASURE', 'MONEY', 'ORDINAL', 'TIME', 'ELECTRONIC', 'DIGIT',
       'FRACTION', 'TELEPHONE', 'ADDRESS'],
      dtype='object')
[0 0 0 ..., 9 0 1]


In [62]:
%%time

max_num_features = 10
pad_size = 1
boundary_letter = -1
space_letter = 0
# max_data_size = 960000
max_data_size = 12

out_path = output_path
# df = pd.read_csv(r'../input/en_train.csv')
df = df_train

x_data = []
y_data =  pd.factorize(df['class'])
labels = y_data[1]
y_data = y_data[0]
# gc.collect()
display(df['before'].head(5))

for x in df['before'][0:10].values:
    x_row = np.ones(max_num_features, dtype=int) * space_letter
    for xi, i in zip(list(str(x)), np.arange(max_num_features)):
        # Convert word to letters
        x_row[i] = ord(xi)
    x_data.append(x_row)
display(x_data[0:5])
# print(x_data.shape)

0    Brillantaisia
1               is
2                a
3            genus
4               of
Name: before, dtype: object

[array([ 66, 114, 105, 108, 108,  97, 110, 116,  97, 105]),
 array([105, 115,   0,   0,   0,   0,   0,   0,   0,   0]),
 array([97,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 array([103, 101, 110, 117, 115,   0,   0,   0,   0,   0]),
 array([111, 102,   0,   0,   0,   0,   0,   0,   0,   0])]

Wall time: 535 ms


In [66]:
%%time

max_num_features = 15
pad_size = 1
boundary_letter = -1
space_letter = 0
# max_data_size = 960000
max_data_size = 12

out_path = output_path
# df = pd.read_csv(r'../input/en_train.csv')
df = df_train

x_data = []
y_data =  pd.factorize(df['class'])
labels = y_data[1]
y_data = y_data[0]
# gc.collect()
for x in df['before'].values:
    x_row = np.ones(max_num_features, dtype=int) * space_letter
    for xi, i in zip(list(str(x)), np.arange(max_num_features)):
        x_row[i] = ord(xi)
    x_data.append(x_row)

def context_window_transform(data, pad_size):
    pre = np.zeros(max_num_features)
    pre = [pre for x in np.arange(pad_size)]
    data = pre + data + pre
    neo_data = []
    for i in np.arange(len(data) - pad_size * 2):
        row = []
        for x in data[i : i + pad_size * 2 + 1]:
            row.append([boundary_letter])
            row.append(x)
        row.append([boundary_letter])
        neo_data.append([int(x) for y in row for x in y])
    return neo_data

x_data = x_data[:max_data_size]
y_data = y_data[:max_data_size]
x_data = np.array(context_window_transform(x_data, pad_size))
display(x_data)
gc.collect()
x_data = np.array(x_data)
y_data = np.array(y_data)

print('Total number of samples:', len(x_data))
print('Use: ', max_data_size)
#x_data = np.array(x_data)
#y_data = np.array(y_data)

print('x_data sample:')
print(x_data[0])
print('y_data sample:')
print(y_data[0])
print('labels:')
print(labels)

array([[ -1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,  -1,  66, 114, 105, 108, 108,  97, 110, 116,  97,
        105, 115, 105,  97,   0,   0,  -1, 105, 115,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,  -1],
       [ -1,  66, 114, 105, 108, 108,  97, 110, 116,  97, 105, 115, 105,
         97,   0,   0,  -1, 105, 115,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,  -1,  97,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,  -1],
       [ -1, 105, 115,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,  -1,  97,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,  -1, 103, 101, 110, 117, 115,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,  -1],
       [ -1,  97,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,  -1, 103, 101, 110, 117, 115,   0,   0,   0,   0,
          0,   0,   

Total number of samples: 12
Use:  12
x_data sample:
[ -1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  -1  66
 114 105 108 108  97 110 116  97 105 115 105  97   0   0  -1 105 115   0
   0   0   0   0   0   0   0   0   0   0   0   0  -1]
y_data sample:
0
labels:
Index(['PLAIN', 'PUNCT', 'DATE', 'LETTERS', 'CARDINAL', 'VERBATIM', 'DECIMAL',
       'MEASURE', 'MONEY', 'ORDINAL', 'TIME', 'ELECTRONIC', 'DIGIT',
       'FRACTION', 'TELEPHONE', 'ADDRESS'],
      dtype='object')
Wall time: 2min 46s


In [9]:
%%time

x_train = x_data
y_train = y_data
gc.collect()

x_train, x_valid, y_train, y_valid= train_test_split(x_train, y_train, test_size=0.1, random_state=2017)
gc.collect()
num_class = len(labels)
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_valid, label=y_valid)
watchlist = [(dvalid, 'valid'), (dtrain, 'train')]

param = {'objective':'multi:softmax',
         'eta':'0.3', 'max_depth':10,
         'silent':1, 'nthread':-1,
         'num_class':num_class,
         'eval_metric':'merror'}
model = xgb.train(param, 
                  dtrain, 
                  100, 
                  watchlist, 
                  early_stopping_rounds = 50,
                  verbose_eval=10)
gc.collect()

[0]	valid-merror:0.007135	train-merror:0.006369
Multiple eval metrics have been passed: 'train-merror' will be used for early stopping.

Will train until train-merror hasn't improved in 50 rounds.
[10]	valid-merror:0.004708	train-merror:0.003522
[20]	valid-merror:0.004062	train-merror:0.002486
[30]	valid-merror:0.003729	train-merror:0.001762
[40]	valid-merror:0.003615	train-merror:0.001189
[50]	valid-merror:0.003552	train-merror:0.000874
[60]	valid-merror:0.0035	train-merror:0.000569
[70]	valid-merror:0.003531	train-merror:0.000419
[80]	valid-merror:0.003458	train-merror:0.000319
[90]	valid-merror:0.003448	train-merror:0.000248
Wall time: 1h 52min 4s


In [10]:
%%time

pred = model.predict(dvalid)
pred = [labels[int(x)] for x in pred]
y_valid = [labels[x] for x in y_valid]
x_valid = [ [ chr(x) for x in y[2 + max_num_features: 2 + max_num_features * 2]] for y in x_valid]
x_valid = [''.join(x) for x in x_valid]
x_valid = [re.sub('a+$', '', x) for x in x_valid]

gc.collect()

df_pred = pd.DataFrame(columns=['data', 'predict', 'target'])
df_pred['data'] = x_valid
df_pred['predict'] = pred
df_pred['target'] = y_valid
df_pred.to_csv(os.path.join(out_path, 'pred.csv'))

df_erros = df_pred.loc[df_pred['predict'] != df_pred['target']]
df_erros.to_csv(os.path.join(out_path, 'errors.csv'), index=False)

model.save_model(os.path.join(out_path, 'xgb_model'))

Wall time: 1.21 s


In [11]:

print('Done!')

Done!
