In [1]:
import json
import pandas as pd
import numpy as np
import scipy.sparse
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from functools import lru_cache
from tqdm import tqdm as tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from scipy import sparse
import time, os, sys
from keras.preprocessing import sequence
from keras.models import Model, Input
from keras.layers import Dense, SpatialDropout1D, Dropout
from keras.layers import Embedding, GlobalMaxPool1D, BatchNormalization
from keras.preprocessing.text import Tokenizer

class DataUtil2:
    """"""
    @classmethod
    def load(cls, file, format, date_cols= None):
        """"""
        data = ''
        if(format== 'csv'):
            data = pd.read_csv(file, parse_dates= date_cols)
        elif(format== 'json'):
            with open(file, 'r') as i_file:
                data = json.load(file)
            i_file.close()
        elif(format== 'pkl'):
            with open(file, 'rb') as i_file:
                data = pickle.load(i_file)
            i_file.close()
        elif(format == 'hdf'):
            data = pd.read_hdf(path_or_buf= file, key='undefined')
        elif(format == 'npz'):
            data = scipy.sparse.load_npz(file)

        return  data

    @classmethod
    def save(cls, data, file, format, precision= 8):
        """"""
        if(format == 'csv'):
            data.to_csv(file, float_format= '%%.%df' % precision, index= False)
        elif(format == 'json'):
            with open(file, 'w') as o_file:
                json.dump(data, o_file, ensure_ascii= True, indent= 4)
            o_file.close()
        elif(format == 'pkl'):
            with open(file, 'wb') as o_file:
                pickle.dump(data, o_file, -1)
            o_file.close()
        elif(format== 'hdf'):
            data.to_hdf(path_or_buf= file, key='undefined', mode='w', complib='blosc')
        elif(format == 'npz'):
            scipy.sparse.save_npz(file, data)
        return

Using TensorFlow backend.


In [2]:
# load data
iformat = 'csv'
oformat = 'hdf'
DataBase = '../data'
DataSet = {}
start = time.time()
for mod in ['train', 'test']:
    DataSet[mod] = DataUtil2.load('%s/raw/%s.%s' % (DataBase, mod, iformat), iformat)
    DataSet[mod]['comment_text'] = DataSet[mod]['comment_text'].fillna('nan')
end = time.time()
print('load data done, time elapsed %.2fs' % (end - start))

stemmer = EnglishStemmer()

@lru_cache(30000)
def stem_word(text):
    return stemmer.stem(text)

lemmatizer = WordNetLemmatizer()

@lru_cache(30000)
def lemmatize_word(text):
    return lemmatizer.lemmatize(text)

def reduce_text(conversion, text):
    return " ".join(map(conversion, wordpunct_tokenize(text.lower())))

def reduce_texts(conversion, texts):
    return [reduce_text(conversion, str(text))
            for text in tqdm(texts)]

start = time.time()
for mod in ['train', 'test']:
    #DataSet[mod]['comment_text_stemmed'] = reduce_texts(stem_word, DataSet[mod]['comment_text'])
    #DataSet[mod]['comment_text_stemmed'] = DataSet[mod]['comment_text_stemmed'].fillna('nan')
    DataSet[mod]['comment_text_lemma'] = reduce_texts(lemmatize_word, DataSet[mod]['comment_text'])
    DataSet[mod]['comment_text_lemma'] = DataSet[mod]['comment_text_lemma'].fillna('nan') 
end = time.time()
print('stemming done, time elapsed %.2fs' % (end - start))

  0%|          | 0/95851 [00:00<?, ?it/s]

load data done, time elapsed 1.13s


100%|██████████| 95851/95851 [00:04<00:00, 20083.17it/s]
100%|██████████| 226998/226998 [00:09<00:00, 23068.43it/s]


stemming done, time elapsed 14.69s


In [4]:
## standarize BOW
max_features= 50000
maxlen= 100
targets = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
text_col = 'comment_text_lemma'

x_train = DataSet['train'][text_col]
y_train = DataSet['train'][targets].values
x_test = DataSet['test'][text_col]
for target in targets:
    DataSet['test'][target] = .0
y_test = DataSet['test'][targets].values

start = time.time()

print('Tokenizing data...')
tok = Tokenizer(num_words=max_features)
tok.fit_on_texts(list(x_train) + list(x_test))
x_train = tok.texts_to_sequences(x_train)
x_test = tok.texts_to_sequences(x_test)
end = time.time()
print('BOW done, time elapsed %.2fs' % (end - start))

# padding into a smaller length
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('train shape: ' , x_train.shape)
print('test shape: ' , x_test.shape)

Tokenizing data...
BOW done, time elapsed 23.05s
Pad sequences (samples x time)
train shape:  (95851, 100)
test shape:  (226998, 100)


In [5]:
label2binary = np.array([
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 1, 0],
    [0, 0, 0, 0, 1, 1],
    [0, 0, 0, 1, 0, 0],
    [0, 0, 0, 1, 0, 1],
    [0, 0, 0, 1, 1, 0],
    [0, 0, 0, 1, 1, 1],
    [0, 0, 1, 0, 0, 0],
    [0, 0, 1, 0, 0, 1],
    [0, 0, 1, 0, 1, 0],
    [0, 0, 1, 0, 1, 1],
    [0, 0, 1, 1, 0, 0],
    [0, 0, 1, 1, 0, 1],
    [0, 0, 1, 1, 1, 0],
    [0, 0, 1, 1, 1, 1],
    [0, 1, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 1],
    [0, 1, 0, 0, 1, 0],
    [0, 1, 0, 0, 1, 1],
    [0, 1, 0, 1, 0, 0],
    [0, 1, 0, 1, 0, 1],
    [0, 1, 0, 1, 1, 0],
    [0, 1, 0, 1, 1, 1],
    [0, 1, 1, 0, 0, 0],
    [0, 1, 1, 0, 0, 1],
    [0, 1, 1, 0, 1, 0],
    [0, 1, 1, 0, 1, 1],
    [0, 1, 1, 1, 0, 0],
    [0, 1, 1, 1, 0, 1],
    [0, 1, 1, 1, 1, 0],
    [0, 1, 1, 1, 1, 1],
    [1, 0, 0, 0, 0, 0],
    [1, 0, 0, 0, 0, 1],
    [1, 0, 0, 0, 1, 0],
    [1, 0, 0, 0, 1, 1],
    [1, 0, 0, 1, 0, 0],
    [1, 0, 0, 1, 0, 1],
    [1, 0, 0, 1, 1, 0],
    [1, 0, 0, 1, 1, 1],
    [1, 0, 1, 0, 0, 0],
    [1, 0, 1, 0, 0, 1],
    [1, 0, 1, 0, 1, 0],
    [1, 0, 1, 0, 1, 1],
    [1, 0, 1, 1, 0, 0],
    [1, 0, 1, 1, 0, 1],
    [1, 0, 1, 1, 1, 0],
    [1, 0, 1, 1, 1, 1],
    [1, 1, 0, 0, 0, 0],
    [1, 1, 0, 0, 0, 1],
    [1, 1, 0, 0, 1, 0],
    [1, 1, 0, 0, 1, 1],
    [1, 1, 0, 1, 0, 0],
    [1, 1, 0, 1, 0, 1],
    [1, 1, 0, 1, 1, 0],
    [1, 1, 0, 1, 1, 1],
    [1, 1, 1, 0, 0, 0],
    [1, 1, 1, 0, 0, 1],
    [1, 1, 1, 0, 1, 0],
    [1, 1, 1, 0, 1, 1],
    [1, 1, 1, 1, 0, 0],
    [1, 1, 1, 1, 0, 1],
    [1, 1, 1, 1, 1, 0],
    [1, 1, 1, 1, 1, 1],
])

OutputDir = '../data/meta/kfold/'
strategy = 'ft_word_char_tfidf'
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

def metric(y_true, y_pred):
    assert y_true.shape == y_pred.shape
    columns = y_true.shape[1]
    column_losses = []
    for i in range(0, columns):
        column_losses.append(log_loss(y_true[:, i], y_pred[:, i]))
    return np.array(column_losses).mean()

def cv(model, X, y, label2binary, n_splits=3):
    def split(X, y):
        return StratifiedKFold(n_splits=n_splits).split(X, y)
    
    def convert_y(y):
        new_y = np.zeros([len(y)])
        for i, val in enumerate(label2binary):
            idx = (y == val).max(axis=1)
            new_y[idx] = i
        return new_y
    
    X = np.array(X)
    y = np.array(y)
    scores = []
    fold = 0
    for train, test in tqdm(split(X, convert_y(y)), total=n_splits):
        FoldOutput = '%s/%s' % (OutputDir, fold)
        if(os.path.exists(FoldOutput) == False):
            os.makedirs(FoldOutput)
        fitted_model = model(X[train], y[train])
        predict = fitted_model(X[test])
        score = metric(y[test], predict)
        scores.append(score)
        FoldOutputFile = '%s/valid_%s.csv' % (FoldOutput, strategy)
        df = pd.DataFrame(index= range(len(predict)))
        #df.columns = ['%s_%s' % (strategy,t) for t in targets]
        for idx in range(len(targets)):
            df['%s_%s' % (strategy, targets[idx])] = predict[:,idx]
        DataUtil2.save(df, FoldOutputFile, 'csv', 6)
        print('fold %s, cv score %.4f' % (fold, score))
        fold += 1
    return np.array(scores)

In [6]:
# parameters
embedding_dims = 80
batch_size = 32
epochs = 4

def embedding_words(X, y):
    print('Build model...')
    comment_input = Input((maxlen,))

    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    comment_emb = Embedding(max_features, embedding_dims, input_length=maxlen)(comment_input)

    # we add a GlobalMaxPool1D, which will extract information from the embeddings
    # of all words in the document
    comment_emb = SpatialDropout1D(0.25)(comment_emb)
    max_emb = GlobalMaxPool1D()(comment_emb)

    # normalized dense layer followed by dropout
    main = BatchNormalization()(max_emb)
    main = Dense(embedding_dims)(main)
    main = Dropout(0.05)(main)

    # We project onto a six-unit output layer, and squash it with sigmoids:
    output = Dense(6, activation='sigmoid')(main)

    model = Model(inputs=comment_input, outputs=output)

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.fit(X, y, batch_size= batch_size, epochs= epochs)
    
    def _predict(X):
        return model.predict(X)
    
    return _predict

scores = cv(embedding_words, x_train, y_train, label2binary)
print(scores)
print(np.mean(scores))

  0%|          | 0/3 [00:00<?, ?it/s]

Build model...
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


 33%|███▎      | 1/3 [01:00<02:01, 60.67s/it]

fold 0, cv score 0.0509
Build model...
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


 67%|██████▋   | 2/3 [01:59<00:59, 59.95s/it]

fold 1, cv score 0.0512
Build model...
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


100%|██████████| 3/3 [02:59<00:00, 59.90s/it]

fold 2, cv score 0.0522
[ 0.05086079  0.05117356  0.05216046]
0.0513982719331





In [7]:
%%time
model = embedding_words(np.array(x_train), np.array(y_train))

Build model...
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
CPU times: user 3min 28s, sys: 1min 50s, total: 5min 18s
Wall time: 1min 27s


In [8]:
%%time
prediction = model(np.array(x_test))

CPU times: user 17.5 s, sys: 9.71 s, total: 27.2 s
Wall time: 6.52 s


In [9]:
submission = pd.DataFrame()
submission['id'] = DataSet['test']['id']
for i, label in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    submission[label] = prediction[:, i]
# print(submission.tail(100))

In [12]:
import sys,os,datetime

SubmitOutputDir = '../data/meta/submit'
if(os.path.exists(SubmitOutputDir) == False):
    os.makedirs(SubmitOutputDir)
SubmitFileName = '%s_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
submission.to_csv('%s/%s.csv' % (SubmitOutputDir, SubmitFileName), index= None)
print('zip %s/%s.zip %s/%s.csv' % (SubmitOutputDir, SubmitFileName, SubmitOutputDir, SubmitFileName))
os.system('zip -r %s/%s.zip %s/%s.csv' % (SubmitOutputDir, SubmitFileName, SubmitOutputDir, SubmitFileName))

zip ../data/meta/submit/ft_word_char_tfidf_2018-01-18.zip ../data/meta/submit/ft_word_char_tfidf_2018-01-18.csv


0