In [95]:
import json
import pandas as pd
import numpy as np
import dill as pickle
import scipy.sparse
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from functools import lru_cache
from tqdm import tqdm as tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from scipy import sparse
import time, os, sys

class DataUtil2:
    """"""
    @classmethod
    def load(cls, file, format, date_cols= None):
        """"""
        data = ''
        if(format== 'csv'):
            data = pd.read_csv(file, parse_dates= date_cols)
        elif(format== 'json'):
            with open(file, 'r') as i_file:
                data = json.load(file)
            i_file.close()
        elif(format== 'pkl'):
            with open(file, 'rb') as i_file:
                data = pickle.load(i_file)
            i_file.close()
        elif(format == 'hdf'):
            data = pd.read_hdf(path_or_buf= file, key='undefined')
        elif(format == 'npz'):
            data = scipy.sparse.load_npz(file)

        return  data

    @classmethod
    def save(cls, data, file, format, precision= 8):
        """"""
        if(format == 'csv'):
            data.to_csv(file, float_format= '%%.%df' % precision, index= False)
        elif(format == 'json'):
            with open(file, 'w') as o_file:
                json.dump(data, o_file, ensure_ascii= True, indent= 4)
            o_file.close()
        elif(format == 'pkl'):
            with open(file, 'wb') as o_file:
                pickle.dump(data, o_file, -1)
            o_file.close()
        elif(format== 'hdf'):
            data.to_hdf(path_or_buf= file, key='undefined', mode='w', complib='blosc')
        elif(format == 'npz'):
            scipy.sparse.save_npz(file, data)
        return

In [96]:
def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))

def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.

    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]

    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for i in range(len(new_list) - ngram_range + 1):
            for ngram_value in range(2, ngram_range + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences

In [97]:
# load data
iformat = 'csv'
oformat = 'hdf'
DataBase = '../data'
DataSet = {}
start = time.time()
for mod in ['train', 'test']:
    DataSet[mod] = DataUtil2.load('%s/raw/%s.%s' % (DataBase, mod, iformat), iformat)
    DataSet[mod]['comment_text'] = DataSet[mod]['comment_text'].fillna('nan')
end = time.time()
print('load data done, time elapsed %.2fs' % (end - start))

load data done, time elapsed 1.69s


In [98]:
stemmer = EnglishStemmer()

@lru_cache(30000)
def stem_word(text):
    return stemmer.stem(text)

lemmatizer = WordNetLemmatizer()

@lru_cache(30000)
def lemmatize_word(text):
    return lemmatizer.lemmatize(text)

def reduce_text(conversion, text):
    return " ".join(map(conversion, wordpunct_tokenize(text.lower())))

def reduce_texts(conversion, texts):
    return [reduce_text(conversion, str(text))
            for text in tqdm(texts)]

start = time.time()
for mod in ['train', 'test']:
    DataSet[mod]['comment_text_stemmed'] = reduce_texts(stem_word, DataSet[mod]['comment_text'])
    DataSet[mod]['comment_text_stemmed'] = DataSet[mod]['comment_text_stemmed'].fillna('nan')
    DataSet[mod].drop('comment_text', axis= 1, inplace= True)
end = time.time()
print('stemming done, time elapsed %.2fs' % (end - start))

100%|██████████| 95851/95851 [00:10<00:00, 8877.76it/s]
100%|██████████| 226998/226998 [00:26<00:00, 8672.46it/s]


stemming done, time elapsed 37.09s


In [None]:
def MyTokenizer(data, number_words= 20000):
    corpus = {}
    for i in range(len(data)):
        word_vec = data[i]
        for j in range(len(word_vec)):
            if(word_vec[j] not in corpus):
                corpus[word_vec[j]] = 1
            else:
                corpus[word_vec[j]] += 1
    top_words = sorted([(value, key) for (key,value) in corpus.items()])[-number_words:]
    word_dict = dict([(word, idx) for (idx,word) in enumerate([w[1] for w in top_words])])
    
    def _transform(X):
        for i in range(len(X)):
            word_vec = X[i]
            new_vec = []
            for j in range(len(word_vec)):
                if(word_vec[j] in word_dict):
                    new_vec.append(word_dict[word_vec[j]])
            X[i] = new_vec
        return X
    return _transform

x_train = DataSet['train']['comment_text_stemmed']
y_train = DataSet['train'][targets].values
x_test = DataSet['test']['comment_text_stemmed']
for target in targets:
    DataSet['test'][target] = .0
y_test = DataSet['test'][targets].values

start = time.time()
## bag-of-words
x_train = x_train.str.split(' ').values
x_test = x_test.str.split(' ').values
tk = MyTokenizer(x_train)
x_train = tk(x_train)
x_test = tk(x_test)
end = time.time()
# print(x_train[:5])
# print(x_test[:5])
print('BOW done, time elapsed %.2fs' % (end - start))

## add n-gram into feature space
ngram_range = 1
if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    print('current feture space %s' % max_features)
    # Augmenting x_train and x_test with n-grams features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    print('Average %s sequence length: %s' % (mod, np.mean(list(map(len, x_train)), dtype=int)))
end = time.time()
print('add n-gram done, time elapsed %.2fs' % (end - start))

## building model
from keras.preprocessing import sequence
from keras.models import Model, Input
from keras.layers import Dense, SpatialDropout1D, Dropout
from keras.layers import Embedding, GlobalMaxPool1D, BatchNormalization

targets = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

maxlen = 65
embedding_dims = 64
batch_size = 32
epochs = 4

# padding into a smaller length
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('train shape: ' , x_train.shape)
print('test shape: ' , x_test.shape)

# print(type(y_train))
# sys.exit(1)

print('Build model...')
comment_input = Input((maxlen,))

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
comment_emb = Embedding(max_features, embedding_dims, input_length=maxlen)(comment_input)

# we add a GlobalMaxPool1D, which will extract information from the embeddings
# of all words in the document
comment_emb = SpatialDropout1D(0.25)(comment_emb)
max_emb = GlobalMaxPool1D()(comment_emb)

# normalized dense layer followed by dropout
main = BatchNormalization()(max_emb)
main = Dense(64)(main)
main = Dropout(0.5)(main)

# We project onto a six-unit output layer, and squash it with sigmoids:
output = Dense(6, activation='sigmoid')(main)

model = Model(inputs=comment_input, outputs=output)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
x_valid = x_train[-10:,]
y_valid = y_train[-10:,]
# print(x_valid)
hist = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data= (x_valid, y_valid))

BOW done, time elapsed 13.65s
add n-gram done, time elapsed 13.65s
Pad sequences (samples x time)
train shape:  (95851, 65)
test shape:  (226998, 65)
Build model...


  "This may consume a large amount of memory." % num_elements)


Train on 95851 samples, validate on 10 samples
Epoch 1/4


In [None]:
# label2binary = np.array([
#     [0, 0, 0, 0, 0, 0],
#     [0, 0, 0, 0, 0, 1],
#     [0, 0, 0, 0, 1, 0],
#     [0, 0, 0, 0, 1, 1],
#     [0, 0, 0, 1, 0, 0],
#     [0, 0, 0, 1, 0, 1],
#     [0, 0, 0, 1, 1, 0],
#     [0, 0, 0, 1, 1, 1],
#     [0, 0, 1, 0, 0, 0],
#     [0, 0, 1, 0, 0, 1],
#     [0, 0, 1, 0, 1, 0],
#     [0, 0, 1, 0, 1, 1],
#     [0, 0, 1, 1, 0, 0],
#     [0, 0, 1, 1, 0, 1],
#     [0, 0, 1, 1, 1, 0],
#     [0, 0, 1, 1, 1, 1],
#     [0, 1, 0, 0, 0, 0],
#     [0, 1, 0, 0, 0, 1],
#     [0, 1, 0, 0, 1, 0],
#     [0, 1, 0, 0, 1, 1],
#     [0, 1, 0, 1, 0, 0],
#     [0, 1, 0, 1, 0, 1],
#     [0, 1, 0, 1, 1, 0],
#     [0, 1, 0, 1, 1, 1],
#     [0, 1, 1, 0, 0, 0],
#     [0, 1, 1, 0, 0, 1],
#     [0, 1, 1, 0, 1, 0],
#     [0, 1, 1, 0, 1, 1],
#     [0, 1, 1, 1, 0, 0],
#     [0, 1, 1, 1, 0, 1],
#     [0, 1, 1, 1, 1, 0],
#     [0, 1, 1, 1, 1, 1],
#     [1, 0, 0, 0, 0, 0],
#     [1, 0, 0, 0, 0, 1],
#     [1, 0, 0, 0, 1, 0],
#     [1, 0, 0, 0, 1, 1],
#     [1, 0, 0, 1, 0, 0],
#     [1, 0, 0, 1, 0, 1],
#     [1, 0, 0, 1, 1, 0],
#     [1, 0, 0, 1, 1, 1],
#     [1, 0, 1, 0, 0, 0],
#     [1, 0, 1, 0, 0, 1],
#     [1, 0, 1, 0, 1, 0],
#     [1, 0, 1, 0, 1, 1],
#     [1, 0, 1, 1, 0, 0],
#     [1, 0, 1, 1, 0, 1],
#     [1, 0, 1, 1, 1, 0],
#     [1, 0, 1, 1, 1, 1],
#     [1, 1, 0, 0, 0, 0],
#     [1, 1, 0, 0, 0, 1],
#     [1, 1, 0, 0, 1, 0],
#     [1, 1, 0, 0, 1, 1],
#     [1, 1, 0, 1, 0, 0],
#     [1, 1, 0, 1, 0, 1],
#     [1, 1, 0, 1, 1, 0],
#     [1, 1, 0, 1, 1, 1],
#     [1, 1, 1, 0, 0, 0],
#     [1, 1, 1, 0, 0, 1],
#     [1, 1, 1, 0, 1, 0],
#     [1, 1, 1, 0, 1, 1],
#     [1, 1, 1, 1, 0, 0],
#     [1, 1, 1, 1, 0, 1],
#     [1, 1, 1, 1, 1, 0],
#     [1, 1, 1, 1, 1, 1],
# ])

# def metric(y_true, y_pred):
#     assert y_true.shape == y_pred.shape
#     columns = y_true.shape[1]
#     column_losses = []
#     for i in range(0, columns):
#         column_losses.append(log_loss(y_true[:, i], y_pred[:, i]))
#     return np.array(column_losses).mean()

# def cv(model, X, y, label2binary, n_splits=3):
#     def split(X, y):
#         return StratifiedKFold(n_splits=n_splits).split(X, y)
    
#     def convert_y(y):
#         new_y = np.zeros([len(y)])
#         for i, val in enumerate(label2binary):
#             idx = (y == val).max(axis=1)
#             new_y[idx] = i
#         return new_y
    
#     X = np.array(X)
#     y = np.array(y)
#     scores = []
#     for train, test in tqdm(split(X, convert_y(y)), total=n_splits):
#         fitted_model = model(X[train], y[train])
#         scores.append(metric(y[test], fitted_model(X[test])))
#     return np.array(scores)

In [None]:
# def regression_wordchars(X, y):
#     tfidf_word = TfidfVectorizer(
#         sublinear_tf=True,
#         strip_accents='unicode',
#         analyzer='word',
#         min_df=3, 
#         max_df=0.9,
#         use_idf= 1,
#         smooth_idf= 1,
#         ngram_range=(1,1),
#         max_features=20000
#     )
#     X_tfidf_word = tfidf_word.fit_transform(X[:, 1])
#     tfidf_char = TfidfVectorizer(
#         sublinear_tf=True,
#         strip_accents='unicode',
#         analyzer='char', 
#         ngram_range=(1, 4),
#         max_features=20000,
#         lowercase=False)
#     X_tfidf_char = tfidf_char.fit_transform(X[:, 0])
#     X_tfidf = sparse.hstack([X_tfidf_word, X_tfidf_char])
    
#     columns = y.shape[1]
#     regressions = [
#         LogisticRegression(C= 4).fit(X_tfidf, y[:, i])
#         for i in range(columns)
#     ]
    
#     def _predict(X):
#         X_tfidf_word = tfidf_word.transform(X[:, 1])
#         X_tfidf_char = tfidf_char.transform(X[:, 0])
#         X_tfidf = sparse.hstack([X_tfidf_word, X_tfidf_char])
#         predictions = np.zeros([len(X), columns])
#         for i, regression in enumerate(regressions):
#             regression_prediction = regression.predict_proba(X_tfidf)
#             predictions[:, i] = regression_prediction[:, regression.classes_ == 1][:, 0]
#         return predictions
    
#     return _predict

# ret = cv(regression_wordchars,
#    DataSet['train'][['comment_text', 'comment_text_stemmed']],
#    DataSet['train'][['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
#    label2binary)
# print(ret)

In [None]:
# %%time
# model = regression_wordchars(np.array(DataSet['train'][['comment_text', 'comment_text_stemmed']]),
#                              np.array(DataSet['train'][['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]))

In [None]:
# %%time
# prediction = model(np.array(DataSet['test'][['comment_text', 'comment_text_stemmed']]))

In [None]:
# submission = pd.DataFrame()
# submission['id'] = DataSet['test']['id']
# for i, label in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
#     submission[label] = prediction[:, i]
# # print(submission.tail(100))

In [None]:
# import sys,os,datetime

# strategy = 'lr_tfidf_word_char'
# SubmitOutputDir = '../data/l0'
# if(os.path.exists(SubmitOutputDir) == False):
#     os.makedirs(SubmitOutputDir)
# SubmitFileName = '%s_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
# submission.to_csv('%s/%s.csv' % (SubmitOutputDir, SubmitFileName), index= None)
# print('zip %s/%s.zip %s/%s.csv' % (SubmitOutputDir, SubmitFileName, SubmitOutputDir, SubmitFileName))
# os.system('zip %s/%s.zip %s/%s.csv' % (SubmitOutputDir, SubmitFileName, SubmitOutputDir, SubmitFileName))