In [38]:
import json
import pandas as pd
import numpy as np
import dill as pickle
import scipy.sparse
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from functools import lru_cache
from tqdm import tqdm as tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

class DataUtil2:
    """"""
    @classmethod
    def load(cls, file, format, date_cols= None):
        """"""
        data = ''
        if(format== 'csv'):
            data = pd.read_csv(file, parse_dates= date_cols)
        elif(format== 'json'):
            with open(file, 'r') as i_file:
                data = json.load(file)
            i_file.close()
        elif(format== 'pkl'):
            with open(file, 'rb') as i_file:
                data = pickle.load(i_file)
            i_file.close()
        elif(format == 'hdf'):
            data = pd.read_hdf(path_or_buf= file, key='undefined')
        elif(format == 'npz'):
            data = scipy.sparse.load_npz(file)

        return  data

    @classmethod
    def save(cls, data, file, format, precision= 8):
        """"""
        if(format == 'csv'):
            data.to_csv(file, float_format= '%%.%df' % precision, index= False)
        elif(format == 'json'):
            with open(file, 'w') as o_file:
                json.dump(data, o_file, ensure_ascii= True, indent= 4)
            o_file.close()
        elif(format == 'pkl'):
            with open(file, 'wb') as o_file:
                pickle.dump(data, o_file, -1)
            o_file.close()
        elif(format== 'hdf'):
            data.to_hdf(path_or_buf= file, key='undefined', mode='w', complib='blosc')
        elif(format == 'npz'):
            scipy.sparse.save_npz(file, data)
        return

In [39]:
# load data
iformat = 'csv'
oformat = 'hdf'
DataBase = '../data'
DataSet = {}
for mod in ['train', 'test']:
    DataSet[mod] = DataUtil2.load('%s/raw/%s.%s' % (DataBase, mod, iformat), iformat)
    DataSet[mod]['comment_text'] = DataSet[mod]['comment_text'].fillna('nan')
print('load data done.')

load data done.


In [40]:
stemmer = EnglishStemmer()

@lru_cache(30000)
def stem_word(text):
    return stemmer.stem(text)

lemmatizer = WordNetLemmatizer()

@lru_cache(30000)
def lemmatize_word(text):
    return lemmatizer.lemmatize(text)

def reduce_text(conversion, text):
    return " ".join(map(conversion, wordpunct_tokenize(text.lower())))

def reduce_texts(conversion, texts):
    return [reduce_text(conversion, str(text))
            for text in tqdm(texts)]

for mod in ['train', 'test']:
    DataSet[mod]['comment_text_stemmed'] = reduce_texts(stem_word, DataSet[mod]['comment_text'])
    #DataSet[mod]['comment_text_lemmatized'] = reduce_texts(lemmatize_word, DataSet[mod]['comment_text'])


  0%|          | 0/95851 [00:00<?, ?it/s][A
  0%|          | 316/95851 [00:00<00:30, 3120.56it/s][A
  1%|          | 756/95851 [00:00<00:25, 3756.05it/s][A
  1%|▏         | 1308/95851 [00:00<00:21, 4340.27it/s][A
  2%|▏         | 1828/95851 [00:00<00:27, 3389.45it/s][A
  3%|▎         | 2555/95851 [00:00<00:23, 3996.89it/s][A
  3%|▎         | 3304/95851 [00:00<00:20, 4468.49it/s][A
  4%|▍         | 4082/95851 [00:00<00:18, 4861.34it/s][A
  5%|▌         | 4848/95851 [00:00<00:17, 5158.74it/s][A
  6%|▌         | 5737/95851 [00:01<00:16, 5517.59it/s][A
  7%|▋         | 6583/95851 [00:01<00:15, 5774.40it/s][A
  8%|▊         | 7519/95851 [00:01<00:14, 6062.90it/s][A
  9%|▉         | 8419/95851 [00:01<00:13, 6281.25it/s][A
 10%|▉         | 9307/95851 [00:01<00:13, 6459.22it/s][A
 11%|█         | 10209/95851 [00:01<00:12, 6623.38it/s][A
 12%|█▏        | 11210/95851 [00:01<00:12, 6829.54it/s][A
 13%|█▎        | 12158/95851 [00:01<00:11, 6982.73it/s][A
 14%|█▎        | 13138/95

In [41]:
label2binary = np.array([
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 1, 0],
    [0, 0, 0, 0, 1, 1],
    [0, 0, 0, 1, 0, 0],
    [0, 0, 0, 1, 0, 1],
    [0, 0, 0, 1, 1, 0],
    [0, 0, 0, 1, 1, 1],
    [0, 0, 1, 0, 0, 0],
    [0, 0, 1, 0, 0, 1],
    [0, 0, 1, 0, 1, 0],
    [0, 0, 1, 0, 1, 1],
    [0, 0, 1, 1, 0, 0],
    [0, 0, 1, 1, 0, 1],
    [0, 0, 1, 1, 1, 0],
    [0, 0, 1, 1, 1, 1],
    [0, 1, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 1],
    [0, 1, 0, 0, 1, 0],
    [0, 1, 0, 0, 1, 1],
    [0, 1, 0, 1, 0, 0],
    [0, 1, 0, 1, 0, 1],
    [0, 1, 0, 1, 1, 0],
    [0, 1, 0, 1, 1, 1],
    [0, 1, 1, 0, 0, 0],
    [0, 1, 1, 0, 0, 1],
    [0, 1, 1, 0, 1, 0],
    [0, 1, 1, 0, 1, 1],
    [0, 1, 1, 1, 0, 0],
    [0, 1, 1, 1, 0, 1],
    [0, 1, 1, 1, 1, 0],
    [0, 1, 1, 1, 1, 1],
    [1, 0, 0, 0, 0, 0],
    [1, 0, 0, 0, 0, 1],
    [1, 0, 0, 0, 1, 0],
    [1, 0, 0, 0, 1, 1],
    [1, 0, 0, 1, 0, 0],
    [1, 0, 0, 1, 0, 1],
    [1, 0, 0, 1, 1, 0],
    [1, 0, 0, 1, 1, 1],
    [1, 0, 1, 0, 0, 0],
    [1, 0, 1, 0, 0, 1],
    [1, 0, 1, 0, 1, 0],
    [1, 0, 1, 0, 1, 1],
    [1, 0, 1, 1, 0, 0],
    [1, 0, 1, 1, 0, 1],
    [1, 0, 1, 1, 1, 0],
    [1, 0, 1, 1, 1, 1],
    [1, 1, 0, 0, 0, 0],
    [1, 1, 0, 0, 0, 1],
    [1, 1, 0, 0, 1, 0],
    [1, 1, 0, 0, 1, 1],
    [1, 1, 0, 1, 0, 0],
    [1, 1, 0, 1, 0, 1],
    [1, 1, 0, 1, 1, 0],
    [1, 1, 0, 1, 1, 1],
    [1, 1, 1, 0, 0, 0],
    [1, 1, 1, 0, 0, 1],
    [1, 1, 1, 0, 1, 0],
    [1, 1, 1, 0, 1, 1],
    [1, 1, 1, 1, 0, 0],
    [1, 1, 1, 1, 0, 1],
    [1, 1, 1, 1, 1, 0],
    [1, 1, 1, 1, 1, 1],
])

def metric(y_true, y_pred):
    assert y_true.shape == y_pred.shape
    columns = y_true.shape[1]
    column_losses = []
    for i in range(0, columns):
        column_losses.append(log_loss(y_true[:, i], y_pred[:, i]))
    return np.array(column_losses).mean()

def cv(model, X, y, label2binary, n_splits=3):
    def split(X, y):
        return StratifiedKFold(n_splits=n_splits).split(X, y)
    
    def convert_y(y):
        new_y = np.zeros([len(y)])
        for i, val in enumerate(label2binary):
            idx = (y == val).max(axis=1)
            new_y[idx] = i
        return new_y
    
    X = np.array(X)
    y = np.array(y)
    scores = []
    for train, test in tqdm(split(X, convert_y(y)), total=n_splits):
        fitted_model = model(X[train], y[train])
        scores.append(metric(y[test], fitted_model(X[test])))
    return np.array(scores)

In [42]:
# ## check
# def dummy_model(X, y):
#     def _predict(X):
#         return np.ones([X.shape[0], 6]) * 0.5
    
#     return _predict

# ret = cv(dummy_model,
#    DataSet['train']['comment_text'],
#    DataSet['train'][['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
#    label2binary)

# print(ret)

In [43]:
# def regression_baseline(X, y):
#     tfidf = TfidfVectorizer()
#     X_tfidf = tfidf.fit_transform(X)
#     columns = y.shape[1]
#     regressions = [
#         LogisticRegression().fit(X_tfidf, y[:, i])
#         for i in range(columns)
#     ]
    
#     def _predict(X):
#         X_tfidf = tfidf.transform(X)
#         predictions = np.zeros([len(X), columns])
#         for i, regression in enumerate(regressions):
#             regression_prediction = regression.predict_proba(X_tfidf)
#             predictions[:, i] = regression_prediction[:, regression.classes_ == 1][:, 0]
#         return predictions
    
#     return _predict

# ret = cv(regression_baseline,
#    DataSet['train']['comment_text'],
#    DataSet['train'][['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
#    label2binary)
# print(ret)

# ret = cv(regression_baseline,
#    DataSet['train']['comment_text_stemmed'],
#    DataSet['train'][['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
#    label2binary)
# print(ret)

In [44]:
def regression_wordchars(X, y):
    tfidf_word = TfidfVectorizer()
    X_tfidf_word = tfidf_word.fit_transform(X[:, 1])
    tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(1, 2), lowercase=False)
    X_tfidf_char = tfidf_char.fit_transform(X[:, 0])
    X_tfidf = sparse.hstack([X_tfidf_word, X_tfidf_char])
    
    columns = y.shape[1]
    regressions = [
        LogisticRegression().fit(X_tfidf, y[:, i])
        for i in range(columns)
    ]
    
    def _predict(X):
        X_tfidf_word = tfidf_word.transform(X[:, 1])
        X_tfidf_char = tfidf_char.transform(X[:, 0])
        X_tfidf = sparse.hstack([X_tfidf_word, X_tfidf_char])
        predictions = np.zeros([len(X), columns])
        for i, regression in enumerate(regressions):
            regression_prediction = regression.predict_proba(X_tfidf)
            predictions[:, i] = regression_prediction[:, regression.classes_ == 1][:, 0]
        return predictions
    
    return _predict

ret = cv(regression_wordchars,
   DataSet['train'][['comment_text', 'comment_text_stemmed']],
   DataSet['train'][['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
   label2binary)
print(ret)

100%|██████████| 3/3 [03:18<00:00, 66.23s/it]

[ 0.05256475  0.05343051  0.0537346 ]





In [46]:
%%time
model = regression_wordchars(np.array(DataSet['train'][['comment_text', 'comment_text_stemmed']]),
                             np.array(DataSet['train'][['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]))

CPU times: user 1min 18s, sys: 2.41 s, total: 1min 21s
Wall time: 1min 21s


In [47]:
%%time
prediction = model(np.array(DataSet['test'][['comment_text', 'comment_text_stemmed']]))

CPU times: user 1min 54s, sys: 4.44 s, total: 1min 58s
Wall time: 1min 58s


In [50]:
submission = pd.DataFrame()
submission['id'] = DataSet['test']['id']
for i, label in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    submission[label] = prediction[:, i]
# print(submission.tail(100))

                  id     toxic  severe_toxic   obscene    threat    insult  \
226898  999544330298  0.032443      0.001476  0.009973  0.001276  0.020829   
226899  999551526659  0.051080      0.006610  0.015649  0.001585  0.006678   
226900  999557088193  0.023915      0.003601  0.008847  0.000857  0.008722   
226901  999560902532  0.010306      0.003172  0.003391  0.001992  0.001695   
226902  999567329075  0.019577      0.000853  0.009575  0.000454  0.004032   
226903  999568036865  0.008346      0.001488  0.006877  0.002715  0.005354   
226904  999569805515  0.035748      0.007173  0.028661  0.001075  0.023376   
226905  999570016962  0.046290      0.003724  0.010511  0.006591  0.009025   
226906  999574161238  0.016588      0.001744  0.009710  0.000879  0.011044   
226907  999574286534  0.149694      0.038070  0.042902  0.002105  0.025391   
226908  999576983525  0.559071      0.038655  0.151538  0.023705  0.202261   
226909  999580611238  0.019359      0.001929  0.008800  0.001120

In [56]:
import sys,os,datetime

strategy = 'lr_tfidf_word_char'
SubmitOutputDir = '../data/l0'
if(os.path.exists(SubmitOutputDir) == False):
    os.makedirs(SubmitOutputDir)
SubmitFileName = '%s_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
submission.to_csv('%s.csv' % SubmitFileName, index= None)
print('zip %s/%s.zip %s/%s.csv' % (SubmitOutputDir, SubmitFileName, SubmitOutputDir, SubmitFileName))
os.system('zip %s/%s.zip %s/%s.csv' % (SubmitOutputDir, SubmitFileName, SubmitOutputDir, SubmitFileName))

zip ../data/l0/lr_tfidf_word_char_2018-01-08.zip ../data/l0/lr_tfidf_word_char_2018-01-08.csv


0