In [7]:
import json
import pandas as pd
import numpy as np
import dill as pickle
import scipy.sparse
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from functools import lru_cache
from tqdm import tqdm as tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import os,sys,time, datetime
from sklearn.metrics import roc_auc_score

DataBaseDir = '../../data/version2'
InputDir = '%s/l0/kfold' % DataBaseDir
OutputDir = '%s/l1' % DataBaseDir
kfold = 4
strategy = 'nbsvm'
# load data
start = time.time()
valid_dfs = []
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    valid = pd.read_csv('%s/valid.csv' % FoldInputDir).reset_index(drop= True)#.sample(frac= 0.1)
    ## for valid/holdout data set
    if(fold == 0):
        TestData = pd.read_csv('%s/test.csv' % FoldInputDir).reset_index(drop= True)#.sample(frac= 0.1)
    valid['fold'] = fold
    valid_dfs.append(valid)
    print('load data for fold %s done.' % fold)
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True)
end = time.time()
print('load data done, train %s, time elapsed %s' % (len(TrainData), (end - start)))

# pre-process
stemmer = EnglishStemmer()

@lru_cache(30000)
def stem_word(text):
    return stemmer.stem(text)

lemmatizer = WordNetLemmatizer()

@lru_cache(30000)
def lemmatize_word(text):
    return lemmatizer.lemmatize(text)

def reduce_text(conversion, text):
    return " ".join(map(conversion, wordpunct_tokenize(text.lower())))

def reduce_texts(conversion, texts):
    return [reduce_text(conversion, str(text))
            for text in tqdm(texts)]

TrainData['comment_text_stemmed'] = reduce_texts(stem_word, TrainData['comment_text'])
TestData['comment_text_stemmed'] = reduce_texts(stem_word, TestData['comment_text'])

targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

##
def get_model(X, y):
    tfidf_word = TfidfVectorizer(
        strip_accents='unicode',
        analyzer='word',
        sublinear_tf= 1,
        ngram_range=(1,1),
        max_features=20000
    )
    X_tfidf_word = tfidf_word.fit_transform(X[:, 1])
    tfidf_char = TfidfVectorizer(
        strip_accents='unicode',
        analyzer='char', 
        sublinear_tf= 1,
        ngram_range=(1, 4),
        max_features=20000,
        lowercase=False)
    X_tfidf_char = tfidf_char.fit_transform(X[:, 0])
    X_tfidf = sparse.hstack([X_tfidf_word, X_tfidf_char])
    
    def fit(x, y):
        x = x.tocsr()
        p_1 = x[y == 1].sum(0)
        pr_1 = (p_1 + 1) / ((y == 1).sum() + 1)
        p_0 = x[y == 0].sum(0)
        pr_0 = (p_0 + 1) / ((y == 0).sum() + 1)
        r = np.log(pr_1 / pr_0)
        m = LogisticRegression(C= 0.4)
        x_nb = x.multiply(r)
        return m.fit(x_nb, y), r
    
    columns = y.shape[1]
    regressions = [fit(X_tfidf, y[:, i]) for i in range(columns)]
    
    def _predict(X):
        X_tfidf_word = tfidf_word.transform(X[:, 1])
        X_tfidf_char = tfidf_char.transform(X[:, 0])
        X_tfidf = sparse.hstack([X_tfidf_word, X_tfidf_char])
        predictions = np.zeros([len(X), columns])
        for i, (regression, r) in enumerate(regressions):
            predictions[:, i] = regression.predict_proba(X_tfidf.multiply(r))[:, regression.classes_ == 1][:, 0]
        return predictions
    
    return _predict

##
cv_score = .0
start = time.time()
pred_cols = ['%s_%s' % (strategy, c) for c in targets]
for c in pred_cols:
    TestData[c] = .0
for fold in range(kfold):
    print('====== fold %s ======\n' % fold)
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        'valid': TrainData[TrainData['fold'] == fold],
        'test': TestData
    }
    for c in pred_cols:
        FoldData['valid'][c] = .0
        FoldData['test'][c] = .0
    ## construct bi-gru model
    model = get_model(FoldData['train'][['comment_text', 'comment_text_stemmed']].values, 
                      FoldData['train'][targets].values)
    ## predict for valid
    pred_valid = model(FoldData['valid'][['comment_text', 'comment_text_stemmed']].values)
    FoldData['valid'][pred_cols] = pred_valid
    ## predict for test
    pred_test = model(FoldData['test'][['comment_text', 'comment_text_stemmed']].values)
    FoldData['test'][pred_cols] = pred_test
    TestData[pred_cols] += pred_test
    ## evaluate
    score = roc_auc_score(FoldData['valid'][targets], FoldData['valid'][pred_cols])
    cv_score += score
    ## output
    FoldOutputDir = '%s/kfold/%s' % (OutputDir, fold)
    if(os.path.exists(FoldOutputDir) == False):
        os.makedirs(FoldOutputDir)
    for mod in ['valid', 'test']:
        if(mod == 'test'):
            out_cols = ['id']
            out_cols.extend(pred_cols)
        else:
            out_cols = pred_cols.copy()
            out_cols.extend(targets)
        FoldData[mod][out_cols].to_csv('%s/%s_%s.csv' % (FoldOutputDir, mod, strategy),float_format='%.8f', index= False) 
    end = time.time()
    print('fold %s, score %.5f, time elapsed %.2f' % (fold, score, (end - start)))

cv_score /= kfold
TestData[pred_cols] /= kfold
end = time.time()
print('\n================')
print('cv score %.5f,  time elapsed %s' % (cv_score, (end - start)))
print('================')

## submit
sub = TestData[['id']].copy()
sub[targets] = TestData[pred_cols]
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
SubmitDir = '%s/l0/submit' % DataBaseDir
if(os.path.exists(SubmitDir) == False):
    os.makedirs(SubmitDir) 
sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.8f', index=False)
print('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))
os.system('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))

load data for fold 0 done.
load data for fold 1 done.
load data for fold 2 done.


  0%|          | 375/159571 [00:00<00:42, 3743.03it/s]

load data for fold 3 done.
load data done, train 159571, time elapsed 1.6437060832977295


100%|██████████| 159571/159571 [00:16<00:00, 9834.77it/s]
100%|██████████| 153164/153164 [00:16<00:00, 9253.24it/s]





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


fold 0, score 0.98679, time elapsed 17780.82

fold 1, score 0.98650, time elapsed 18658.47

fold 2, score 0.98731, time elapsed 19617.18

fold 3, score 0.98759, time elapsed 20602.23

cv score 0.98705,  time elapsed 20602.246532201767
zip ../../data/version2/l0/submit/nbsvm_submit_2018-03-10.zip ../../data/version2/l0/submit/nbsvm_submit_2018-03-10.csv


0