In [1]:
import numpy as np
import pandas as pd
from contextlib import contextmanager
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import time
import re
import string
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import gc
from collections import defaultdict
import os
import psutil
import lightgbm

# Contraction replacement patterns, applied in order to lower-cased byte strings.
# Specific irregular forms (won't, can't, i'm, ain't) come first so the generic
# suffix rules below do not split them incorrectly.
# Raw byte literals are used because \w and \g<1> are regex syntax, not Python
# string escapes; in non-raw literals they are invalid escape sequences.
cont_patterns = [
    (rb"(W|w)on't", rb'will not'),
    (rb"(C|c)an't", rb'can not'),
    (rb"(I|i)'m", rb'i am'),
    (rb"(A|a)in't", rb'is not'),
    (rb"(\w+)'ll", rb'\g<1> will'),
    (rb"(\w+)n't", rb'\g<1> not'),
    (rb"(\w+)'ve", rb'\g<1> have'),
    (rb"(\w+)'s", rb'\g<1> is'),
    (rb"(\w+)'re", rb'\g<1> are'),
    (rb"(\w+)'d", rb'\g<1> would'),
]
# Compile once; each entry is (compiled bytes regex, bytes replacement template).
patterns = [(re.compile(regex), repl) for (regex, repl) in cont_patterns]

@contextmanager
def timer(name):
    """
    Context manager that prints how long the wrapped block took, in whole seconds.

    Taken from Konstantin Lopuhin https://www.kaggle.com/lopuhin
    in script named : Mercari Golf: 0.3875 CV in 75 LOC, 1900 s
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s

    :param name: label printed between square brackets in the report line
    """
    started_at = time.time()
    yield
    # Only reached when the wrapped block finishes without raising.
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

# Regexes used by prepare_for_char_n_gram, compiled once instead of on every call.
# They are bytes patterns, so \d and \s only match ASCII digits / whitespace.
_PUNCTUATION_RE = re.compile(b'[%s]' % re.escape(bytes(string.punctuation, encoding='utf-8')))
_DIGITS_RE = re.compile(rb"\d+")
_WHITESPACE_RE = re.compile(rb"\s+")


def prepare_for_char_n_gram(text):
    """
    Clean a raw comment and wrap every remaining word in # signs.

    e.g. "My name is Bond" becomes "#my# #name# #is# #bond#".

    Steps: lower-case, replace \\n \\t \\b \\r by spaces, expand English
    contractions using the module-level ``patterns`` list, drop ASCII
    punctuation, drop digits, collapse whitespace, then add the # markers.

    :param text: comment as a str
    :return: cleaned comment as a str; "##" when nothing is left after cleaning
    """
    # 1. Go to lower case (only good for english)
    # Work on byte strings, as the original author had issues removing all \n in r""
    clean = bytes(text.lower(), encoding="utf-8")
    # 2. Replace control characters by spaces (b"\b" is the backspace byte here)
    for control_char in (b"\n", b"\t", b"\b", b"\r"):
        clean = clean.replace(control_char, b" ")
    # 3. Replace english contractions
    for (pattern, repl) in patterns:
        clean = re.sub(pattern, repl, clean)
    # 4. Drop punctuation, token by token
    clean = b" ".join(_PUNCTUATION_RE.sub(b'', token) for token in clean.split())
    # 5. Drop numbers
    clean = _DIGITS_RE.sub(b" ", clean)
    # 6. Collapse runs of whitespace and strip BOTH ends. Tokens made only of
    # punctuation or digits leave a space behind; a leading one used to survive
    # and produced a spurious empty "##" token at the start of the result.
    clean = _WHITESPACE_RE.sub(b" ", clean).strip()
    # 7. Surround every word with # signs
    # e.g. my name is bond would become #my# #name# #is# #bond#
    clean = clean.replace(b" ", b"# #")
    clean = b"#" + clean + b"#"  # add leading and trailing #

    return str(clean, 'utf-8')

def count_regexp_occ(regexp="", text=None):
    """Count the non-overlapping matches of `regexp` found in `text`."""
    return sum(1 for _match in re.finditer(regexp, text))

def get_indicators_and_clean_comments(df):
    """
    Add hand-crafted indicator columns and a cleaned comment column to `df`.

    Reads df["comment_text"] and writes the new columns into `df` in place;
    nothing is returned. The original author notes that not every indicator
    is known to improve the score.

    :param df: DataFrame with a "comment_text" column of strings
    """
    raw = df["comment_text"]

    def occurrences(regexp):
        # Per-comment number of matches of `regexp` in the raw text.
        return raw.apply(lambda comment: count_regexp_occ(regexp, comment))

    # Count number of \n
    df["ant_slash_n"] = occurrences(r"\n")
    # Length of the raw comment in words and in characters
    df["raw_word_len"] = raw.apply(lambda comment: len(comment.split()))
    df["raw_char_len"] = raw.apply(len)

    # (column name, regex) pairs, in the order the columns are created.
    regex_indicators = [
        # Upper-case letters: angry comments may be written in upper case
        ("nb_upper", r"[A-Z]"),
        # F words - the f..k shape also matches folk, fork, ...
        ("nb_fk", r"[Ff]\S{2}[Kk]"),
        # S words
        ("nb_sk", r"[Ss]\S{2}[Kk]"),
        # D words
        ("nb_dk", r"[dD]ick"),
        # "you": insulting someone usually needs someone called you
        ("nb_you", r"\W[Yy]ou\W"),
        # References to "mother"
        ("nb_mother", r"\Wmother\W"),
        # Toxic 19th century vocabulary
        ("nb_ng", r"\Wnigger\W"),
        # Some sentences start with one or more ':'
        ("start_with_columns", r"^\:+"),
        # Time stamp.
        # NOTE(review): as written this matches any two digits OR ':dd';
        # possibly r"\d{2}:\d{2}" was intended - confirm before changing,
        # since it alters the feature values.
        ("has_timestamp", r"\d{2}|:\d{2}"),
        # Long dates such as 18:44, 8 December 2010
        ("has_date_long", r"\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}"),
        # Short dates such as 8 December 2010
        ("has_date_short", r"\D\d{1,2} \w+ \d{4}"),
        # http / https links
        ("has_http", r"http[s]{0,1}://\S+"),
        # e-mail addresses
        ("has_mail", r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'),
        # Words surrounded by == word == or """" word """"
        ("has_emphasize_equal", r"\={2}.+\={2}"),
        ("has_emphasize_quotes", r"\"{4}\S+\"{4}"),
    ]
    for column, regexp in regex_indicators:
        df[column] = occurrences(regexp)

    # Now clean comments
    df["clean_comment"] = raw.apply(prepare_for_char_n_gram)
    cleaned = df["clean_comment"]

    # Length of the cleaned comment in words and in characters
    df["clean_word_len"] = cleaned.apply(lambda comment: len(comment.split()))
    df["clean_char_len"] = cleaned.apply(len)
    # Number of distinct characters: a comment made only of the f word
    # needs very few different letters
    df["clean_chars"] = cleaned.apply(lambda comment: len(set(comment)))
    # Distinct characters relative to the length, with the length capped at 99
    capped_len = cleaned.apply(lambda comment: 1 + min(99, len(comment)))
    df["clean_chars_ratio"] = df["clean_chars"] / capped_len

def char_analyzer(text):
    """
    Split a string into the 3-character windows of each of its words.

    Words are obtained with str.split(); windows never span two words, and
    words shorter than 3 characters contribute nothing. This way <talk> and
    <talking> have <tal> and <alk> in common.
    (Idea seen by the original author in an article whose link was lost.)
    """
    trigrams = []
    for word in text.split():
        for start in range(len(word) - 2):
            trigrams.append(word[start: start + 3])
    return trigrams
    
# Input folds are read from <base>/l0/kfold/<fold>; outputs go under <base>/l1.
DataBaseDir = '../../data/version2'
InputDir = '%s/l0/kfold' % DataBaseDir
OutputDir = '%s/l1' % DataBaseDir
kfold = 4
strategy = 'lgb'

## load data
valid_dfs = []
with timer('load data'):
    for fold in range(kfold):
        FoldInputDir = '%s/%s' % (InputDir, fold)
        # The test set is read once, from the first fold's directory.
        if fold == 0:
            TestData = pd.read_csv('%s/test.csv' % FoldInputDir).reset_index(drop=True)
        # Each fold's validation part is tagged with its fold number.
        valid = pd.read_csv('%s/valid.csv' % FoldInputDir).reset_index(drop=True)
        valid['fold'] = fold
        valid_dfs.append(valid)
        print('load data for fold %s done.' % fold)
    # The training set is the concatenation of the validation parts of all folds.
    TrainData = pd.concat(valid_dfs, axis=0, ignore_index=True)
    print('load data done, train %s, test %s' % (len(TrainData), len(TestData)))

## pre-preprocessing
# Adds the indicator columns and "clean_comment" to both frames in place.
with timer("pre-processing"):
    for frame in (TrainData, TestData):
        get_indicators_and_clean_comments(frame)
    

load data for fold 0 done.
load data for fold 1 done.
load data for fold 2 done.
load data for fold 3 done.
load data done, train 159571, test 153164
[load data] done in 1 s
[pre-processing] done in 108 s


In [2]:
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# LightGBM parameters shared by the per-target binary classifiers
params = {
        "objective": "binary",
        'metric': {'auc'},
        "boosting_type": "gbdt",
        "verbosity": -1,
        "num_threads": 4,
        "bagging_fraction": 0.8,
        "feature_fraction": 0.8,
        "learning_rate": 0.1,
        "num_leaves": 31,
        "verbose": -1,
        "min_split_gain": .1,
        "reg_alpha": .1
}

start = time.time()
cv_score = .0
# One prediction column per target, e.g. "lgb_toxic"
pred_cols = ['%s_%s' % (strategy, c) for c in targets]
# Reset the accumulators so that re-running this cell does not add to old predictions
for c in pred_cols:
    TestData[c] = .0
for fold in range(kfold):
    print('====== fold %s ======\n' % fold)
    # Rows tagged with the current fold are held out; the others are used for training
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold].copy(),
        'valid': TrainData[TrainData['fold'] == fold].copy(),
        'test': TestData.copy()
    }
    for c in pred_cols:
        FoldData['valid'][c] = .0
        FoldData['test'][c] = .0
    # Scaling numerical features with MinMaxScaler though tree boosters don't need that
    with timer("Creating numerical features"):
        num_features = [f_ for f_ in FoldData['train'].columns
                        if f_ not in ['fold', "comment_text", "clean_comment", "id",
                                      "remaining_chars", 'has_ip_address'] + targets]
        skl = MinMaxScaler()
        # The scaler is fitted on train + valid + test together
        entire_num_features = pd.concat([FoldData['train'][num_features], FoldData['valid'][num_features], FoldData['test'][num_features]])
        skl = skl.fit(entire_num_features)
        train_num_features = csr_matrix(skl.transform(FoldData['train'][num_features]))
        valid_num_features = csr_matrix(skl.transform(FoldData['valid'][num_features]))
        test_num_features = csr_matrix(skl.transform(FoldData['test'][num_features]))

    # Get TF-IDF features
    # NOTE(review): EntireCorpus holds the same documents in every fold (all
    # train + valid + test rows, only the order differs), so the vectorizers
    # could presumably be fitted once outside the fold loop - confirm and
    # measure memory before hoisting.
    EntireCorpus = pd.concat([FoldData['train']['clean_comment'], FoldData['valid']['clean_comment'], FoldData['test']['clean_comment']])

    # First on real words
    with timer("Tfidf on word"):
        word_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='word',
            token_pattern=r'\w{1,}',
            stop_words='english',
            ngram_range=(1, 2),
            max_features=20000)
        word_vectorizer.fit(EntireCorpus)
        train_word_features = word_vectorizer.transform(FoldData['train']['clean_comment'])
        valid_word_features = word_vectorizer.transform(FoldData['valid']['clean_comment'])
        test_word_features = word_vectorizer.transform(FoldData['test']['clean_comment'])

    del word_vectorizer
    gc.collect()

    # Now use the char_analyzer to get another TFIDF
    # Char level TFIDF would go through words when char analyzer only considers
    # characters inside a word
    with timer("Tfidf on char n_gram"):
        char_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            tokenizer=char_analyzer,
            analyzer='word',
            ngram_range=(1, 1),
            max_features=50000)
        char_vectorizer.fit(EntireCorpus)
        train_char_features = char_vectorizer.transform(FoldData['train']['clean_comment'])
        valid_char_features = char_vectorizer.transform(FoldData['valid']['clean_comment'])
        test_char_features = char_vectorizer.transform(FoldData['test']['clean_comment'])

    del char_vectorizer, EntireCorpus
    gc.collect()

    # Largest number of non-zero char features found in a single training row
    print((train_char_features > 0).sum(axis=1).max())

    # Now stack TF IDF matrices; column order is char, word, numerical for all three sets
    with timer("Staking matrices"):
        csr_trn = hstack(
            [
                train_char_features,
                train_word_features,
                train_num_features
            ]
        ).tocsr()
        del train_word_features
        del train_num_features
        del train_char_features
        gc.collect()

        csr_valid = hstack(
            [
                valid_char_features,
                valid_word_features,
                valid_num_features
            ]
        ).tocsr()
        del valid_word_features
        del valid_num_features
        del valid_char_features
        gc.collect()

        csr_test = hstack(
            [
                test_char_features,
                test_word_features,
                test_num_features
            ]
        ).tocsr()
        del test_word_features
        del test_num_features
        del test_char_features
        gc.collect()

    with timer("train lgb"):
        # One binary model per target, all trained on the same feature matrix
        for target in targets:
            lgb_train = lightgbm.Dataset(csr_trn,
                                    label= FoldData['train'][target].values,
                                    silent= True,
                                    free_raw_data= True)
            lgb_valid = lightgbm.Dataset(csr_valid,
                                    label= FoldData['valid'][target].values,
                                    silent= True,
                                    free_raw_data= True)
            model = lightgbm.train(params= params,
                                   train_set= lgb_train,
                                   valid_sets= [lgb_train, lgb_valid],
                                   early_stopping_rounds= 50,
                                   num_boost_round= 100,
                                   verbose_eval=0)
            # Predict with the early-stopped iteration explicitly: depending on the
            # LightGBM release, predict() may otherwise use every trained round,
            # including the rounds trained after the best validation score.
            # best_iteration <= 0 (no early stop) means "use all rounds".
            best_iteration = model.best_iteration
            ## predict for valid
            pred_col = '%s_%s' % (strategy, target)
            pred_valid = model.predict(csr_valid, num_iteration= best_iteration)
            FoldData['valid'][pred_col] = pred_valid
            ## predict for test
            pred_test = model.predict(csr_test, num_iteration= best_iteration)
            FoldData['test'][pred_col] = pred_test
            # Accumulate; divided by kfold after the loop to get the fold average
            TestData[pred_col] += pred_test
    ## evaluate
    with timer('evaluation'):
        # AUC over the six target columns of the held-out fold
        score = roc_auc_score(FoldData['valid'][targets], FoldData['valid'][pred_cols])
        cv_score += score
        ## output
        FoldOutputDir = '%s/kfold/%s' % (OutputDir, fold)
        # exist_ok avoids the separate exists() check and its race window
        os.makedirs(FoldOutputDir, exist_ok=True)
        for mod in ['valid', 'test']:
            if(mod == 'test'):
                # Test output: id + predictions
                out_cols = ['id']
                out_cols.extend(pred_cols)
            else:
                # Validation output: predictions + true labels
                out_cols = pred_cols.copy()
                out_cols.extend(targets)
            FoldData[mod][out_cols].to_csv('%s/%s_%s.csv' % (FoldOutputDir, mod, strategy),float_format='%.8f', index= False)
        print('fold %s, score %.5f' % (fold, score))

# Average the per-fold score and the accumulated test predictions
cv_score /= kfold
TestData[pred_cols] /= kfold
end = time.time()
print('\n================')
print('cv score %.5f,  time elapsed %s' % (cv_score, (end - start)))
print('================')


[Creating numerical features] done in 1 s
[Tfidf on word] done in 66 s
[Tfidf on char n_gram] done in 70 s
1336
[Staking matrices] done in 6 s
[train lgb] done in 255 s
fold 0, score 0.98147
[evaluation] done in 1 s

[Creating numerical features] done in 0 s
[Tfidf on word] done in 70 s
[Tfidf on char n_gram] done in 73 s
1336
[Staking matrices] done in 7 s
[train lgb] done in 261 s
fold 1, score 0.98254
[evaluation] done in 1 s

[Creating numerical features] done in 0 s
[Tfidf on word] done in 61 s
[Tfidf on char n_gram] done in 71 s
1336
[Staking matrices] done in 5 s
[train lgb] done in 258 s
fold 2, score 0.98084
[evaluation] done in 1 s

[Creating numerical features] done in 0 s
[Tfidf on word] done in 57 s
[Tfidf on char n_gram] done in 68 s
1296
[Staking matrices] done in 4 s
[train lgb] done in 236 s
fold 3, score 0.98279
[evaluation] done in 1 s



NameError: name 'start' is not defined

In [3]:
import datetime
## submit
# Submission frame: the id column plus one column per target, filled from the
# fold-averaged test predictions (pred_cols and targets are in the same order).
sub = TestData[['id']].copy()
sub[targets] = TestData[pred_cols]
# File name carries the strategy and today's date, e.g. lgb_submit_2018-03-10
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
SubmitDir = '%s/l0/submit' % DataBaseDir
# exist_ok avoids the separate exists() check and its race window
os.makedirs(SubmitDir, exist_ok=True)
sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.8f', index=False)
# Build the shell command once so the printed and the executed command cannot drift apart.
# Requires the external `zip` program; os.system returns its exit status.
zip_cmd = 'zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName)
print(zip_cmd)
os.system(zip_cmd)

zip ../../data/version2/l0/submit/lgb_submit_2018-03-10.zip ../../data/version2/l0/submit/lgb_submit_2018-03-10.csv


0