In [5]:
import pandas as pd
import numpy as np
import sys,os,time
from contextlib import contextmanager
import re
import string

@contextmanager
def timer(name):
    """Context manager that prints how long the wrapped block took.

    Adapted from Konstantin Lopuhin (https://www.kaggle.com/lopuhin),
    script "Mercari Golf: 0.3875 CV in 75 LOC, 1900 s":
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s

    ``name`` is the label shown between square brackets in the printed
    line. The elapsed wall-clock time is rounded to whole seconds. The
    message is only printed when the block finishes without raising.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

def prepare_for_char_n_gram(text, contraction_patterns=None):
    """Normalise a comment into ``#``-delimited tokens for char n-grams.

    The text is lower-cased and processed as UTF-8 bytes:

    1. newline, tab, backspace and carriage return become spaces;
    2. every ``(pattern, replacement)`` pair of ``contraction_patterns`` is
       applied with ``re.sub``;
    3. ASCII punctuation (``string.punctuation``) is deleted;
    4. each run of digits is replaced by a space;
    5. whitespace is collapsed and stripped on BOTH ends (a leading space,
       e.g. left behind by a leading number or punctuation-only token, used
       to survive and produce a spurious empty ``##`` token);
    6. every remaining token is wrapped in ``#`` signs, e.g.
       ``my name is bond`` -> ``#my# #name# #is# #bond#``.

    Args:
        text: the comment, as ``str``.
        contraction_patterns: optional iterable of ``(pattern, replacement)``
            pairs. They are applied to bytes, so both members must be bytes
            patterns/replacements. When omitted, the module-level
            ``patterns`` is used (NOTE(review): ``patterns`` is not defined
            in the visible part of this file - confirm it exists before
            relying on the default).

    Returns:
        The cleaned text as ``str``; ``"##"`` when no token is left.
    """
    if contraction_patterns is None:
        contraction_patterns = patterns

    # 1. Lower case (only sensible for English) and switch to bytes.
    clean = text.lower().encode("utf-8")

    # 2. Control characters become plain spaces. b"\b" is the backspace byte.
    for control in (b"\n", b"\t", b"\b", b"\r"):
        clean = clean.replace(control, b" ")

    # 3. Replace English contractions (before apostrophes are dropped).
    for pattern, repl in contraction_patterns:
        clean = re.sub(pattern, repl, clean)

    # 4. Drop punctuation. Only ASCII bytes are deleted, so multi-byte UTF-8
    #    sequences stay intact.
    clean = clean.translate(None, string.punctuation.encode("ascii"))

    # 5. Drop numbers.
    clean = re.sub(rb"\d+", b" ", clean)

    # 6. split() discards leading/trailing whitespace and collapses runs.
    tokens = clean.split()

    # 7. Surround every word with '#' signs.
    return (b"#" + b"# #".join(tokens) + b"#").decode("utf-8")

def count_regexp_occ(regexp="", text=None):
    """Return the number of non-overlapping matches of `regexp` in `text`."""
    matches = re.finditer(regexp, text)
    return sum(1 for _ in matches)

def get_num_feats(df):
    """Add hand-crafted numeric features derived from ``df["comment_text"]``.

    All kinds of surface statistics are computed as they may help find toxic
    comments (not all of them are known to improve scores).

    Args:
        df: DataFrame with a ``comment_text`` column holding ``str`` values.
            It is modified IN PLACE: one new column per feature is appended,
            in a fixed order.

    Returns:
        The same ``df`` object, with the feature columns added.
    """
    text = df["comment_text"]

    def count_matches(regexp):
        """Per comment, the number of non-overlapping matches of regexp."""
        compiled = re.compile(regexp)
        return text.apply(lambda comment: len(compiled.findall(comment)))

    def count_substrings(needles):
        """Per comment, the summed occurrence count of every needle."""
        return text.apply(
            lambda comment: sum(comment.count(needle) for needle in needles)
        )

    # Length in words and characters. The +1 keeps the ratios below from
    # dividing by zero on empty comments.
    df["raw_word_len"] = text.apply(lambda comment: len(comment.split())) + 1
    df["raw_char_len"] = text.apply(len) + 1
    word_len = df["raw_word_len"]
    char_len = df["raw_char_len"]

    # (column, regex, denominator): each entry creates `column` with the
    # match count, immediately followed by `column + "_ratio"`.
    counted_with_ratio = [
        # Number of \n
        ("ant_slash_n", r"\n", char_len),
        # Upper case letters: if you're angry you may write in upper case
        ("nb_upper", r"[A-Z]", char_len),
        # F words - f..k also matches folk, fork, ...
        ("nb_fk", r"[Ff]\S{2}[Kk]", word_len),
        # S words
        ("nb_sk", r"[Ss]\S{2}[Kk]", word_len),
        # D words
        ("nb_dk", r"[dD]ick", word_len),
        # "you": insulting someone usually needs someone called you.
        # The \W on both sides means an occurrence at the very start or end
        # of the comment is not counted (same for the two patterns below).
        ("nb_you", r"\W[Yy]ou\W", word_len),
        # Just to check you really referred to my mother
        ("nb_mother", r"\Wmother\W", word_len),
        # Toxic 19th century vocabulary
        ("nb_ng", r"\Wnigger\W", word_len),
    ]
    for column, regexp, denominator in counted_with_ratio:
        df[column] = count_matches(regexp)
        df[column + "_ratio"] = df[column] / denominator

    # Some comments start with colons. "^" is used without re.MULTILINE, so
    # this is 0 or 1 per comment.
    df["start_with_columns"] = count_matches(r"^\:+")
    df["start_with_columns_ratio"] = df["start_with_columns"] / (1 + df["ant_slash_n"])

    # Punctuation counts
    df["num_exclamation_marks"] = text.apply(lambda comment: comment.count("!"))
    df["num_question_marks"] = text.apply(lambda comment: comment.count("?"))
    df["num_punctuation"] = count_substrings(".,;:")
    # Only '*', '#' and '$'. The previous literal '*,#,$' was iterated
    # character by character, which added every comma to this count twice.
    df["imcomplete_punctuation"] = count_substrings("*#$")
    df["question_mask_ratio"] = df["num_question_marks"] / char_len
    df["exclamation_mark_ratio"] = df["num_exclamation_marks"] / char_len
    df["punctuation_ratio"] = df["num_punctuation"] / char_len
    df["imcomplete_punctuation_ratio"] = df["imcomplete_punctuation"] / char_len

    # Vocabulary richness and smilies
    df["num_unique_words"] = text.apply(lambda comment: len(set(comment.split())))
    df["words_vs_unique"] = df["num_unique_words"] / word_len
    df["num_smilies"] = count_substrings((":-)", ":)", ";-)", ";)"))
    df["similes_ratio"] = df["num_smilies"] / word_len

    # Characters belonging to string.punctuation (set built once)
    punctuation_chars = set(string.punctuation)
    df["count_standard_punctuations"] = text.apply(
        lambda comment: sum(char in punctuation_chars for char in comment)
    )
    df["standard_punctuations_ratio"] = df["count_standard_punctuations"] / char_len
    # Whitespace-separated tokens for which str.istitle() is true
    df["count_words_title"] = text.apply(
        lambda comment: sum(word.istitle() for word in comment.split())
    )
    df["words_title_ratio"] = df["count_words_title"] / word_len

    df["unique_words_greater_200"] = (df["num_unique_words"] > 200).astype(int)

    # Time stamps such as 18:44. The previous pattern r"\d{2}|:\d{2}" was an
    # alternation that counted every pair of digits anywhere in the comment.
    df["has_timestamp"] = count_matches(r"\d{2}:\d{2}")
    # Long dates: 18:44, 8 December 2010
    df["has_date_long"] = count_matches(r"\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}")
    # Short dates: 8 December 2010
    df["has_date_short"] = count_matches(r"\D\d{1,2} \w+ \d{4}")
    # http / https links
    df["has_http"] = count_matches(r"http[s]{0,1}://\S+")
    # e-mail addresses
    df["has_mail"] = count_matches(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
    # Words surrounded by == word == or """" word """"
    df["has_emphasize_equal"] = count_matches(r"\={2}.+\={2}")
    df["has_emphasize_quotes"] = count_matches(r"\"{4}\S+\"{4}")

    return df
  
# Label columns; they are kept (after the feature columns) in the 'valid'
# output and left out of the 'test' output.
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
kfold = 4
DataBaseDir = '../../data/version2'
InputDir = f'{DataBaseDir}/l0'
OutputDir = f'{DataBaseDir}/num'
# Columns that are never written out as features (set built once for lookups).
non_feature_cols = {'comment_text', *targets}
for fold in range(kfold):
    FoldInputDir = f'{InputDir}/kfold/{fold}'
    FoldOutputDir = f'{OutputDir}/kfold/{fold}'
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(FoldOutputDir, exist_ok=True)
    with timer(f'Generate numeric features for fold {fold}'):
        for mod in ['valid', 'test']:
            data = pd.read_csv(f'{FoldInputDir}/{mod}.csv')
            data = get_num_feats(data)
            # Everything except the raw text and the labels is a feature
            # (this includes any extra column present in the input CSV).
            cols = [c for c in data.columns if c not in non_feature_cols]
            data = data[cols + targets] if mod == 'valid' else data[cols]
            data.to_csv(f'{FoldOutputDir}/{mod}.csv', float_format='%.8f', index=False)

[Generate numeric features for fold 0] done in 44 s
[Generate numeric features for fold 1] done in 45 s
[Generate numeric features for fold 2] done in 46 s
[Generate numeric features for fold 3] done in 45 s
