In [28]:
import pandas as pd
import re
import numpy as np
from itertools import chain
from tqdm import tqdm_notebook as tqdm

from crossvalidation import multilabel_label_combinations, multilabel_cross_validation
from transform_pipeline import TransformPipeline
from sklearn.model_selection import train_test_split

import editdistance
from functools import lru_cache

from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion

from scipy.sparse import csr_matrix
from ipywidgets import IntProgress
from IPython.display import display
from math import ceil

from nltk.stem.snowball import EnglishStemmer

In [2]:
dftrain = pd.read_csv("input/train.csv")
dftrain['comment_text'] = dftrain['comment_text'].apply(str)
dftest = pd.read_csv("input/test.csv")
dftest['comment_text'] = dftest['comment_text'].apply(str)

In [3]:
def tokenize(text):
    delimeter = "([?\\/.,`~!@#4%^&*-+\[\]{}<>'\"]*[ \s\n\t\r]+)"
    tokens = re.split(delimeter, text + " ")
    stripped_tokens = map(str.strip, tokens)
    noempty_tokens = filter(bool, stripped_tokens)
    return list(noempty_tokens)


def preprocess(text):
    text = text.strip()
    text = text.replace('\\n', ' ').replace('\\r', ' ').replace('\\t', ' ')
    text = text.lower()
    return " ".join(tokenize(text))

In [4]:
dftrain['preprocessed_text'] = dftrain['comment_text'].apply(preprocess)
dftest['preprocessed_text'] = dftest['comment_text'].apply(preprocess)

In [5]:
dftrain.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0,"nonsense ? kiss off , geek . what i said is tr..."
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0,""" please do not vandalize pages , as you did w..."
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0,""" """"points of interest """" i removed the """"poin..."
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,asking some his nationality is a racial offenc...
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,the reader here is not going by my say so for ...


In [6]:
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [7]:
y = np.array(dftrain[targets])
labelcombination_y = np.zeros([len(dftrain)], dtype=np.int)
for i, row in enumerate(multilabel_label_combinations(y, 2)):
    labelcombination_y[np.all(y == row, axis=1)] = i

In [8]:
preprocessed_texts = np.array(dftrain['preprocessed_text'])

In [9]:
train_idx, val_idx, _, _ = train_test_split(np.arange(len(dftrain), dtype=np.int), 
                                            labelcombination_y, 
                                            stratify=labelcombination_y,
                                            random_state=42)

In [10]:
def toptoxic_words(texts, y):
    clf = Pipeline([
        ('vec', TfidfVectorizer(tokenizer=lambda s: s.split(' '), binary=True, min_df=5, max_df=0.9)),
        ('clf', LogisticRegression(penalty='l1'))    
    ])
    clf.fit(texts, (1 * (y.sum(axis=1) > 1)))
    weights = np.abs(clf.steps[1][1].coef_[0])
    topwords_idx = weights > 0.025
    topwords = np.array(clf.steps[0][1].get_feature_names())[topwords_idx]
    return topwords

In [11]:
topwords = toptoxic_words(preprocessed_texts[train_idx], y[train_idx])

In [12]:
def word_similarity(word1, word2):
    meanlen = (len(word1) + len(word2)) / 2
    return 1 - editdistance.eval(word1, word2) / meanlen

def sentence_word_similarity_features(texts, feature_words):
    def _chunks(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]
            
    def _text_similarity(words, wordset, similarities, text):
        text_words = set(text.split(' '))
        idx = [words.index(word) for word in text_words if word in wordset]
        return similarities[idx].max()
    
    countvec = CountVectorizer(binary=True, tokenizer=lambda s: s.split(' '))
    counts = countvec.fit_transform(texts)
    words = countvec.get_feature_names()
    word_chunk_size = 128
    similarities_progress = IntProgress(min=0, max=ceil(len(words) / word_chunk_size))
    
    result = {}
    
    for i, feature_word in enumerate(tqdm(feature_words)):
        if i == 0:
            display(similarities_progress)
        
        similarities_progress.value = 0
        similarities = np.zeros([len(words)])
        k = 0
        for j, words_chunk in enumerate(_chunks(words, word_chunk_size)):
            for word in words_chunk:
                similarities[k] = word_similarity(word, feature_word)
                k += 1
            similarities_progress.value += 128
        similarities_progress.value = similarities_progress.max
        
        count_similarities = counts.multiply(csr_matrix(similarities))
        feature_similarities = np.array([
            count_similarities[j].max()
            for j in range(len(texts))
        ])
        result[feature_word] = feature_similarities
    return result

In [13]:
topword_similarities = sentence_word_similarity_features(list(dftrain['preprocessed_text']) + list(dftest['preprocessed_text']),
                                                         topwords)
for key, similarities in topword_similarities.items():
    dftrain['lev_' + key] = similarities[:len(dftrain)]
    dftest['lev_' + key] = similarities[-len(dftest):]




In [23]:
levenshtein_features = ['lev_' + word for word in topwords]
levenshtein_features

['lev_!',
 'lev_!!',
 'lev_!!!',
 'lev_!!!!',
 'lev_!!!!!!',
 'lev_!!!!!!!',
 'lev_!!!!!!!!!',
 'lev_!!!!!!!!!!!!',
 'lev_!"',
 'lev_"',
 'lev_""fuck',
 'lev_""fucking',
 "lev_''''''",
 'lev_(',
 'lev_(talk)',
 'lev_(utc)',
 'lev_)',
 'lev_***',
 'lev_-',
 'lev_.',
 'lev_."',
 'lev_...',
 'lev_/',
 'lev_2008',
 'lev_3',
 'lev_==',
 'lev_?',
 'lev_???',
 'lev_]]',
 'lev_a',
 'lev_about',
 'lev_abuse',
 'lev_account',
 'lev_actions',
 'lev_added',
 'lev_admins',
 'lev_adolf',
 'lev_after',
 'lev_again',
 'lev_agree',
 "lev_ain't",
 'lev_all',
 'lev_alone',
 'lev_also',
 'lev_am',
 'lev_anal',
 'lev_and',
 'lev_answer',
 'lev_anti-semite',
 'lev_anti-vandalism',
 'lev_anus',
 'lev_any',
 'lev_anyway',
 'lev_arab',
 'lev_are',
 'lev_arrogant',
 'lev_arse',
 'lev_article',
 'lev_articles',
 'lev_as',
 'lev_ask',
 'lev_ass',
 'lev_asshole',
 'lev_assholes',
 'lev_at',
 'lev_away',
 'lev_back',
 'lev_bag',
 'lev_balls',
 'lev_ban',
 'lev_barnstar',
 'lev_bastard',
 'lev_bastards',
 'lev_be',


In [14]:
dftrain.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text,lev_!,...,lev_y,lev_yeah,lev_yes,lev_yo,lev_you,lev_you're,lev_your,lev_youre,lev_}},lev_•
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0,"nonsense ? kiss off , geek . what i said is tr...",0.0,...,0.0,0.25,0.2,0.333333,0.714286,0.6,1.0,0.777778,0.0,0.0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0,""" please do not vandalize pages , as you did w...",0.0,...,0.0,0.25,0.333333,0.6,1.0,0.333333,0.714286,0.5,0.0,0.0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0,""" """"points of interest """" i removed the """"poin...",0.0,...,0.333333,0.5,0.333333,0.6,1.0,0.6,1.0,0.777778,1.0,0.0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,asking some his nationality is a racial offenc...,0.0,...,0.0,0.25,0.333333,0.333333,0.714286,0.6,1.0,0.777778,0.0,0.0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,the reader here is not going by my say so for ...,0.0,...,0.333333,0.25,0.333333,0.6,1.0,0.384615,0.714286,0.5,0.0,0.0


In [17]:
dftrain.to_csv('train.csv', index=None)

In [18]:
dftest.to_csv('test.csv', index=None)

In [20]:
stemmer = EnglishStemmer()

@lru_cache(30000)
def stem_word(word):
    return stemmer.stem(word)

def stem_text(text):
    tokens = text.split(' ')
    return ' '.join(map(stem_word, tokens))

def stem_texts(texts):
    return [stem_text(text) for text in tqdm(texts)]

In [21]:
dftrain['stemmed_text'] = stem_texts(dftrain['preprocessed_text'])
dftest['stemmed_text'] = stem_texts(dftest['preprocessed_text'])







In [24]:
stemmed_texts = np.array(dftrain['stemmed_text'])

In [26]:
errors = []
for i, target in enumerate(targets):
    clf = Pipeline([
        ('vec', TfidfVectorizer(binary=True, min_df=3, tokenizer=lambda s: s.split(' '))),
        ('clf', LogisticRegression(penalty='l1'))
    ])
    clf.fit(stemmed_texts[train_idx], y[train_idx][:, i])
    error = log_loss(y[val_idx][:, i], clf.predict_proba(stemmed_texts[val_idx])[:, 1])
    print('{0} loss: '.format(target), error)
    errors.append(error)
print('Total loss: ', np.array(errors).mean())

toxic loss:  0.110571544541
severe_toxic loss:  0.0285683947691
obscene loss:  0.0617498290975
threat loss:  0.0121844556649
insult loss:  0.082138605912
identity_hate loss:  0.0263262219214
Total loss:  0.0535898419844


In [36]:
errors = []
for i, target in enumerate(targets):
    clf = Pipeline([
        ('vec', FeatureUnion([
            ('tfidf', TransformPipeline([
                ('text', FunctionTransformer(
                    lambda X: X[:, 0],
                    validate=False
                )),
                ('vec', TfidfVectorizer(binary=True, min_df=3, tokenizer=lambda s: s.split(' ')))
            ])),
            ('levenshtein', TransformPipeline([
                ('features', FunctionTransformer(
                    lambda X: np.array(X[:, 1:1+len(levenshtein_features)]).astype(np.float),
                    validate=False
                )),
            ])),
        ])),
        ('clf', LogisticRegression(penalty='l1'))
    ])
    X = np.array(dftrain[['stemmed_text'] + levenshtein_features])
    clf.fit(X[train_idx], y[train_idx][:, i])
    error = log_loss(y[val_idx][:, i], clf.predict_proba(X[val_idx])[:, 1])
    print('{0} loss: '.format(target), error)
    errors.append(error)
print('Total loss: ', np.array(errors).mean())

toxic loss:  0.109430148933
severe_toxic loss:  0.026859392916
obscene loss:  0.0607447117394
threat loss:  0.0118839611236
insult loss:  0.0781660489122
identity_hate loss:  0.0260515706028
Total loss:  0.0521893057045


In [37]:
errors = []
for i, target in enumerate(targets):
    clf = Pipeline([
        ('vec', FeatureUnion([
            ('tfidf', TransformPipeline([
                ('text', FunctionTransformer(
                    lambda X: X[:, 0],
                    validate=False
                )),
                ('vec', TfidfVectorizer(binary=True, min_df=3, tokenizer=lambda s: s.split(' ')))
            ])),
            ('levenshtein', TransformPipeline([
                ('features', FunctionTransformer(
                    lambda X: np.array(X[:, 1:1+len(levenshtein_features)]).astype(np.float),
                    validate=False
                )),
                ('idf', TfidfTransformer()),
            ])),
        ])),
        ('clf', LogisticRegression(penalty='l1'))
    ])
    X = np.array(dftrain[['stemmed_text'] + levenshtein_features])
    clf.fit(X[train_idx], y[train_idx][:, i])
    error = log_loss(y[val_idx][:, i], clf.predict_proba(X[val_idx])[:, 1])
    print('{0} loss: '.format(target), error)
    errors.append(error)
print('Total loss: ', np.array(errors).mean())

toxic loss:  0.107130806604
severe_toxic loss:  0.0263669424172
obscene loss:  0.0586768704664
threat loss:  0.0119581965625
insult loss:  0.0776366380974
identity_hate loss:  0.0255054268334
Total loss:  0.0512124801636


In [38]:
errors = []
for i, target in enumerate(targets):
    clf = Pipeline([
        ('vec', FeatureUnion([
            ('tfidf', TransformPipeline([
                ('text', FunctionTransformer(
                    lambda X: X[:, 0],
                    validate=False
                )),
                ('vec', TfidfVectorizer(binary=True, min_df=3, tokenizer=lambda s: s.split(' ')))
            ])),
            ('chars', TransformPipeline([
                ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
                ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
            ])),
            ('levenshtein', TransformPipeline([
                ('features', FunctionTransformer(
                    lambda X: np.array(X[:, 2:2+len(levenshtein_features)]).astype(np.float),
                    validate=False
                )),
                ('idf', TfidfTransformer()),
            ])),
        ])),
        ('clf', LogisticRegression(penalty='l1'))
    ])
    X = np.array(dftrain[['stemmed_text', 'comment_text'] + levenshtein_features])
    clf.fit(X[train_idx], y[train_idx][:, i])
    error = log_loss(y[val_idx][:, i], clf.predict_proba(X[val_idx])[:, 1])
    print('{0} loss: '.format(target), error)
    errors.append(error)
print('Total loss: ', np.array(errors).mean())

toxic loss:  0.100878550564
severe_toxic loss:  0.0255812745147
obscene loss:  0.0567030156597
threat loss:  0.0117274833456
insult loss:  0.0754305249903
identity_hate loss:  0.0250575666985
Total loss:  0.0492297359622
