In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from crossvalidation import multilabel_cross_validation
from multilabel_classifier import MultilabelClassifier
from transform_pipeline import TransformPipeline

from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from functools import lru_cache

from textblob import TextBlob

from collections import OrderedDict

from sklearn.dummy import DummyRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler

# Dataset preparation

I'll use features based on:

- comment text "as is"
- stemmed/lemmatized comment text
- (maybe) sentiment

In [2]:
dftrain = pd.read_csv("input/train.csv")
dftrain['comment_text'] = dftrain['comment_text'].apply(str) # some values parsed as float
dftest = pd.read_csv("input/test.csv")
dftest['comment_text'] = dftest['comment_text'].apply(str) # some values parsed as float

In [3]:
dftrain.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [4]:
dftest.head()

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


In [5]:
stemmer = EnglishStemmer()

@lru_cache(30000)
def stem_word(word):
    return stemmer.stem(word)

lemmatizer = WordNetLemmatizer()

@lru_cache(30000)
def lemmatize_word(word):
    return lemmatizer.lemmatize(word)

def simplify_text(text, simplifier):
    tokens = wordpunct_tokenize(text.lower())
    simplified_tokens = map(simplifier, tokens)
    return " ".join(simplified_tokens)

def simplify_texts(texts, simplifier):
    return [simplify_text(text, simplifier) for text in tqdm(texts)]

In [6]:
dftrain['comment_text_stemmed'] = simplify_texts(dftrain['comment_text'], stem_word)
dftest['comment_text_stemmed'] = simplify_texts(dftest['comment_text'], stem_word)
dftrain['comment_text_lemmatized'] = simplify_texts(dftrain['comment_text'], lemmatize_word)
dftest['comment_text_lemmatized'] = simplify_texts(dftest['comment_text'], lemmatize_word)

100%|██████████████████████████████████████████████████████████████████████████| 95851/95851 [00:17<00:00, 5506.56it/s]
100%|████████████████████████████████████████████████████████████████████████| 226998/226998 [00:42<00:00, 5358.53it/s]
100%|██████████████████████████████████████████████████████████████████████████| 95851/95851 [00:14<00:00, 6462.49it/s]
100%|████████████████████████████████████████████████████████████████████████| 226998/226998 [00:27<00:00, 8211.61it/s]


In [7]:
def sentiment(df):
    blobs = [TextBlob(text) for text in tqdm(df['comment_text'])]
    sentiments = [blob.sentiment for blob in tqdm(blobs)]
    polatity = np.array([sentiment.polarity for sentiment in sentiments])
    subjectivity = np.array([sentiment.subjectivity for sentiment in sentiments])
    sentiment = polatity * subjectivity
    return polatity, subjectivity, sentiment

In [8]:
dftrain['polarity'], dftrain['subjectivity'], dftrain['sentiment'] = sentiment(dftrain)
dftest['polarity'], dftest['subjectivity'], dftest['sentiment'] = sentiment(dftest)

100%|█████████████████████████████████████████████████████████████████████████| 95851/95851 [00:01<00:00, 48018.71it/s]
100%|███████████████████████████████████████████████████████████████████████████| 95851/95851 [02:07<00:00, 750.74it/s]
100%|███████████████████████████████████████████████████████████████████████| 226998/226998 [00:06<00:00, 35007.00it/s]
100%|█████████████████████████████████████████████████████████████████████████| 226998/226998 [05:37<00:00, 672.65it/s]


In [9]:
dftrain.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_stemmed,comment_text_lemmatized,polarity,subjectivity,sentiment
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0,"nonsens ? kiss off , geek . what i said is tru...","nonsense ? kiss off , geek . what i said is tr...",0.35,0.65,0.2275
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0,""" pleas do not vandal page , as you did with t...",""" please do not vandalize page , a you did wit...",0.0,0.0,0.0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0,""" """" point of interest """" i remov the """" point...",""" """" point of interest """" i removed the """" poi...",-0.040625,0.771875,-0.031357
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,ask some his nation is a racial offenc . wow w...,asking some his nationality is a racial offenc...,0.175,0.625,0.109375
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,the reader here is not go by my say so for eth...,the reader here is not going by my say so for ...,-0.075,0.25,-0.01875


# Dummy model

Let's build model that return 0.5 for each label probability:

In [10]:
clf = MultilabelClassifier([
    DummyRegressor(strategy='constant', constant=0.5)
    for _ in range(6)
])
multilabel_cross_validation(clf,
                            np.zeros([len(dftrain), 1]),
                            np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]))

array([ 0.69314718,  0.69314718,  0.69314718])

# Logistic regression baseline

In [11]:
clf = Pipeline([
    ('vec', TfidfVectorizer()),
    ('clf', MultilabelClassifier([
        LogisticRegression(penalty='l1')
        for _ in range(6)
    ]))
])
multilabel_cross_validation(clf,
                            dftrain['comment_text'],
                            np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]))

array([ 0.05563897,  0.05629918,  0.05564405])

In [12]:
clf = Pipeline([
    ('vec', TfidfVectorizer()),
    ('clf', MultilabelClassifier([
        LogisticRegression(penalty='l1')
        for _ in range(6)
    ]))
])
multilabel_cross_validation(clf,
                dftrain['comment_text_stemmed'],
                np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]))

array([ 0.05410874,  0.05448132,  0.05392603])

In [13]:
clf = Pipeline([
    ('vec', TfidfVectorizer()),
    ('clf', MultilabelClassifier([
        LogisticRegression(penalty='l1')
        for _ in range(6)
    ]))
])
multilabel_cross_validation(clf,
                dftrain['comment_text_lemmatized'],
                np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]))

array([ 0.05456093,  0.05511094,  0.05463724])

So for bare logistic regression - stemmed text is a best choise.

# Character features

Let's add character n-gram based features (from source text)

In [14]:
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        LogisticRegression(penalty='l1')
        for _ in range(6)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]))

array([ 0.05096648,  0.05124373,  0.05062442])

And let's show top 30 features - from words list and from character n-grams list:

In [15]:
def topn_features(clf, n):
    weights = np.abs(np.stack([
        estimator.coef_[0]
        for estimator in clf.steps[1][1].estimators
    ]).max(axis=0))
    word_tfidf = clf.steps[0][1].transformer_list[0][1].steps[1][1]
    words = np.array(word_tfidf.get_feature_names())
    chars_tfidf = clf.steps[0][1].transformer_list[1][1].steps[1][1]
    chars = np.array(chars_tfidf.get_feature_names())
    word_weights = weights[0:len(words)]
    char_weights = weights[len(words): len(words) + len(chars)]
    sorted_word_weights_indices = word_weights.argsort()[::-1]
    sorted_char_weights_indices = char_weights.argsort()[::-1]
    top_words = words[sorted_word_weights_indices[:n]]
    top_words_weights = word_weights[sorted_word_weights_indices[:n]]
    top_words_dict = OrderedDict([
        (word, weight)
        for word, weight in zip(top_words, top_words_weights)
    ])
    top_chars = chars[sorted_char_weights_indices[:n]]
    top_chars_weights = char_weights[sorted_char_weights_indices[:n]]
    top_chars_dict = OrderedDict([
        (char, weight)
        for char, weight in zip(top_chars, top_chars_weights)
    ])
    return top_words_dict, top_chars_dict

In [16]:
topn_features(clf, 30)

(OrderedDict([('fuck', 22.256612793114751),
              ('idiot', 19.499189613074524),
              ('shit', 16.627050488263766),
              ('stupid', 16.320979068380609),
              ('bullshit', 16.236003420307831),
              ('bitch', 15.682711191483374),
              ('ass', 14.570270833755579),
              ('asshol', 13.1267843780647),
              ('pussi', 13.091680790135561),
              ('nigger', 12.802405334832791),
              ('crap', 11.852676828939789),
              ('peni', 11.476155729194247),
              ('nigga', 11.189665961834825),
              ('suck', 11.003739007093383),
              ('kill', 10.942203500100117),
              ('bastard', 10.939507236513503),
              ('cunt', 10.910946776785497),
              ('pathet', 10.696496106666764),
              ('dick', 10.672477301479374),
              ('die', 10.506547418470959),
              ('moron', 10.295647343955036),
              ('faggot', 10.241505137630787),
              

# Per-class error/features visualization

## Toxic

In [17]:
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        LogisticRegression(penalty='l1')
        for _ in range(1)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['toxic']]))

array([ 0.10511021,  0.10260038,  0.10240544])

In [18]:
topn_features(clf, 30)

(OrderedDict([('fuck', 21.556863310636921),
              ('idiot', 19.310399690208996),
              ('shit', 17.108999100303659),
              ('stupid', 16.373437740915879),
              ('bullshit', 16.262366358222966),
              ('ass', 13.507173437598825),
              ('asshol', 13.160349136903445),
              ('crap', 11.627398715155017),
              ('peni', 11.515223645680223),
              ('bitch', 11.01325263311629),
              ('suck', 11.009684635219401),
              ('bastard', 10.844944458429749),
              ('dick', 10.793772454481308),
              ('pathet', 10.593332732154837),
              ('moron', 10.43499772724507),
              ('faggot', 9.9707598124878185),
              ('hell', 9.0134137320652066),
              ('whore', 8.8995634547882965),
              ('thank', 8.6532420679754409),
              ('nigger', 8.5201402199601173),
              ('cunt', 8.4793882287285207),
              ('nazi', 8.3095274484037756),
             

## Severe toxic

In [19]:
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        LogisticRegression(penalty='l1')
        for _ in range(1)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['severe_toxic']]))

array([ 0.02819114,  0.02658199,  0.02732254])

In [20]:
topn_features(clf, 30)

(OrderedDict([('die', 4.5241908895974863),
              ('anus', 4.3392228638903827),
              ('filthi', 4.3387998833207755),
              ('for', 4.2208991381500809),
              ('you', 3.7871714817410589),
              ('nazi', 3.7512498891152219),
              ('asshol', 3.6754582798040727),
              ('rape', 3.6512539462794114),
              ('cunt', 3.4334216260871395),
              ('pathet', 3.3461932965370935),
              ('nigger', 3.2294079384615459),
              ('peni', 3.219114746132604),
              ('dumb', 3.1812718147254953),
              ('bitch', 3.1390526249996693),
              ('jew', 3.0912162754395496),
              ('shit', 3.0638882873775213),
              ('pleas', 3.0562066519661233),
              ('pig', 2.9058545970879819),
              ('bastard', 2.8541498100552416),
              ('moron', 2.8114795948168707),
              ('peopl', 2.7147424376282676),
              ('talk', 2.670487638078356),
              ('homosexu

## Obscene

In [21]:
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        LogisticRegression(penalty='l1')
        for _ in range(1)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['obscene']]))

array([ 0.05844435,  0.06063655,  0.06011712])

In [22]:
topn_features(clf, 30)

(OrderedDict([('fuck', 18.265656261246004),
              ('bitch', 16.360600632215004),
              ('ass', 14.854779003131794),
              ('bullshit', 14.654044134853201),
              ('asshol', 13.017216149297594),
              ('pussi', 12.921839077354855),
              ('shit', 11.541233293174219),
              ('cunt', 11.010894692406449),
              ('bastard', 10.731539432597934),
              ('stupid', 9.6411276001351158),
              ('crap', 9.356520011588275),
              ('idiot', 9.2372485344475184),
              ('dick', 8.5039668448287582),
              ('peni', 8.0166724165648269),
              ('damn', 7.8817247511098056),
              ('cock', 7.3046687909445769),
              ('suck', 7.2465326006490978),
              ('faggot', 7.1794684107467779),
              ('dumbass', 7.1552196934730672),
              ('dickhead', 7.1131029460335808),
              ('wtf', 7.0822914791625573),
              ('cocksuck', 6.9768330391335924),
        

## Threat

In [23]:
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        LogisticRegression(penalty='l1')
        for _ in range(1)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['threat']]))

array([ 0.0122937 ,  0.01140329,  0.01157872])

In [24]:
topn_features(clf, 30)

(OrderedDict([('die', 10.648476461392372),
              ('kill', 10.346868433147195),
              ('shoot', 10.092111092819124),
              ('will', 7.1737946794102445),
              ('rape', 6.6868067914182738),
              ('death', 6.6192326359451199),
              ('cut', 5.7428963524895114),
              ('burn', 5.2066808509506979),
              ('you', 4.8308704557227102),
              ('stab', 4.5497406099194473),
              ('gonna', 4.5306958169627842),
              ('your', 4.5148335351371927),
              ('ll', 4.4859291891961535),
              ('deserv', 4.4600400036417165),
              ('ass', 4.4152586350877732),
              ('ya', 4.2623563648853375),
              ('punch', 4.0960584178586581),
              ('dead', 3.9628381602954295),
              ('kick', 3.5617121602747956),
              ('the', 3.505404550580459),
              ('thank', 3.2945355269076653),
              ('hous', 3.0875479127600878),
              ('it', 3.044762689016

## Insult

In [25]:
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        LogisticRegression(penalty='l1')
        for _ in range(1)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['insult']]))

array([ 0.07649189,  0.07662821,  0.07866182])

In [26]:
topn_features(clf, 30)

(OrderedDict([('idiot', 14.616490335233244),
              ('stupid', 10.457621118352366),
              ('bitch', 10.072553193582161),
              ('asshol', 9.3076422786496469),
              ('bastard', 9.2867052743397807),
              ('moron', 9.1222214723918338),
              ('loser', 7.9753045257333133),
              ('faggot', 7.8295440840863764),
              ('cunt', 7.5671112350704579),
              ('ass', 6.7679912424449986),
              ('fool', 6.7654298821606558),
              ('dickhead', 6.6830191102982415),
              ('pathet', 6.6703511825327464),
              ('retard', 6.6669358744124816),
              ('pig', 6.6309203480658585),
              ('goddamn', 5.4861742467558434),
              ('jerk', 5.2029577153186706),
              ('nigger', 5.1395652810676067),
              ('whore', 5.1287129525926769),
              ('fat', 5.0401393490227546),
              ('thank', 5.0328154603322188),
              ('scum', 4.9166954302241264),
       

## Identity hate

In [27]:
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        LogisticRegression(penalty='l1')
        for _ in range(1)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['identity_hate']]))

array([ 0.02691533,  0.02579004,  0.02610342])

In [28]:
topn_features(clf, 30)

(OrderedDict([('nigger', 12.759037521275481),
              ('nigga', 11.147609260536905),
              ('homosexu', 9.6700961660905929),
              ('gay', 9.4781637261634142),
              ('jew', 7.4979543549309771),
              ('homo', 6.8577903362231885),
              ('black', 6.5458585084710936),
              ('muslim', 6.3850104875205567),
              ('nazi', 6.2518252347333476),
              ('talk', 6.2302387818109324),
              ('faggot', 6.0820215947901248),
              ('negro', 5.376064331910646),
              ('american', 4.910696050893554),
              ('fagot', 4.4042940246812901),
              ('racist', 4.2587095248069184),
              ('turk', 4.0208334741833331),
              ('mexican', 3.9881974726680509),
              ('asian', 3.8862894599380109),
              ('paki', 3.8678541453647681),
              ('semit', 3.7136666716299742),
              ('fucker', 3.6797347055951355),
              ('thank', 3.6281273742440261),
        

In [32]:
from nltk import corpus
import re

In [33]:
set("abdc")

{'a', 'b', 'c', 'd'}

In [35]:
stopwords = set(corpus.stopwords.words('english')) | set("[]`()[]{},<>/\?")

def clean_text(text):
    tokens = wordpunct_tokenize(text)
    filtered = filter(lambda token: token.lower() not in stopwords, tokens)
    return " ".join(filtered)

def clean_texts(texts):
    return [clean_text(text) for text in tqdm(texts)]