In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from crossvalidation import multilabel_cross_validation, multilabel_label_combinations
from multilabel_classifier import MultilabelClassifier
from transform_pipeline import TransformPipeline

from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from functools import lru_cache

from textblob import TextBlob

from collections import OrderedDict

from sklearn.dummy import DummyRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler

from visualizations import topn_features, confusion_matrix

# Dataset preparation

I'll use features based on:

- comment text "as is"
- stemmed/lemmatized comment text
- (maybe) sentiment

In [2]:
# Load the raw train/test CSV files.
dftrain = pd.read_csv("input/train.csv")
dftest = pd.read_csv("input/test.csv")

# Some comment values are parsed as float (missing text comes back as NaN).
# Replace missing values with an empty string first, so they do not become the
# literal word "nan" in the text features, then make every value a plain str.
dftrain['comment_text'] = dftrain['comment_text'].fillna("").map(str)
dftest['comment_text'] = dftest['comment_text'].fillna("").map(str)

In [3]:
# Preview the first rows of the training frame.
dftrain.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [4]:
# Preview the first rows of the test frame.
dftest.head()

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


In [5]:
# One shared English Snowball stemmer, reused by every call below.
stemmer = EnglishStemmer()

@lru_cache(maxsize=30000)
def stem_word(word):
    """Return the stem of ``word``.

    Results are memoised (up to 30000 distinct words) so repeated tokens are
    stemmed only once.
    """
    stem = stemmer.stem(word)
    return stem

# One shared WordNet lemmatizer, reused by every call below.
lemmatizer = WordNetLemmatizer()

@lru_cache(maxsize=30000)
def lemmatize_word(word):
    """Return the lemma of ``word``.

    Results are memoised (up to 30000 distinct words) so repeated tokens are
    lemmatized only once.
    """
    lemma = lemmatizer.lemmatize(word)
    return lemma

def simplify_text(text, simplifier):
    """Normalise one text.

    The text is lower-cased and split with ``wordpunct_tokenize``; ``simplifier``
    is applied to each token and the results are joined with single spaces.
    """
    lowered = text.lower()
    return " ".join(simplifier(token) for token in wordpunct_tokenize(lowered))

def simplify_texts(texts, simplifier):
    """Run ``simplify_text`` over every item of ``texts`` and return the results
    as a list, reporting progress through ``tqdm``."""
    simplified = []
    for text in tqdm(texts):
        simplified.append(simplify_text(text, simplifier))
    return simplified

In [6]:
# Add a stemmed and a lemmatized copy of the comment text to both frames
# (order: stemmed train, stemmed test, lemmatized train, lemmatized test).
for suffix, simplifier in (('stemmed', stem_word), ('lemmatized', lemmatize_word)):
    for frame in (dftrain, dftest):
        frame['comment_text_' + suffix] = simplify_texts(frame['comment_text'], simplifier)

100%|██████████████████████████████████████████████████████████████████████████| 95851/95851 [00:24<00:00, 3969.80it/s]
100%|████████████████████████████████████████████████████████████████████████| 226998/226998 [00:48<00:00, 4725.14it/s]
100%|██████████████████████████████████████████████████████████████████████████| 95851/95851 [00:15<00:00, 6037.28it/s]
100%|████████████████████████████████████████████████████████████████████████| 226998/226998 [00:35<00:00, 6364.01it/s]


In [7]:
def sentiment(df):
    """Compute TextBlob sentiment features for ``df['comment_text']``.

    Returns a tuple of three NumPy arrays, one value per row: polarity,
    subjectivity, and their element-wise product.
    """
    # First pass: wrap every comment in a TextBlob.
    text_blobs = []
    for text in tqdm(df['comment_text']):
        text_blobs.append(TextBlob(text))
    # Second pass: evaluate the sentiment of each blob.
    scores = [blob.sentiment for blob in tqdm(text_blobs)]
    polarity = np.array([score.polarity for score in scores])
    subjectivity = np.array([score.subjectivity for score in scores])
    return polarity, subjectivity, polarity * subjectivity

In [8]:
# Store the three sentiment arrays as columns, first for train, then for test.
for frame in (dftrain, dftest):
    scores = sentiment(frame)
    for column, values in zip(('polarity', 'subjectivity', 'sentiment'), scores):
        frame[column] = values

100%|█████████████████████████████████████████████████████████████████████████| 95851/95851 [00:02<00:00, 39456.15it/s]
100%|███████████████████████████████████████████████████████████████████████████| 95851/95851 [02:41<00:00, 595.33it/s]
100%|███████████████████████████████████████████████████████████████████████| 226998/226998 [00:07<00:00, 29511.06it/s]
100%|█████████████████████████████████████████████████████████████████████████| 226998/226998 [06:55<00:00, 546.59it/s]


In [9]:
# Preview the training frame again, now with the derived text and sentiment columns.
dftrain.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_stemmed,comment_text_lemmatized,polarity,subjectivity,sentiment
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0,"nonsens ? kiss off , geek . what i said is tru...","nonsense ? kiss off , geek . what i said is tr...",0.35,0.65,0.2275
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0,""" pleas do not vandal page , as you did with t...",""" please do not vandalize page , a you did wit...",0.0,0.0,0.0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0,""" """" point of interest """" i remov the """" point...",""" """" point of interest """" i removed the """" poi...",-0.040625,0.771875,-0.031357
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0,ask some his nation is a racial offenc . wow w...,asking some his nationality is a racial offenc...,0.175,0.625,0.109375
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0,the reader here is not go by my say so for eth...,the reader here is not going by my say so for ...,-0.075,0.25,-0.01875


In [2]:
# Reload both frames from the "*_adv.csv" files (train first, then test).
# NOTE(review): no visible cell writes these files; presumably they are the
# enriched frames built above, saved in an earlier session - confirm.
dftrain, dftest = (
    pd.read_csv(path)
    for path in ("input/train_adv.csv", "input/test_adv.csv")
)
# The str conversion used for the raw files is left disabled here:
# dftrain['comment_text'] = dftrain['comment_text'].apply(str) # some values parsed as float
# dftest['comment_text'] = dftest['comment_text'].apply(str) # some values parsed as float

# Dummy model

Let's build a model that returns 0.5 as the probability of every label:

In [3]:
# Constant baseline: one regressor per label, each always predicting 0.5.
clf = MultilabelClassifier(
    [DummyRegressor(strategy='constant', constant=0.5) for _ in range(6)]
)
# The dummy model ignores its input, so a single all-zero column is enough.
placeholder_features = np.zeros([len(dftrain), 1])
label_matrix = np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']])
multilabel_cross_validation(clf, placeholder_features, label_matrix)

array([0.69314718, 0.69314718, 0.69314718])

# Logistic regression baseline

In [4]:
# Baseline on the raw comment text: default TF-IDF features feeding one
# L1-penalised logistic regression per label (six labels).
clf = Pipeline([
    ('vec', TfidfVectorizer()),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(6)
    ]))
])
multilabel_cross_validation(clf,
                            dftrain['comment_text'],
                            np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]))

array([0.0556387 , 0.05629894, 0.05564404])

In [9]:
# Same model on the stemmed text, with a tuned TF-IDF configuration
# (vocabulary capped at 10000 terms, binary counts, sublinear tf).
clf = Pipeline([
    ('vec', TfidfVectorizer(max_features=10000, binary=True, norm='l2', smooth_idf=True, sublinear_tf=True, use_idf=True)),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(6)
    ]))
])
multilabel_cross_validation(clf,
                dftrain['comment_text_stemmed'],
                np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]))

array([0.05290194, 0.05321799, 0.05259117])

In [6]:
# Same baseline on the lemmatized text, with default TF-IDF features.
clf = Pipeline([
    ('vec', TfidfVectorizer()),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(6)
    ]))
])
multilabel_cross_validation(clf,
                dftrain['comment_text_lemmatized'],
                np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]))

array([0.05456107, 0.05511099, 0.05463727])

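So for bare logistic regression, stemmed text is the best choice. (Note that the stemmed run also used a tuned `TfidfVectorizer` configuration, while the raw and lemmatized runs used the defaults, so the comparison is not strictly like-for-like.)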

# Character features

Let's add character n-gram based features (from source text)

In [13]:
# Six-label model on the union of two feature sets:
#   - word TF-IDF over the stemmed text (input column 0),
#   - character 1- to 3-gram TF-IDF over the original text (input column 1).
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer(max_features=6500, binary=True, norm='l2', smooth_idf=True, sublinear_tf=True, use_idf=True)),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(6)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]))

array([0.04972983, 0.04990565, 0.04951608])

In [None]:
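# Cross-validation scores recorded for different values of the word
# vectorizer's max_features (presumably one run per value):
# 6500  array([0.04972983, 0.04990565, 0.04951608])
# 7500  array([0.04974486, 0.0499596 , 0.04946345])
# 8000  array([0.04983992, 0.049903  , 0.04944069])
# 10000 array([0.04984456, 0.04995329, 0.04949518])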

Now let's show the top 30 features from the word list and from the character n-gram list:

In [15]:
# Show the top 30 features for the six-label pipeline built in the previous cell.
topn_features(clf, 30)

(OrderedDict([('fuck', 22.258629928450581),
              ('idiot', 19.496728472845685),
              ('shit', 16.627022092542468),
              ('stupid', 16.321173442144822),
              ('bullshit', 16.234971783888582),
              ('bitch', 15.685465620721198),
              ('ass', 14.566450399722072),
              ('asshol', 13.127424796729347),
              ('pussi', 13.094236593684162),
              ('nigger', 12.802131977647788),
              ('crap', 11.850163237299537),
              ('peni', 11.476548473431224),
              ('nigga', 11.191932448267101),
              ('suck', 11.005929809981092),
              ('kill', 10.942967855170675),
              ('bastard', 10.939489330136974),
              ('cunt', 10.911695491594049),
              ('pathet', 10.696962979232737),
              ('dick', 10.672028146667035),
              ('die', 10.507648532169508),
              ('moron', 10.295281680816011),
              ('faggot', 10.242676674637879),
            

# Per-class error/features visualization

Here, for each class, I'll build:
- cross-validation scores per class
- list of 30 top word/character features
- confusion matrix with:
    - false positive rate (part of negative samples predicted as positive)
    - false negative rate (part of positive samples predicted as negative)
    - true positive rate (part of positive samples predicted as positive)
    - true negative rate (part of negative samples predicted as negative)

## Toxic

In [16]:
# Toxic: cross-validate a single-label model on word + character TF-IDF features.
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(1)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['toxic']]))

array([ 0.105112  ,  0.10259742,  0.1023982 ])

In [17]:
# Show the top 30 features for the "toxic" pipeline built in the previous cell.
topn_features(clf, 30)

(OrderedDict([('fuck', 21.556119614103412),
              ('idiot', 19.317049558444587),
              ('shit', 17.110099475784853),
              ('stupid', 16.36257824598469),
              ('bullshit', 16.26183782367546),
              ('ass', 13.507419437664371),
              ('asshol', 13.16003029944566),
              ('crap', 11.628581292841442),
              ('peni', 11.516648658328211),
              ('bitch', 11.013434505924108),
              ('suck', 11.009842443430925),
              ('bastard', 10.843526134232516),
              ('dick', 10.787729163039733),
              ('pathet', 10.589895397067135),
              ('moron', 10.436272972665206),
              ('faggot', 9.9766246865283144),
              ('hell', 9.0134897970140937),
              ('whore', 8.9041253750908975),
              ('thank', 8.6488614249627904),
              ('nigger', 8.5229018974044202),
              ('cunt', 8.4803818059381708),
              ('nazi', 8.3083609068579403),
              

In [18]:
# Toxic: rebuild the same single-label pipeline and show its confusion matrix.
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(1)
    ]))
])
confusion_matrix(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['toxic']]))

Unnamed: 0,predicted negative,predicted positive
negative,0.990348,0.009652
positive,0.314855,0.685145


## Severe toxic

In [19]:
# Severe toxic: cross-validate a single-label model on word + character TF-IDF features.
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(1)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['severe_toxic']]))

array([ 0.0281903 ,  0.02658158,  0.0273223 ])

In [20]:
# Show the top 30 features for the "severe_toxic" pipeline built in the previous cell.
topn_features(clf, 30)

(OrderedDict([('die', 4.5263704831269633),
              ('filthi', 4.3384082283558136),
              ('anus', 4.3372787615404009),
              ('for', 4.2245073938649496),
              ('you', 3.7642516916357232),
              ('nazi', 3.7507138357978307),
              ('asshol', 3.673707292716796),
              ('rape', 3.6500550640802643),
              ('cunt', 3.4341921683995738),
              ('pathet', 3.3443010832459823),
              ('nigger', 3.2309395092759465),
              ('peni', 3.2199889358307567),
              ('dumb', 3.1817894396460162),
              ('bitch', 3.1378055264838149),
              ('jew', 3.0932868150425117),
              ('shit', 3.0630431375692786),
              ('pleas', 3.055809159036424),
              ('pig', 2.9056214795834578),
              ('bastard', 2.8533346999478373),
              ('moron', 2.8125867279913326),
              ('peopl', 2.7133439357101903),
              ('homosexu', 2.6730763968730566),
              ('talk

In [21]:
# Severe toxic: rebuild the same single-label pipeline and show its confusion matrix.
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(1)
    ]))
])
confusion_matrix(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['severe_toxic']]))

Unnamed: 0,predicted negative,predicted positive
negative,0.996965,0.003035
positive,0.701245,0.298755


## Obscene

In [22]:
# Obscene: cross-validate a single-label model on word + character TF-IDF features.
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(1)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['obscene']]))

array([ 0.05844263,  0.06063497,  0.06012041])

In [23]:
# Show the top 30 features for the "obscene" pipeline built in the previous cell.
topn_features(clf, 30)

(OrderedDict([('fuck', 18.266791247746649),
              ('bitch', 16.362478869621985),
              ('ass', 14.853128494532696),
              ('bullshit', 14.653320957807894),
              ('asshol', 13.018990465908336),
              ('pussi', 12.920922736131878),
              ('shit', 11.540668736325051),
              ('cunt', 11.00799444571949),
              ('bastard', 10.735868196959578),
              ('stupid', 9.6381743695724147),
              ('crap', 9.3555266637787309),
              ('idiot', 9.2377273259359036),
              ('dick', 8.5019932226675792),
              ('peni', 8.0180774082955129),
              ('damn', 7.8829963802860625),
              ('cock', 7.3013123759169609),
              ('suck', 7.2489875723766888),
              ('faggot', 7.1858384462856506),
              ('dumbass', 7.1558718104891899),
              ('dickhead', 7.1168858959576218),
              ('wtf', 7.0828228964832061),
              ('cocksuck', 6.9772820597801841),
        

In [24]:
# Obscene: rebuild the same single-label pipeline and show its confusion matrix.
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(1)
    ]))
])
confusion_matrix(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['obscene']]))

Unnamed: 0,predicted negative,predicted positive
negative,0.995151,0.004849
positive,0.285826,0.714174


## Threat

In [25]:
# Threat: cross-validate a single-label model on word + character TF-IDF features.
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(1)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['threat']]))

array([ 0.01229396,  0.01140302,  0.01157844])

In [26]:
# Show the top 30 features for the "threat" pipeline built in the previous cell.
topn_features(clf, 30)

(OrderedDict([('die', 10.647688909907837),
              ('kill', 10.345985014225247),
              ('shoot', 10.093137263103564),
              ('will', 7.1736492500765685),
              ('rape', 6.6852856630364226),
              ('death', 6.6221636896929965),
              ('cut', 5.7417548014601687),
              ('burn', 5.2044836931849563),
              ('you', 4.8260395358764923),
              ('stab', 4.5506834263939799),
              ('gonna', 4.5303218631765532),
              ('your', 4.5142581595666247),
              ('ll', 4.4851084387028619),
              ('deserv', 4.4621357510804671),
              ('ass', 4.4103939918060062),
              ('ya', 4.2604268752057264),
              ('punch', 4.0920921424254502),
              ('dead', 3.9608575979529363),
              ('kick', 3.5578818157611276),
              ('the', 3.5088202325086724),
              ('thank', 3.2971763605901847),
              ('hous', 3.0832267409827758),
              ('it', 3.05390099055

In [27]:
# Threat: rebuild the same single-label pipeline and show its confusion matrix.
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(1)
    ]))
])
confusion_matrix(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['threat']]))

Unnamed: 0,predicted negative,predicted positive
negative,0.999539,0.000461
positive,0.828947,0.171053


## Insult

In [28]:
# Insult: cross-validate a single-label model on word + character TF-IDF features.
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(1)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['insult']]))

array([ 0.07649148,  0.0766289 ,  0.07866396])

In [29]:
# Show the top 30 features for the "insult" pipeline built in the previous cell.
topn_features(clf, 30)

(OrderedDict([('idiot', 14.61493401320846),
              ('stupid', 10.460707863000124),
              ('bitch', 10.068198552822121),
              ('asshol', 9.3061777198414628),
              ('bastard', 9.2856195589938437),
              ('moron', 9.121413446122336),
              ('loser', 7.9746274054634316),
              ('faggot', 7.8300219514181473),
              ('cunt', 7.5646356738279161),
              ('ass', 6.7678215949833813),
              ('fool', 6.7641099580020558),
              ('dickhead', 6.6816746161238454),
              ('pathet', 6.6719829595830022),
              ('retard', 6.6661075767170868),
              ('pig', 6.6304465355617435),
              ('goddamn', 5.4838582697541458),
              ('jerk', 5.2033711209353175),
              ('nigger', 5.1427837628813027),
              ('whore', 5.1291570014010208),
              ('fat', 5.0383341804619866),
              ('thank', 5.030816872102152),
              ('scum', 4.9127856365103799),
          

In [30]:
# Insult: rebuild the same single-label pipeline and show its confusion matrix.
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(1)
    ]))
])
confusion_matrix(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['insult']]))

Unnamed: 0,predicted negative,predicted positive
negative,0.991964,0.008036
positive,0.440806,0.559194


## Identity hate

In [31]:
# Identity hate: cross-validate a single-label model on word + character TF-IDF features.
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(1)
    ]))
])
multilabel_cross_validation(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['identity_hate']]))

array([ 0.02691722,  0.02579108,  0.02610433])

In [32]:
# Show the top 30 features for the "identity_hate" pipeline built in the previous cell.
topn_features(clf, 30)

(OrderedDict([('nigger', 12.764406797245742),
              ('nigga', 11.153887367146481),
              ('homosexu', 9.6661401511341953),
              ('gay', 9.4736620791336961),
              ('jew', 7.4960558066314116),
              ('homo', 6.8583479533617346),
              ('black', 6.5465079342987682),
              ('muslim', 6.3840189731608463),
              ('nazi', 6.2521839762790714),
              ('talk', 6.2307795578939107),
              ('faggot', 6.0887844521414705),
              ('negro', 5.3776978496225913),
              ('american', 4.9083247662213232),
              ('fagot', 4.4063654082752919),
              ('racist', 4.2579558026969382),
              ('turk', 4.0178618178692549),
              ('mexican', 3.9855816742388699),
              ('asian', 3.8882240504775378),
              ('paki', 3.8665816350835498),
              ('semit', 3.7121166563254069),
              ('fucker', 3.6795101881780741),
              ('thank', 3.6275443289448104),
      

In [33]:
# Identity hate: rebuild the same single-label pipeline and show its confusion matrix.
clf = Pipeline([
    ('vec', FeatureUnion([
        ('words', TransformPipeline([
            # Select the first input column (the stemmed text).
            ('stemmed', FunctionTransformer(lambda X: X[:, 0], validate=False)),
            ('vec', TfidfVectorizer()),
        ])),
        ('chars', TransformPipeline([
            # Select the second input column (the original text); case is kept.
            ('text', FunctionTransformer(lambda X: X[:, 1], validate=False)),
            ('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)),
        ])),
    ])),
    ('clf', MultilabelClassifier([
        # Name the solver explicitly: liblinear supports the L1 penalty, whereas
        # lbfgs (the default since scikit-learn 0.22) raises an error for it.
        LogisticRegression(penalty='l1', solver='liblinear')
        for _ in range(1)
    ]))
])
confusion_matrix(clf,
                dftrain[['comment_text_stemmed', 'comment_text']],
                np.array(dftrain[['identity_hate']]))

Unnamed: 0,predicted negative,predicted positive
negative,0.998864,0.001136
positive,0.754902,0.245098
