In [1]:
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

In [3]:
# Load the training comments, the test comments and the sample submission file.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
subm = pd.read_csv('./data/sample_submission.csv')

# Raw comment text used for fitting, and the target columns that are scored
# one at a time by the cells below.
trainingdata = train.comment_text
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# NOTE(review): this path is rooted differently from the './data/' files above —
# confirm it resolves from the notebook's working directory.
test_labels = pd.read_csv('./Assignment1-3/data/test_labels.csv')
# Keep only test rows whose 'toxic' label is non-negative
# (presumably -1 marks rows without a usable label — confirm against the dataset description).
test_labels_filter = test_labels[test_labels['toxic']>-1]
test_filter = test[test.id.isin(test_labels_filter.id)]

# The scoring code compares predictions for `test_filter` with the columns of
# `test_labels_filter` purely by row position, so both frames must list exactly
# the same ids in the same order. Fail loudly here instead of silently
# reporting a meaningless ROC AUC.
assert np.array_equal(test_filter.id.values, test_labels_filter.id.values), \
    'test_filter and test_labels_filter are not aligned row-by-row on id'

In [4]:
# Baseline: default TF-IDF features with one Multinomial Naive Bayes model per label.
vect = TfidfVectorizer()

# The TF-IDF features do not depend on the label, so vectorize the training and
# evaluation text once instead of refitting the vectorizer for every label.
X_train = vect.fit_transform(trainingdata)
X_test_filter = vect.transform(test_filter.comment_text)

rocs = []
for j in labels:
    print('fit', j)
    clf = MultinomialNB().fit(X_train, train[j])
    # Second predict_proba column = probability of the second entry in clf.classes_
    # (assumes the label columns are 0/1, so this is the positive class — TODO confirm).
    pred_filter = clf.predict_proba(X_test_filter)[:, 1]
    roc = roc_auc_score(test_labels_filter[j], pred_filter)
    print(j, 'ROC AUC:', roc)
    rocs.append(roc)
print('mean column-wise ROC AUC:', np.mean(rocs))

fit toxic
toxic ROC AUC: 0.8736072008934528
fit severe_toxic
severe_toxic ROC AUC: 0.8034275042913465
fit obscene
obscene ROC AUC: 0.8605632134849668
fit threat
threat ROC AUC: 0.7720454361505827
fit insult
insult ROC AUC: 0.8443708296031005
fit identity_hate
identity_hate ROC AUC: 0.810686418268932
mean column-wise ROC AUC: 0.8274501004487301


In [6]:
def toxic_comment (vect):
    """Score one vectorizer configuration with per-label Naive Bayes classifiers.

    Fits ``vect`` on the notebook-level ``trainingdata``, trains a separate
    ``MultinomialNB`` for every column in ``labels``, and prints the ROC AUC of
    each model on ``test_filter`` / ``test_labels_filter`` followed by the mean
    over all labels.

    Parameters
    ----------
    vect : vectorizer exposing ``fit_transform`` and ``transform``
        For example a ``TfidfVectorizer`` or ``CountVectorizer``. It is fitted
        in place by this call.

    Returns
    -------
    None
        The scores are only printed.
    """
    # The features are label-independent: vectorize once up front rather than
    # refitting the vectorizer inside the loop for every label.
    train_features = vect.fit_transform(trainingdata)
    test_features = vect.transform(test_filter.comment_text)

    rocs = []
    for j in labels:
        print('fit', j)
        clf = MultinomialNB().fit(train_features, train[j])
        # Second predict_proba column = probability of the second entry in clf.classes_
        # (assumes the label columns are 0/1, so this is the positive class — TODO confirm).
        pred_filter = clf.predict_proba(test_features)[:, 1]
        roc = roc_auc_score(test_labels_filter[j], pred_filter)
        print(j, 'ROC AUC:', roc)
        rocs.append(roc)
    print('mean column-wise ROC AUC:', np.mean(rocs))

In [7]:
# Tuned TF-IDF configuration evaluated with the same per-label Naive Bayes setup.
toxic_comment(
    TfidfVectorizer(
        min_df=0.00009,      # float => minimum fraction of documents a term must appear in
        max_features=20000,  # cap on the vocabulary size
        ngram_range=(1,3),   # unigrams, bigrams and trigrams
    )
)

fit toxic
toxic ROC AUC: 0.9315681161334363
fit severe_toxic
severe_toxic ROC AUC: 0.9524723394326645
fit obscene
obscene ROC AUC: 0.94440573219987
fit threat
threat ROC AUC: 0.9104192789552188
fit insult
insult ROC AUC: 0.9385631615070469
fit identity_hate
identity_hate ROC AUC: 0.92462126869714
mean column-wise ROC AUC: 0.9336749828208961


### TfidfVectorizer参数含义

min_df - 最小文档频率：整数表示词至少要出现在多少篇文档中，浮点数（如上面的 0.00009）表示至少要出现在多大比例的文档中；低于该阈值的词会被丢弃，从而去掉出现次数很少、没有统计意义的词

max_features - 特征维数上限，即最多取多少个词（或 n-gram）作为特征；超过上限时只保留在整个语料中词频最高的那些

ngram_range - n-gram 的长度范围 (min_n, max_n)，即把连续的几个词连起来作为一个特征；例如对下面这句话：

"windows xp crush"

ngram_range=(1,2):
    windows
    xp
    crush
    windows xp
    xp crush
    
ngram_range=(1,3):
    windows
    xp
    crush
    windows xp
    xp crush
    windows xp crush