### 加载并查看数据集

In [1]:
import pandas as pd, numpy as np

In [2]:
# Read the three competition CSVs into DataFrames, in this order:
# training set, test set, and the sample submission file.
train, test, subm = map(
    pd.read_csv,
    ('./data/train.csv', './data/test.csv', './data/sample_submission.csv'),
)

In [3]:
# Preview the first five training rows (id, comment text, six label columns).
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Show the complete, untruncated text of the comment stored under index label 0.
train['comment_text'][0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [5]:
# Same lookup for index label 1, using bracket-style column access.
train['comment_text'][1]

"D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)"

In [6]:
# Attribute-style column access; selects the same column as train['comment_text'].
train.comment_text[1]

"D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)"

### 参考上课的NB例子，提取特征

In [7]:
# Raw comment strings used as the model input for every label.
trainingdata = train['comment_text']

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: learn the vocabulary from the training comments and
# encode them as a document-term count matrix in a single pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=trainingdata)

### 参考上课的NB例子，对每个label训练模型

In [9]:
# The six binary target columns. Their order here fixes the column order of
# everything that is built per label further down.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [10]:
from sklearn.base import clone
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Train one binary Multinomial Naive Bayes pipeline per label and keep all of
# them: text_clfs[k] is the fitted model for labels[k].
text_clfs = []
for label in labels:
    print('fit', label)
    # Each pipeline gets its own unfitted copy of the vectorizer (same
    # hyper-parameters as count_vect). Handing the single count_vect instance
    # to every pipeline would make all stored models share one mutable object
    # that is re-fitted on each iteration, so refitting it anywhere would
    # silently change the models already saved in text_clfs.
    text_clf = Pipeline([('vect', clone(count_vect)), ('clf', MultinomialNB())])
    text_clf = text_clf.fit(trainingdata, train[label])
    text_clfs.append(text_clf)  # keep the fitted model for this label

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


### 参考上课的NB例子，对每个text预测每个label的概率，存入preds

In [11]:
# Prediction matrix, initialised to zeros: one row per test comment and one
# column per label.
preds = np.zeros(shape=(test.shape[0], len(labels)))

In [12]:
# Fill preds column by column. For col == 0 the label is 'toxic', so
# preds[:, 0] holds the 'toxic' probability of every test comment.
for col in range(len(labels)):
    label = labels[col]
    print('predict', label)
    # Column 1 of predict_proba is kept as the probability written to preds.
    positive_proba = text_clfs[col].predict_proba(test['comment_text'])[:, 1]
    preds[:, col] = positive_proba

predict toxic
predict severe_toxic
predict obscene
predict threat
predict insult
predict identity_hate


In [13]:
# Inspect the raw output of the first stored model (the one fitted on
# labels[0]): a 2-D array with one row per test comment and two columns.
text_clfs[0].predict_proba(test.comment_text)  # related methods: predict, predict_proba, predict_log_proba

array([[7.95736840e-23, 1.00000000e+00],
       [9.99963515e-01, 3.64848338e-05],
       [9.61613215e-01, 3.83867849e-02],
       ...,
       [1.00000000e+00, 2.89645093e-16],
       [1.00000000e+00, 1.55357970e-20],
       [6.56736421e-02, 9.34326358e-01]])

In [14]:
# Keep only the second column (index 1) of the probability array; this is the
# slice that gets written into preds.
text_clfs[0].predict_proba(test.comment_text)[:,1]

array([1.00000000e+00, 3.64848338e-05, 3.83867849e-02, ...,
       2.89645093e-16, 1.55357970e-20, 9.34326358e-01])

In [15]:
# For comparison: predict returns a 1-D array of hard class labels instead of
# probabilities.
text_clfs[0].predict(test.comment_text)

array([1, 0, 0, ..., 0, 0, 1])

In [16]:
# Small throwaway list used by the next cells to illustrate slice syntax.
L = [
    'Michael',
    'Sarah',
    'Tracy',
    'Bob',
    'Jack',
]

In [17]:
# Slice with explicit start and stop: elements at positions 0, 1 and 2.
L[0:3]

['Michael', 'Sarah', 'Tracy']

In [18]:
# Omitting the start defaults it to 0, so this gives the same result as L[0:3].
L[:3]

['Michael', 'Sarah', 'Tracy']

In [19]:
# Omitting both bounds selects every element; this is the `:` used in
# expressions such as preds[:, i] to mean "all rows".
L[:]

['Michael', 'Sarah', 'Tracy', 'Bob', 'Jack']

### 生成所需文件

In [20]:
# Build the submission table: an `id` column followed by one probability
# column per label, then write it to submission.csv without the index.
#
# The ids are taken from `test`, the frame whose comments produced `preds`,
# and are attached positionally, so each row of probabilities is paired with
# the comment it was computed from. Taking the ids from sample_submission.csv
# instead is only correct when that file lists the ids in exactly the same
# order as test.csv.
submid = pd.DataFrame({'id': test['id'].to_numpy()})
submission = pd.concat([submid, pd.DataFrame(preds, columns=labels)], axis=1)
submission.to_csv('submission.csv', index=False)

In [21]:
# Summary statistics of the training frame. Every label column has min 0 and
# max 1, so each column mean is the fraction of comments carrying that label.
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


### 生成测试结果

In [22]:
# Load the labels for the test set and preview the first ten rows. Rows whose
# label columns are -1 are removed in the next step before evaluation.
test_labels = pd.read_csv('./data/test_labels.csv')
test_labels.head(n=10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1
5,0001ea8717f6de06,0,0,0,0,0,0
6,00024115d4cbde0f,-1,-1,-1,-1,-1,-1
7,000247e83dcc1211,0,0,0,0,0,0
8,00025358d4737918,-1,-1,-1,-1,-1,-1
9,00026d1092fe71cc,-1,-1,-1,-1,-1,-1


In [23]:
# Keep only the rows whose 'toxic' value is greater than -1, i.e. drop the
# rows marked with the -1 placeholder.
test_labels_filter = test_labels.loc[test_labels['toxic'].gt(-1)]

In [24]:
# Preview the first five rows that survived the filter.
test_labels_filter.head(n=5)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,0,0,0,0,0,0
7,000247e83dcc1211,0,0,0,0,0,0
11,0002f87b16116a7f,0,0,0,0,0,0
13,0003e1cccfd5a40a,0,0,0,0,0,0
14,00059ace3e3e9a53,0,0,0,0,0,0


In [25]:
# (rows, columns) of the filtered label table.
test_labels_filter.shape

(63978, 7)

In [26]:
# (rows, columns) of the unfiltered label table, for comparison with the
# filtered one.
test_labels.shape

(153164, 7)

In [27]:
# Restrict the test comments to the ids that remain in the filtered label
# table, then show the resulting (rows, columns).
test_filter = test.loc[test['id'].isin(test_labels_filter['id'])]
test_filter.shape

(63978, 2)

In [28]:
# Preview the first five test comments kept for evaluation.
test_filter.head(n=5)

Unnamed: 0,id,comment_text
5,0001ea8717f6de06,Thank you for understanding. I think very high...
7,000247e83dcc1211,:Dear god this site is horrible.
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig..."
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ..."
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l..."


In [29]:
from sklearn.metrics import roc_auc_score

# Evaluate every per-label model on the scored part of the test set and
# report the ROC AUC per label plus their mean.
#
# Ground truth is aligned to the evaluated comments by `id` instead of by row
# position: test_filter and test_labels_filter are filtered independently, so
# positional pairing is only right if both source files share the same row
# order. `.loc` raises KeyError if any evaluated id has no label row.
truth_by_id = test_labels_filter.set_index('id').loc[test_filter['id'].to_numpy()]

rocs = []
for idx, label in enumerate(labels):  # idx == 0 -> label == 'toxic'
    # Column 1 of predict_proba is used as the score for this label.
    pred_filter = text_clfs[idx].predict_proba(test_filter.comment_text)[:, 1]
    roc = roc_auc_score(truth_by_id[label], pred_filter)
    print(label, 'ROC AUC:', roc)
    rocs.append(roc)
print('mean column-wise ROC AUC:', np.mean(rocs))

toxic ROC AUC: 0.8970667467488319
severe_toxic ROC AUC: 0.8369947368707372
obscene ROC AUC: 0.8913601150411584
threat ROC AUC: 0.7646027595875
insult ROC AUC: 0.8780799934067208
identity_hate ROC AUC: 0.8145669039798789
mean column-wise ROC AUC: 0.8471118759391377
