### 加载数据

In [1]:
import pandas as pd, numpy as np

train = pd.read_csv('../Assignment1-3/data/train.csv')
test = pd.read_csv('../Assignment1-3/data/test.csv')
subm = pd.read_csv('../Assignment1-3/data/sample_submission.csv')

trainingdata = train.comment_text
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

test_labels = pd.read_csv('../Assignment1-3/data/test_labels.csv')
test_labels_filter = test_labels[test_labels['toxic']>-1]
test_filter = test[test.id.isin(test_labels_filter.id)]

### 生成TFIDF向量，查看每个类下TFIDF值Top20的词

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(stop_words='english')
X_train_tfidf = vect.fit_transform(trainingdata)
X_train_tfidf.shape

(159571, 189460)

In [3]:
print(X_train_tfidf[0].toarray().shape)

(1, 189460)


In [4]:
print(X_train_tfidf[0].toarray())

[[0. 0. 0. ... 0. 0. 0.]]


In [5]:
tf_idf_dict = {}
tf_idf_count = {}
for targetid, target in enumerate(labels):
    tf_idf_dict[target] = np.zeros((1, X_train_tfidf.shape[1]))
    tf_idf_count[target] = 0
for i in range(X_train_tfidf.shape[0]):
    for targetid, target in enumerate(labels):
        if train[target][i] == 1:
            tf_idf_dict[target] += X_train_tfidf[i].toarray()
            tf_idf_count[target] += 1

In [6]:
mapping = {}
for key, val in vect.vocabulary_.items():
    mapping[val] = key

In [7]:
tf_idf_numbers = {}
for targetid, target in enumerate(labels):
    print (target)
    tf_idf_numbers[targetid] = tf_idf_dict[target]/tf_idf_count[target]
    #print(len(tf_idf_numbers[targetid]))
    #print(len(tf_idf_numbers[targetid][0]))
    for x in np.argsort(tf_idf_numbers[targetid][0])[-10:]:
        print("\t"+mapping[x])

toxic
	wikipedia
	suck
	stupid
	bitch
	ass
	don
	like
	shit
	fucking
	fuck
severe_toxic
	faggot
	cunt
	dick
	asshole
	ass
	suck
	shit
	bitch
	fucking
	fuck
obscene
	faggot
	cunt
	dick
	asshole
	suck
	ass
	bitch
	shit
	fucking
	fuck
threat
	ll
	rape
	hope
	shit
	ass
	going
	fuck
	fucking
	kill
	die
insult
	faggot
	idiot
	stupid
	asshole
	ass
	suck
	shit
	bitch
	fucking
	fuck
identity_hate
	like
	jew
	ass
	bitch
	shit
	fucking
	nigger
	faggot
	fuck
	gay


### Logistic Regression Practice

（1）在trainingdata上训练Logistic Regression模型并预测结果

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

lrs = []
rocs = []
for targetid, target in enumerate(labels):
    lr = Pipeline([('vect', vect), ('clf', LogisticRegression())])
    lr = lr.fit(trainingdata, train[target])
    #lr = LogisticRegression().fit(X_train_tfidf, train[target])
    lrs.append(lr)
    #pred_training = lr.predict_proba(X_train_tfidf)[:,1]
    pred_training = lr.predict_proba(trainingdata)[:,1]
    roc = roc_auc_score(train[target], pred_training)
    print(target, 'ROC AUC:', roc)
    rocs.append(roc)
print('mean column-wise ROC AUC:', np.mean(rocs))

toxic ROC AUC: 0.9853635693332268
severe_toxic ROC AUC: 0.992690155863523
obscene ROC AUC: 0.993793914320817
threat ROC AUC: 0.9955089227434588
insult ROC AUC: 0.9882714531617584
identity_hate ROC AUC: 0.9907858102863504
mean column-wise ROC AUC: 0.9910689709515225


（2）在test_filter上计算ROC AUC指标

In [9]:
for targetid, target in enumerate(labels):
    X_test_tfidf = vect.transform(test_filter.comment_text)
    pred_testing = lr.predict_proba(X_test_tfidf)[:,1]
    roc = roc_auc_score(test_labels_filter[target], pred_testing)
    print(target, 'ROC AUC:', roc)
    rocs.append(roc)
print('mean column-wise ROC AUC:', np.mean(rocs))

toxic ROC AUC: 0.9005865638510603
severe_toxic ROC AUC: 0.9750375847544405
obscene ROC AUC: 0.9132678872099901
threat ROC AUC: 0.9390919042720474
insult ROC AUC: 0.9109509279959951
identity_hate ROC AUC: 0.9792107370272192
mean column-wise ROC AUC: 0.9637132862096275


（3）分析模型属于High Variance还是High Bias，抑或是二者都有？

> High Bias: Training Error and Testing Error are both large

> High Variance: The difference between Training Error and Testing Error is large

### K-Fold Practice

对trainingdata做5折交叉验证，计算training的平均ROC AUC指标、dev的平均ROC AUC指标

In [16]:
from sklearn.model_selection import KFold

k_fold = KFold(n_splits=5)

rocs_train_folds = []
rocs_dev_folds = []
for train_indices, dev_indices in k_fold.split(train):
    X_train = [trainingdata[i] for i in train_indices]
    X_dev = [trainingdata[i] for i in dev_indices]
    for label_id, label in enumerate(labels):
        rocs_train = []
        rocs_dev = []
        Y_train = [train[label][i] for i in train_indices]
        Y_dev = [train[label][i] for i in dev_indices]
        #print(len(X_train), len(Y_train), len(X_test), len(Y_test))
        lr.fit(X_train, Y_train)  
        Y_train_predicted = lr.predict_proba(X_train)[:,1]
        roc = roc_auc_score(Y_train, Y_train_predicted)
        rocs_train.append(roc)
        Y_dev_predicted = lr.predict_proba(X_dev)[:,1]
        roc = roc_auc_score(Y_dev, Y_dev_predicted)
        rocs_dev.append(roc)
    rocs_train_fold = np.mean(rocs_train)
    rocs_train_folds.append(rocs_train_fold)
    print('Train Mean Column-wise ROC AUC:', rocs_train_fold)
    rocs_dev_fold = np.mean(rocs_dev)
    rocs_dev_folds.append(rocs_dev_fold)
    print('Dev Mean Column-wise ROC AUC:', rocs_dev_fold)
print('5-Fold Train Mean Column-wise ROC AUC:', np.mean(rocs_train_folds))
print('5-Fold Dev Mean Column-wise ROC AUC:', np.mean(rocs_dev_folds))

Train Mean Column-wise ROC AUC: 0.9908182751983459
Dev Mean Column-wise ROC AUC: 0.9764494301108031
Train Mean Column-wise ROC AUC: 0.9913464681077744
Dev Mean Column-wise ROC AUC: 0.9732581142104844
Train Mean Column-wise ROC AUC: 0.9911611490342116
Dev Mean Column-wise ROC AUC: 0.9713320292875582
Train Mean Column-wise ROC AUC: 0.990984704961287
Dev Mean Column-wise ROC AUC: 0.9728900755004672
Train Mean Column-wise ROC AUC: 0.9906165421689256
Dev Mean Column-wise ROC AUC: 0.9777218980483355
5-Fold Train Mean Column-wise ROC AUC: 0.990985427894109
5-Fold Dev Mean Column-wise ROC AUC: 0.9743303094315298


### Grid Search Practice

（1）对vect\__max_df，vect\__max_features，vect\__ngram_range，clf\__C在trainingdata上进行网格搜索调参。

（2）在gridsearch时使用5折交叉验证

In [17]:
from sklearn.model_selection import GridSearchCV
Cs = np.logspace(-3, -1, 2)
parameters = {
    'vect__max_df': (0.5, 0.75),
    'vect__max_features': (None, 20000),
    'vect__ngram_range': ((1, 2), (1, 3)),  # 1-gram or 2-grams or 3-grams
    'clf__C': tuple(Cs)
}

In [18]:
k_fold = KFold(n_splits=50)

Y_devs = []
for train_indices, dev_indices in k_fold.split(train):
    X_dev = [trainingdata[i] for i in dev_indices]
    for label_id, label in enumerate(labels):
        Y_dev = [train[label][i] for i in dev_indices]
        Y_devs.append(Y_dev)
    break

In [20]:
from time import time

grid_search = GridSearchCV(lr, parameters, n_jobs=-1, verbose=1, cv=5, scoring='roc_auc')

best_rocs = []
best_models = []
for label_id, label in enumerate(labels):
    print()
    print("Performing grid search for", label, " ...")
    print("parameters:")
    print(parameters)
    t0 = time()
    #grid_search.fit(trainingdata, train[label])
    grid_search.fit(X_dev, Y_devs[label_id])
    print("done in %0.3fs" % (time() - t0))
    print()
    print(label, "Best ROC AUC: %0.3f" % grid_search.best_score_)
    best_rocs.append(grid_search.best_score_)
    print(label, "Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    best_models.append(grid_search.best_estimator_)
    for param_name in sorted(parameters.keys()): 
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search for toxic  ...
parameters:
{'vect__max_df': (0.5, 0.75), 'vect__max_features': (None, 20000), 'vect__ngram_range': ((1, 2), (1, 3)), 'clf__C': (0.001, 0.1)}
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   47.5s finished


done in 49.107s

toxic Best ROC AUC: 0.927
toxic Best parameters set:
	clf__C: 0.1
	vect__max_df: 0.5
	vect__max_features: 20000
	vect__ngram_range: (1, 3)

Performing grid search for severe_toxic  ...
parameters:
{'vect__max_df': (0.5, 0.75), 'vect__max_features': (None, 20000), 'vect__ngram_range': ((1, 2), (1, 3)), 'clf__C': (0.001, 0.1)}
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   47.4s finished


done in 48.299s

severe_toxic Best ROC AUC: 0.977
severe_toxic Best parameters set:
	clf__C: 0.1
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 2)

Performing grid search for obscene  ...
parameters:
{'vect__max_df': (0.5, 0.75), 'vect__max_features': (None, 20000), 'vect__ngram_range': ((1, 2), (1, 3)), 'clf__C': (0.001, 0.1)}
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   53.9s finished


done in 55.323s

obscene Best ROC AUC: 0.966
obscene Best parameters set:
	clf__C: 0.1
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 3)

Performing grid search for threat  ...
parameters:
{'vect__max_df': (0.5, 0.75), 'vect__max_features': (None, 20000), 'vect__ngram_range': ((1, 2), (1, 3)), 'clf__C': (0.001, 0.1)}
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   50.2s finished


done in 51.472s

threat Best ROC AUC: 0.980
threat Best parameters set:
	clf__C: 0.1
	vect__max_df: 0.5
	vect__max_features: 20000
	vect__ngram_range: (1, 3)

Performing grid search for insult  ...
parameters:
{'vect__max_df': (0.5, 0.75), 'vect__max_features': (None, 20000), 'vect__ngram_range': ((1, 2), (1, 3)), 'clf__C': (0.001, 0.1)}
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   50.9s finished


done in 52.318s

insult Best ROC AUC: 0.948
insult Best parameters set:
	clf__C: 0.1
	vect__max_df: 0.5
	vect__max_features: 20000
	vect__ngram_range: (1, 3)

Performing grid search for identity_hate  ...
parameters:
{'vect__max_df': (0.5, 0.75), 'vect__max_features': (None, 20000), 'vect__ngram_range': ((1, 2), (1, 3)), 'clf__C': (0.001, 0.1)}
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   44.2s finished


done in 45.190s

identity_hate Best ROC AUC: 0.921
identity_hate Best parameters set:
	clf__C: 0.1
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 2)


（3）使用最优参数得到的模型，在test_filter上生成测试结果

In [23]:
rocs = []
for labelid, label in enumerate(labels): 
    pred_filter = best_models[labelid].predict_proba(test_filter.comment_text)[:,1]
    roc = roc_auc_score(test_labels_filter[label], pred_filter)
    print(label, 'ROC AUC:', roc)
    rocs.append(roc)
print('mean column-wise ROC AUC:', np.mean(rocs))

toxic ROC AUC: 0.9202435513887415
severe_toxic ROC AUC: 0.9712177905925735
obscene ROC AUC: 0.9406179374530438
threat ROC AUC: 0.9447219613288513
insult ROC AUC: 0.9253114732382458
identity_hate ROC AUC: 0.9263522781642128
mean column-wise ROC AUC: 0.9380774986942781
