In [2]:
import os
import math
import jieba
import random
import warnings
import re
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Read the labelled training set and the test set from the local DATA folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./DATA/training.csv', './DATA/test.csv')
)

In [4]:
# Preview the first five rows of the test frame.
test.head(5)

Unnamed: 0,id,comment_text
0,00054a5e18b50dd4,bbq \n\nbe a man and lets discuss it-maybe ove...
1,00078f8ce7eb276d,"\n\nJuelz Santanas Age\n\nIn 2002, Juelz Santa..."
2,000897889268bc93,REDIRECT Talk:Voydan Pop Georgiev- Chernodrinski
3,0009801bd85e5806,The Mitsurugi point made no sense - why not ar...
4,000c6a3f0cd3ba8e,\n\n The Signpost: 24 September 2012 \n\n Read...


In [5]:
# The six label columns, each fitted with its own classifier in the cells below:
# class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
# Take the raw comment column from both frames, then stack them (training rows
# first) into one corpus that the vectorizers are fitted on.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat(objs=[train_text, test_text])

In [5]:
# Word-level TF-IDF over single tokens. The vocabulary and IDF weights are
# learned from the combined train+test corpus; `fit` returns the fitted
# vectorizer, so construction and fitting are chained.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    strip_accents='unicode',
    sublinear_tf=True,
    min_df=50,
    max_df=0.5,
    max_features=20000,
).fit(all_text)

# Project each split onto the learned vocabulary.
train_word_tezhen, test_word_tezhen = map(
    word_vectorizer.transform, (train_text, test_text)
)

In [7]:
# Character-level TF-IDF over 1- to 4-character n-grams, fitted on the same
# combined corpus as the word-level vectorizer.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
    strip_accents='unicode',
    sublinear_tf=True,
    min_df=10,
    max_df=0.5,
    max_features=20000,
).fit(all_text)

# Project each split onto the learned character n-gram vocabulary.
train_char_tezhen, test_char_tezhen = map(
    char_vectorizer.transform, (train_text, test_text)
)

In [8]:
# Join the word-level and character-level matrices column-wise, so every row
# carries both feature families (word columns first, then char columns).
train_tezhen, test_tezhen = (
    hstack(feature_pair)
    for feature_pair in (
        (train_word_tezhen, train_char_tezhen),
        (test_word_tezhen, test_char_tezhen),
    )
)

In [15]:
# Show the repr of the stacked training matrix (shape, dtype and sparse storage format).
train_tezhen

<127657x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 85683626 stored elements in COOrdinate format>

In [9]:
# Imports used by this and the following per-label cells.
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, auc

# --- 'toxic' label -----------------------------------------------------------
# Hold out 30% of the training rows as a validation split (fixed seed).
train_target = train['toxic']
X_train, X_test, y_train, y_test = train_test_split(train_tezhen, train_target, test_size=0.3, random_state=112)

# The 'sag' solver shuffles the data while optimising; pinning random_state
# makes the fitted model (and the scores printed below) reproducible on re-run.
classifier = LogisticRegression(C=3.0, solver='sag', random_state=112)
classifier.fit(X_train, y_train)
train_pred_toxic = classifier.predict(X_train)
test_pred_toxic = classifier.predict(X_test)

# Class-label predictions for the real test set, kept as a one-column frame
# so they can be concatenated into the final result later.
predict_toxic_0 = classifier.predict(test_tezhen)
predict_toxic = pd.DataFrame(predict_toxic_0, columns=['toxic'])

# f1_score expects (y_true, y_pred) in that order.
print('toxic-训练集F1值:%s'%f1_score(y_train, train_pred_toxic))
print('toxic-验证集F1值:%s'%f1_score(y_test, test_pred_toxic))

toxic-训练集F1值:0.885279187817259
toxic-验证集F1值:0.7766901193025391


In [13]:

# --- 'severe_toxic' label ----------------------------------------------------
# Hold out 30% of the training rows as a validation split (fixed seed).
train_target = train['severe_toxic']
X_train, X_test, y_train, y_test = train_test_split(train_tezhen, train_target, test_size=0.3, random_state=112)

# The 'sag' solver shuffles the data while optimising; pinning random_state
# makes the fitted model (and the scores printed below) reproducible on re-run.
classifier = LogisticRegression(C=3.0, solver='sag', random_state=112)
classifier.fit(X_train, y_train)
train_pred_severe_toxic = classifier.predict(X_train)
test_pred_severe_toxic = classifier.predict(X_test)

# Class-label predictions for the real test set, kept as a one-column frame
# so they can be concatenated into the final result later.
predict_severe_toxic_0 = classifier.predict(test_tezhen)
predict_severe_toxic = pd.DataFrame(predict_severe_toxic_0, columns=['severe_toxic'])

# f1_score expects (y_true, y_pred) in that order.
print('severe_toxic-训练集F1值:%s'%f1_score(y_train, train_pred_severe_toxic))
print('severe_toxic-验证集F1值:%s'%f1_score(y_test, test_pred_severe_toxic))

severe_toxic-训练集F1值:0.6534653465346535
severe_toxic-验证集F1值:0.36303630363036304


In [14]:

# --- 'obscene' label ---------------------------------------------------------
# Hold out 30% of the training rows as a validation split (fixed seed).
train_target = train['obscene']
X_train, X_test, y_train, y_test = train_test_split(train_tezhen, train_target, test_size=0.3, random_state=112)

# The 'sag' solver shuffles the data while optimising; pinning random_state
# makes the fitted model (and the scores printed below) reproducible on re-run.
classifier = LogisticRegression(C=3.0, solver='sag', random_state=112)
classifier.fit(X_train, y_train)
train_pred_obscene = classifier.predict(X_train)
test_pred_obscene = classifier.predict(X_test)

# Class-label predictions for the real test set, kept as a one-column frame
# so they can be concatenated into the final result later.
predict_obscene_0 = classifier.predict(test_tezhen)
predict_obscene = pd.DataFrame(predict_obscene_0, columns=['obscene'])

# f1_score expects (y_true, y_pred) in that order.
print('obscene-训练集F1值:%s'%f1_score(y_train, train_pred_obscene))
print('obscene-验证集F1值:%s'%f1_score(y_test, test_pred_obscene))

obscene-训练集F1值:0.8890395480225989
obscene-验证集F1值:0.7997799779977998


In [15]:

# --- 'threat' label ----------------------------------------------------------
# Hold out 30% of the training rows as a validation split (fixed seed).
train_target = train['threat']
X_train, X_test, y_train, y_test = train_test_split(train_tezhen, train_target, test_size=0.3, random_state=112)

# The 'sag' solver shuffles the data while optimising; pinning random_state
# makes the fitted model (and the scores printed below) reproducible on re-run.
classifier = LogisticRegression(C=3.0, solver='sag', random_state=112)
classifier.fit(X_train, y_train)
train_pred_threat = classifier.predict(X_train)
test_pred_threat = classifier.predict(X_test)

# Class-label predictions for the real test set, kept as a one-column frame
# so they can be concatenated into the final result later.
predict_threat_0 = classifier.predict(test_tezhen)
predict_threat = pd.DataFrame(predict_threat_0, columns=['threat'])

# f1_score expects (y_true, y_pred) in that order.
print('threat-训练集F1值:%s'%f1_score(y_train, train_pred_threat))
print('threat-验证集F1值:%s'%f1_score(y_test, test_pred_threat))

threat-训练集F1值:0.6553398058252426
threat-验证集F1值:0.34355828220858897


In [16]:

# --- 'insult' label ----------------------------------------------------------
# Hold out 30% of the training rows as a validation split (fixed seed).
train_target = train['insult']
X_train, X_test, y_train, y_test = train_test_split(train_tezhen, train_target, test_size=0.3, random_state=112)

# The 'sag' solver shuffles the data while optimising; pinning random_state
# makes the fitted model (and the scores printed below) reproducible on re-run.
classifier = LogisticRegression(C=3.0, solver='sag', random_state=112)
classifier.fit(X_train, y_train)
train_pred_insult = classifier.predict(X_train)
test_pred_insult = classifier.predict(X_test)

# Class-label predictions for the real test set, kept as a one-column frame
# so they can be concatenated into the final result later.
predict_insult_0 = classifier.predict(test_tezhen)
predict_insult = pd.DataFrame(predict_insult_0, columns=['insult'])

# f1_score expects (y_true, y_pred) in that order.
print('insult-训练集F1值:%s'%f1_score(y_train, train_pred_insult))
print('insult-验证集F1值:%s'%f1_score(y_test, test_pred_insult))

insult-训练集F1值:0.834514484645033
insult-验证集F1值:0.7006331022007838


In [17]:

# --- 'identity_hate' label ---------------------------------------------------
# Hold out 30% of the training rows as a validation split (fixed seed).
train_target = train['identity_hate']
X_train, X_test, y_train, y_test = train_test_split(train_tezhen, train_target, test_size=0.3, random_state=112)

# The 'sag' solver shuffles the data while optimising; pinning random_state
# makes the fitted model (and the scores printed below) reproducible on re-run.
classifier = LogisticRegression(C=3.0, solver='sag', random_state=112)
classifier.fit(X_train, y_train)
train_pred_identity_hate = classifier.predict(X_train)
test_pred_identity_hate = classifier.predict(X_test)

# Class-label predictions for the real test set, kept as a one-column frame
# so they can be concatenated into the final result later.
predict_identity_hate_0 = classifier.predict(test_tezhen)
predict_identity_hate = pd.DataFrame(predict_identity_hate_0, columns=['identity_hate'])

# f1_score expects (y_true, y_pred) in that order.
print('identity_hate-训练集F1值:%s'%f1_score(y_train, train_pred_identity_hate))
print('identity_hate-验证集F1值:%s'%f1_score(y_test, test_pred_identity_hate))

identity_hate-训练集F1值:0.6769480519480519
identity_hate-验证集F1值:0.40579710144927533


In [18]:
# Build the result frame: the test ids followed by one prediction column per
# label, placed side by side (column-wise concatenation).
final = pd.concat(
    objs=[
        test[['id']],
        predict_toxic,
        predict_severe_toxic,
        predict_obscene,
        predict_threat,
        predict_insult,
        predict_identity_hate,
    ],
    axis='columns',
)


In [19]:
# Write the predictions to disk. index=False keeps the positional row index
# out of the file; otherwise it is saved as an extra unnamed first column.
final.to_csv('xx_result.csv', index=False)