In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data_raw = pd.read_csv('train.csv')
#data_raw = data_raw.loc[np.random.choice(data_raw.index, size=2000)]
data = data_raw
data = data_raw.loc[np.random.choice(data_raw.index, size=2000)]
data.shape


(2000, 8)

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [4]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

In [5]:
data['comment_text'] = data['comment_text'].str.lower()
data['comment_text'] = data['comment_text'].apply(cleanHtml)
data['comment_text'] = data['comment_text'].apply(cleanPunc)
data['comment_text'] = data['comment_text'].apply(keepAlpha)
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
155524,bf4af9ec01b6b1aa,gary indiana u s died los angeles california...,0,0,0,0,0,0
127627,aa9f04bd11e52714,it seems that according to sywlia being polish...,0,0,0,0,0,0
5968,0fed0a3ab2ad5942,tell these certain editors to stop removing va...,0,0,0,0,0,0
150260,69326b995f3f2ada,bargo rural fire brigade,0,0,0,0,0,0
109581,4a30483c5c1ecec3,there now will you d,0,0,0,0,0,0


In [6]:
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

data['comment_text'] = data['comment_text'].apply(removeStopWords)
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
155524,bf4af9ec01b6b1aa,gary indiana u died los angeles california u s,0,0,0,0,0,0
127627,aa9f04bd11e52714,seems according sywlia polish choice ...,0,0,0,0,0,0
5968,0fed0a3ab2ad5942,tell certain editors stop removing valid mat...,0,0,0,0,0,0
150260,69326b995f3f2ada,bargo rural fire brigade,0,0,0,0,0,0
109581,4a30483c5c1ecec3,d,0,0,0,0,0,0


In [7]:
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

data['comment_text'] = data['comment_text'].apply(stemming)
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
155524,bf4af9ec01b6b1aa,gari indiana u die los angel california u s,0,0,0,0,0,0
127627,aa9f04bd11e52714,seem accord sywlia polish choic life ethnic po...,0,0,0,0,0,0
5968,0fed0a3ab2ad5942,tell certain editor stop remov valid materi ca...,0,0,0,0,0,0
150260,69326b995f3f2ada,bargo rural fire brigad,0,0,0,0,0,0
109581,4a30483c5c1ecec3,d,0,0,0,0,0,0


In [8]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)
print(train.shape)
print(test.shape)

(1400, 8)
(600, 8)


In [9]:
train_text = train['comment_text']
test_text = test['comment_text']

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [63]:
x_train = vectorizer.transform(train_text).toarray()
y_train = train.drop(labels = ['id','comment_text'], axis=1).values

x_test = vectorizer.transform(test_text).toarray()
y_test = test.drop(labels = ['id','comment_text'], axis=1).values

In [66]:
import os
os.environ['CUDA_VISIBLE_DIVICES'] = '-1'
import keras
from keras.models import Sequential
from keras.layers import Dense

In [67]:
from bpmll.bpmll import bp_mll_loss

In [68]:
from keras.backend.tensorflow_backend import clear_session

In [99]:
clear_session()


In [100]:
model = Sequential()
model.add(Dense(512,activation='relu',input_shape=(x_train.shape[1],),kernel_initializer='glorot_uniform'))
model.add(Dense(256,activation='relu',kernel_initializer='glorot_uniform'))
model.add(Dense(y_train.shape[1],activation='sigmoid',kernel_initializer='glorot_uniform'))
model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['acc'])

In [101]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               22503936  
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 1542      
Total params: 22,636,806
Trainable params: 22,636,806
Non-trainable params: 0
_________________________________________________________________


In [103]:
model.fit(x=x_train,y=y_train,epochs=10,validation_data=(x_test,y_test))

Train on 1400 samples, validate on 600 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
 192/1400 [===>..........................] - ETA: 21s - loss: 0.0222 - acc: 0.9913

KeyboardInterrupt: 

In [111]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# initialize classifier chains multi-label classifier
classifier = ClassifierChain(SVC())

# Training logistic regression model on train data
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.9




In [114]:
predictions.toarray()

array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])

In [126]:
np.array([list(np.round(x).astype(int)) for x in model.predict(x_test)]).shape

(600, 6)

In [125]:
y_test.shape

(600, 6)

In [106]:
from sklearn.metrics import accuracy_score

In [127]:
accuracy_score(np.array([list(np.round(x).astype(int)) for x in model.predict(x_test)]),y_test)

0.91000000000000003

In [108]:
y_train

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ..., 
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [62]:
x_train.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])