In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
data = pd.read_csv("hate_speech_lower_data.csv", encoding='ISO-8859-1')
data

Unnamed: 0.1,Unnamed: 0,comment_text,hate
0,0,explanation why the edits made under my userna...,0.0
1,1,d'aww! he matches this background colour i'm s...,0.0
2,2,"hey man, i'm really not trying to edit war. it...",0.0
3,3,""" more i can't make any real suggestions on im...",0.0
4,4,"you, sir, are my hero. any chance you remember...",0.0
5,5,""" congratulations from me as well, use the to...",0.0
6,6,cocksucker before you piss around on my work,1.0
7,7,your vandalism to the matt shirvington article...,0.0
8,8,sorry if the word 'nonsense' was offensive to ...,0.0
9,9,alignment on this subject and which are contra...,0.0


In [3]:
#data['comment_text'] = data['comment_text'].str.replace(r'[0-9]+', '')

X = data['comment_text'].values
y = data['hate'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
tf = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tf.fit_on_texts(X)

train_sequences = tf.texts_to_sequences(X_train)
test_sequences = tf.texts_to_sequences(X_test)

# Padding
train_data = pad_sequences(train_sequences, maxlen=1000, padding='post')
test_data = pad_sequences(test_sequences, maxlen=1000, padding='post')

# Convert to numpy array
train_data = np.array(train_data, dtype='float32')
test_data = np.array(test_data, dtype='float32')

In [5]:
# parameter
input_size = 1000
vocab_size = len(tf.word_index)
embedding_size = 69


model = Sequential()
model.add(Embedding(vocab_size+1, embedding_size, input_length=input_size))
model.add(Conv1D(16, kernel_size=2, activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(32, kernel_size=2, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))


hist_CV = model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 69)          10902     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 999, 16)           2224      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 499, 16)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 498, 32)           1056      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 249, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)  

In [6]:
file_path="char_level_best_weights.h5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="auto", min_delta=0.0001, patience=3)

callbacks_list = [checkpoint, early] 
hist_CV = model.fit(train_data, y_train, validation_data=(test_data, y_test),
          batch_size=64, 
          epochs=10, 
          shuffle = True,
          callbacks=callbacks_list)

Instructions for updating:
Use tf.cast instead.
Train on 161788 samples, validate on 40448 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.22126, saving model to char_level_best_weights.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.22126 to 0.21060, saving model to char_level_best_weights.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.21060 to 0.20762, saving model to char_level_best_weights.h5
Epoch 4/10

Epoch 00004: val_loss improved from 0.20762 to 0.19697, saving model to char_level_best_weights.h5
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.19697
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.19697
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.19697


In [7]:
model.save("Hate_calss_char_CNN.h5")

In [8]:
from sklearn.metrics import classification_report

y_true, y_pred = y_test, model.predict_classes(test_data)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.98      0.96     36523
         1.0       0.73      0.49      0.59      3925

   micro avg       0.93      0.93      0.93     40448
   macro avg       0.84      0.74      0.78     40448
weighted avg       0.93      0.93      0.93     40448



In [11]:
comment = ["racism is very bad!", 'go back to your country']
comment_sequence = tf.texts_to_sequences(comment)
comment_data = pad_sequences(comment_sequence, maxlen=1000, padding='post')
comment_data = np.array(comment_data)
model.predict_classes(comment_data)

array([[0],
       [0]])