In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
data = pd.read_csv("hate_speech_lower_data.csv", encoding='ISO-8859-1')
data

Unnamed: 0.1,Unnamed: 0,comment_text,hate
0,0,explanation why the edits made under my userna...,0.0
1,1,d'aww! he matches this background colour i'm s...,0.0
2,2,"hey man, i'm really not trying to edit war. it...",0.0
3,3,""" more i can't make any real suggestions on im...",0.0
4,4,"you, sir, are my hero. any chance you remember...",0.0
5,5,""" congratulations from me as well, use the to...",0.0
6,6,cocksucker before you piss around on my work,1.0
7,7,your vandalism to the matt shirvington article...,0.0
8,8,sorry if the word 'nonsense' was offensive to ...,0.0
9,9,alignment on this subject and which are contra...,0.0


In [3]:
#data['comment_text'] = data['comment_text'].str.replace(r'[0-9]+', '')

X = data['comment_text'].values
y = data['hate'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
tf = Tokenizer(num_words=10000, char_level=False, oov_token='UNK')
tf.fit_on_texts(X)

train_sequences = tf.texts_to_sequences(X_train)
test_sequences = tf.texts_to_sequences(X_test)

# Padding
train_data = pad_sequences(train_sequences, maxlen=500, padding='post')
test_data = pad_sequences(test_sequences, maxlen=500, padding='post')



# Convert to numpy array
train_data = np.array(train_data, dtype='float32')
test_data = np.array(test_data, dtype='float32')

In [14]:
# parameter
input_size = 500
vocab_size = 10000
embedding_size = 32 #word lenth


model = Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=input_size))
model.add(Conv1D(16, kernel_size=2, activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(32, kernel_size=2, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))


hist_CV = model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 32)           320000    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 499, 16)           1040      
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 249, 16)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 248, 32)           1056      
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 124, 32)           0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 3968)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                254016    
__________

In [15]:
file_path="best_weights.h5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=3)

callbacks_list = [checkpoint, early] 
hist_CV = model.fit(train_data, y_train, validation_data=(test_data, y_test),
          batch_size=64, 
          epochs=10, 
          shuffle = True,
          callbacks=callbacks_list)

Train on 161788 samples, validate on 40448 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.13839, saving model to best_weights.h5
Epoch 2/10

Epoch 00002: val_loss did not improve from 0.13839
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.13839
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.13839


In [17]:
model.save("Hate_calss_CNN.h5")

In [20]:
from sklearn.metrics import classification_report

y_true, y_pred = y_test, model.predict_classes(test_data)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97     36523
         1.0       0.69      0.73      0.71      3925

   micro avg       0.94      0.94      0.94     40448
   macro avg       0.83      0.85      0.84     40448
weighted avg       0.94      0.94      0.94     40448



In [16]:
comment = ["racism is bad!", "go back to your country!"]
comment_sequence = tf.texts_to_sequences(comment)
comment_data = pad_sequences(comment_sequence, maxlen=500, padding='post')
#print(omment_data.shape)
comment_data = np.array(comment_data)
model.predict_classes(comment_data)

array([[0],
       [1]])