In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the balanced hate-speech CSV into a DataFrame and display it.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/NLP/HateSpeechDetection/Data/HateSpeechDatasetBalanced.csv"
)
df

Unnamed: 0,Content,Label
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1
3,thought of a real badass mongol style declarat...,1
4,afro american basho,1
...,...,...
726114,i mute this telecasting and played kanye west ...,1
726115,but hell yeah he s not a bachelor but looooooo...,1
726116,great video musician but s not my musician lol...,1
726117,not great pop video yeah he s not a pedophile ...,1


### Checking for null values

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Content    0
Label      0
dtype: int64

# Work flow

- Split values into X and y and create a copy from it to be cleaned
- Clean the text using `WordNetLemmatizer` and remove stopwords to drop redundant words
- Create a one_hot_representation from it.
- Pre-pad the encoded sequences to a fixed length using `pad_sequences`
- Convert the padded sequences back into an array to form the set of independent variables.
- Split the dataset into train and validation sets.
- Train the model.
- Carry out inferences

## Splitting the dataset into dependent and independent variables

In [None]:
# Content is the text feature, Label is the target.
X, y = df['Content'], df['Label']

In [None]:
# Copy of the raw text column that the cleaning step reads from
messages = X.copy()

## Preprocessing the dataset and applying lemmatization

In [None]:
# Look at one raw message before any cleaning is applied
messages[1]

'just by being able to tweet this insufferable bullshit proves trump a nazi you vagina'

In [None]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import re

# Fetch the NLTK resources used by the cleaning step, in the same order as before.
for resource in ('stopwords', 'wordnet'):
  nltk.download(resource)

# Shared lemmatizer instance used by the cleaning code.
lematizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
corpus = []

# Build the stopword set once. The original called stopwords.words('english')
# for every single word and scanned the returned list each time.
stop_words = set(stopwords.words('english'))

for text in messages:
  # Replace every character that is not a letter with a space
  letters_only = re.sub('[^A-Za-z]', ' ', text)
  # Lower-case and split on whitespace
  tokens = letters_only.lower().split()
  # Drop stopwords, lemmatize what is left
  tokens = [lematizer.lemmatize(word) for word in tokens if word not in stop_words]

  corpus.append(' '.join(tokens))

In [None]:
# Preview only the first few cleaned messages; displaying the whole list
# floods the notebook output.
corpus[:10]

['denial normal con asked comment tragedy emotional retard',
 'able tweet insufferable bullshit prof trump nazi vagina',
 'retarded cute single life',
 'thought real badass mongol style declaration war attacker capture citizen soon',
 'afro american basho',
 'yeah retard haha',
 'ching chong chung stuff',
 'dead slut still warm tweeted slut vagina mate obama sent fucking lowlife',
 'let tweet harmless affect way faggot one n',
 'latino problem immigration enforcement stay shithole',
 'feel much secondhand embarrassment white person call ppi milo say spic like want move',
 'got gorgeous figure unfunny twat',
 'vile vagina',
 'oh shut twat',
 'fucking retard twat chop',
 'fucking faggot',
 'holy shit faggot laughing as',
 'hate man laughing as go live life sour vagina stay spectating',
 'country honest god shithole fuck irrelevant foreigner former leader murder capital',
 'use cup mongoloid',
 'oh said nun mean cool k afro american',
 'fuck vagina fuck mike ashley prick nfc',
 'heard phr

# Creating a one_hot representation

In [None]:
!pip install tensorflow



In [None]:
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# To create a one_hot representation
voc_size = 500
one_hot_repr = [one_hot(word, voc_size) for word in corpus]
one_hot_repr

# Using padding techniques (pre-padding) - Embedding Representation

In [None]:
# Every encoded sentence is brought to this fixed length.
sent_length = 20

# Zeros are added at the front of shorter sequences ('pre' padding).
embedding_docs = pad_sequences(
  sequences = one_hot_repr,
  maxlen = sent_length,
  padding = 'pre',
)
embedding_docs

array([[  0,   0,   0, ..., 484, 364, 246],
       [  0,   0,   0, ..., 410, 169,  44],
       [  0,   0,   0, ..., 423, 178, 157],
       ...,
       [  0,   0,   0, ..., 327, 177, 111],
       [355, 289, 341, ...,  72, 359, 104],
       [  0,   0,   0, ..., 379, 327, 104]], dtype=int32)

# Creating a model and an architecture

In [None]:
# Size of the learned vector for each vocabulary id.
embedding_features = 40

# Embedding -> LSTM -> Dropout -> single sigmoid unit for the binary label.
model = Sequential([
  Embedding(voc_size, embedding_features, input_length = sent_length),
  LSTM(100),
  Dropout(0.3),
  Dense(1, activation = 'sigmoid'),
])
model.compile(
  loss = 'binary_crossentropy',
  optimizer = 'adam',
  metrics = ['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 40)            20000     
                                                                 
 lstm_1 (LSTM)               (None, 100)               56400     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 76501 (298.83 KB)
Trainable params: 76501 (298.83 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Materialise features and labels as NumPy arrays for splitting and training.
X_final, y_final = np.array(embedding_docs), np.array(y)

In [None]:
from sklearn.model_selection import train_test_split

# Fixed random_state keeps the split reproducible between runs.
split_parts = train_test_split(X_final, y_final, random_state = 1234)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Train for 10 epochs; the held-out split doubles as validation data.
history = model.fit(
  X_train,
  y_train,
  epochs = 10,
  validation_data = (X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Saving the model in a pickle file

In [None]:
import pickle

# Serialise the trained model object to disk with pickle.
# NOTE(review): Keras also offers model.save(); confirm that this pickle file
# loads back correctly before relying on it.
with open("HateSpeechDetect74%.pkl", mode = 'wb') as model_file:
  pickle.dump(model, model_file)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Sigmoid outputs of the model for every row of the test split
y_pred = model.predict(X_test)



In [None]:
# Scores above 0.5 become class 1, everything else class 0.
y_pred = (y_pred > 0.5).astype(int)

In [None]:
# Share of test rows whose thresholded prediction equals the true label
accuracy_score(y_test, y_pred)

0.7418167795956592

In [None]:
# Confusion matrix of true labels against thresholded predictions on the test split
confusion_matrix(y_test, y_pred)

array([[62412, 27758],
       [19110, 72250]])

In [None]:
# Per-class precision, recall and F1 on the test split (printed so the table keeps its layout)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.69      0.73     90170
           1       0.72      0.79      0.76     91360

    accuracy                           0.74    181530
   macro avg       0.74      0.74      0.74    181530
weighted avg       0.74      0.74      0.74    181530



In [None]:
# Reads the same CSV file that df was loaded from.
# NOTE(review): `test` is not referenced by any later cell shown here - confirm it is still needed.
test = pd.read_csv("/content/drive/MyDrive/NLP/HateSpeechDetection/Data/HateSpeechDatasetBalanced.csv")
test

Unnamed: 0,Content,Label
0,denial of normal the con be asked to comment o...,1
1,just by being able to tweet this insufferable ...,1
2,that is retarded you too cute to be single tha...,1
3,thought of a real badass mongol style declarat...,1
4,afro american basho,1
...,...,...
726114,i mute this telecasting and played kanye west ...,1
726115,but hell yeah he s not a bachelor but looooooo...,1
726116,great video musician but s not my musician lol...,1
726117,not great pop video yeah he s not a pedophile ...,1


In [None]:
def predicting_pipeline(message):
  """Clean, encode and classify one message or a collection of messages.

  The text goes through the same steps as the training data: non-letters
  are replaced by spaces, the text is lower-cased, stopwords are dropped,
  the remaining words are lemmatized, hashed with one_hot and pre-padded
  to a fixed length before being passed to the trained model.

  Parameters
  ----------
  message : str or iterable of str
      A single raw message, or several raw messages.

  Returns
  -------
  numpy.ndarray
      Array of shape (n_messages, 1) holding 1 where the model score is
      above 0.5 and 0 otherwise - the same rule used when the model was
      evaluated on the test split. A single string yields shape (1, 1).
  """
  # A bare string is ONE document. The previous loop indexed it character by
  # character, so each letter was cleaned and encoded as its own message.
  messages = [message] if isinstance(message, str) else list(message)
  if not messages:
    return np.empty((0, 1), dtype = int)

  # Build the stopword set once instead of once per word.
  stop_words = set(stopwords.words('english'))

  corpus = []
  for text in messages:
    # Replace every character that is not a letter with a space
    tokens = re.sub('[^A-Za-z]', ' ', text).lower().split()
    tokens = [lematizer.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(tokens))

  # Must match the vocabulary size and sequence length used for training.
  voc_size = 500
  sent_length = 20

  one_hot_repr = [one_hot(sentence, voc_size) for sentence in corpus]
  embedding_docs = pad_sequences(one_hot_repr, padding = 'pre', maxlen = sent_length)

  # One row per message, already shaped (n_messages, sent_length); the old
  # reshape(1, -1) glued all rows into one long sequence.
  y_pred = model.predict(embedding_docs)

  # Same decision rule as the evaluation cell: score > 0.5 -> class 1.
  # The previous `np.where(y_pred > 0.6, 0, 1)` returned the opposite class.
  predictions = np.where(y_pred > 0.5, 1, 0)

  return predictions

In [None]:
# Try the pipeline on a single raw string
predicting_pipeline("Hello motherfucker")



array([[1]])