In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D,LSTM, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv("datasets/HateSpeech/HateSpeech.csv")

In [3]:
df.head()

Unnamed: 0,text,label
0,It should be legal for a husband to rape his wife,1
1,dalits are lowlife scum,1
2,dalits are lowlives,1
3,It was a better world when women dare not ques...,1
4,I don't work this hard so that those immigrant...,1


In [4]:
# Randomly shuffling the dataframe 
df = df.sample(frac = 1)
df.head()

Unnamed: 0,text,label
25362,our country requires a good scrub I will begin...,1
23614,lets talk about the antithesis of emotional s...,0
48493,Aye we all needa keep are heads up high who ca...,1
23330,No one uses 'niglets' because its fucking rude,0
53168,Lmfao RT @Luvv_55st Eating pussy RT @DanaBlack...,1


In [5]:
# Reset the index
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)
df.head()

Unnamed: 0,text,label
0,our country requires a good scrub I will begin...,1
1,lets talk about the antithesis of emotional s...,0
2,Aye we all needa keep are heads up high who ca...,1
3,No one uses 'niglets' because its fucking rude,0
4,Lmfao RT @Luvv_55st Eating pussy RT @DanaBlack...,1


In [6]:
#Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.
import re
import string
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text
# function call
df['text']=df['text'].apply(wordopt)

#### Lemmatization
##### Lemmatization is the process of reducing words to their base or root form, which can help to group together words with similar meanings and reduce the number of unique words in a dataset. 

In [7]:
import nltk
from nltk.stem import WordNetLemmatizer
# Download necessary resources for tokenization and lemmatization
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
# Create a lemmatizer object
lemmatizer = WordNetLemmatizer()
# Define a function to lemmatize a list of words
def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    return ' '.join(lemmatized_words)
# Apply the lemmatization function to the 'text' column of the DataFrame
df['text'] = df['text'].apply(lemmatize_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
# Defining dependent and independent variable as x and y
X = df["text"]
Y = df["label"]

In [9]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
max_len = 500 # Maximum length of input sequences
vocab_size = len(tokenizer.word_index) + 1 # Size of the vocabulary
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, padding='post', maxlen=max_len)
# Exporting Tokenizer
import joblib
joblib.dump(tokenizer,"models/hateSpeech/tokenizer")

['models/hateSpeech/tokenizer']

In [10]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.2, random_state=42)

### Convolutional Neural Networks (CNNs)
#### CNNs are commonly used for text classification tasks such as fake news detection. They can learn to detect patterns and features in the text by using convolutional layers and pooling layers.

In [11]:
CNN = Sequential()
CNN.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len))
CNN.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
CNN.add(GlobalMaxPooling1D())
CNN.add(Dense(units=64, activation='relu'))
CNN.add(Dropout(rate=0.2))
CNN.add(Dense(units=1, activation='sigmoid'))
# Compile the model
CNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
# Train the model
CNN.fit(X_train, y_train, epochs=5, batch_size=64, verbose=1, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x16fe32f4408>

In [13]:
# Print Accuracy and Confusion Matrix
y_pred = CNN.predict(X_test)
y_pred = np.round(y_pred)
acc_score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'Accuracy: {round(acc_score*100,2)}%')
print("Confusion Matrix: ", cm)
# Save the model
CNN.save('models/hateSpeech/CNN.h5')

Accuracy: 80.45%
Confusion Matrix:  [[3282 1191]
 [1366 7243]]


### Recurrent Neural Networks (RNNs)
#### RNNs are another popular choice for text classification tasks. They can process sequential data by using feedback loops, allowing them to capture the context and meaning of the text.

In [14]:
# Define RNN model
RNN = Sequential()
RNN.add(Embedding(5000, 128, input_length=max_len))
RNN.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
RNN.add(Dense(1, activation='sigmoid'))
# Compile the model
RNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
# Train the model
RNN.fit(X_train, y_train, epochs=5, batch_size=64, verbose=1, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x16fe33a4888>

In [16]:
# Print Accuracy and Confusion Matrix
y_pred = RNN.predict(X_test)
y_pred = np.round(y_pred)
acc_score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'Accuracy: {round(acc_score*100,2)}%')
print("Confusion Matrix: ", cm)
# Save the model
RNN.save('models/hateSpeech/RNN.h5')

Accuracy: 65.81%
Confusion Matrix:  [[   0 4473]
 [   0 8609]]


In [17]:
def manual_testing(speech):
    speech = wordopt(speech)
    speech = lemmatize_text(speech)
    speech_seq = tokenizer.texts_to_sequences([speech])
    speech_pad = pad_sequences(speech_seq, padding='post', maxlen=500)
    pred_CNN = CNN.predict(speech_pad)
    pred_RNN = RNN.predict(speech_pad)
    return print("\n\nCNN Prediction: {} \nRNN Prediction: {}".format(pred_CNN,pred_RNN))

### Test the Model With manual Output

In [18]:
speech = str(input())
manual_testing(speech)



CNN Prediction: [[0.12929419]] 
RNN Prediction: [[0.63576514]]
