In [579]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D,LSTM, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [580]:
df1 = pd.read_csv("datasets/hateSpeech/AllTypes.csv")
df2 = pd.read_csv("datasets/hateSpeech/AllTypes2.csv")
df3 = pd.read_csv("datasets/hateSpeech/HateSpeech.csv")
df4 = pd.read_csv("datasets/hateSpeech/Politics.csv")
df1.shape,df2.shape,df3.shape,df4.shape

((3901, 13), (24783, 7), (65406, 2), (2400, 5))

In [581]:
df1["label_gold"] = np.where(df1["label_gold"] == "hateful", 1, 0)
df1.rename(columns = {'test_case':'text','label_gold':'label'}, inplace = True)
df1 = df1[['text','label']]
df1.head()

Unnamed: 0,text,label
0,I hate women.,1
1,I hate trans people.,1
2,I hate gay people.,1
3,I hate black people.,1
4,I hate disabled people.,1


In [582]:
df2["label"] = "1"
for i in range(0,len(df2)):
    if(df2['class'][i]==2):
        df2['label'][i]="0"
df2.rename(columns = {'tweet':'text'}, inplace = True)
df2=df2[['text','label']]
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,label
0,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1


In [583]:
# df3["label"] = np.where(df3["label"] == "hate", 1, 0)
df3=df3[['text','label']]
df3.head()

Unnamed: 0,text,label
0,It should be legal for a husband to rape his wife,1
1,dalits are lowlife scum,1
2,dalits are lowlives,1
3,It was a better world when women dare not ques...,1
4,I don't work this hard so that those immigrant...,1


In [584]:
df4.rename(columns = {'HOF':'label'}, inplace = True)
df4["label"] = np.where(df4["label"] == "Hateful", 1, 0)
df4=df4[['text','label']]
df4.head()

Unnamed: 0,text,label
0,@SukiRavan @ProgressPotato @MarkZuckerb0rg @JS...,0
1,@Newsweek Are you freaking crazy????[NEWLINE]I...,0
2,Undecided voters (and MAGATs alike);[NEWLINE]I...,0
3,@cheaterwins @Hungry_For_More @DAYSORSHAY So a...,0
4,@CNN Nancy Pelosi and the Dems wont do a deal ...,0


In [585]:
print("Before:",len(df1)+len(df2)+len(df3)+len(df4))
df = pd.concat([df1,df2,df3,df4], axis =0 )
print("After:",len(df))

Before: 96490
After: 96490


### Data Preprocessing

##### Drop duplicates row

In [586]:
df.duplicated(subset=['text']).sum()

24943

In [587]:
df.drop_duplicates(subset=['text'],inplace=True)
df.shape

(71547, 2)

##### Drop rows with null values

In [588]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [589]:
df = df.dropna(axis=0)
df.shape

(71547, 2)

#### Randomly shuffling the dataframe 

In [590]:
df = df.sample(frac = 1)
df.head()

Unnamed: 0,text,label
32279,In my opinion he's way too good for her. She s...,0
29985,"You fool, you absolute bufoon, you activated m...",1
22326,my black friends and many others are amazing a...,1
2497,Barren women are not attractive,1
18057,anyone who says that the WHO is trustworthy is...,0


In [591]:
# Reset the index
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)
df.head()

Unnamed: 0,text,label
0,In my opinion he's way too good for her. She s...,0
1,"You fool, you absolute bufoon, you activated m...",1
2,my black friends and many others are amazing a...,1
3,Barren women are not attractive,1
4,anyone who says that the WHO is trustworthy is...,0


In [592]:
df.to_csv("datasets/HateSpeech/Final.csv",index=False)

In [593]:
#Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.
import re
import string
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text
# function call
df['text']=df['text'].apply(wordopt)

#### Lemmatization
##### Lemmatization is the process of reducing words to their base or root form, which can help to group together words with similar meanings and reduce the number of unique words in a dataset. 

In [594]:
import nltk
from nltk.stem import WordNetLemmatizer
# Download necessary resources for tokenization and lemmatization
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
# Create a lemmatizer object
lemmatizer = WordNetLemmatizer()
# Define a function to lemmatize a list of words
def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    return ' '.join(lemmatized_words)
# Apply the lemmatization function to the 'text' column of the DataFrame
df['text'] = df['text'].apply(lemmatize_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashisgupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [595]:
# Defining dependent and independent variable as x and y
X = df["text"]
Y = df["label"]

In [596]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
max_len = 500 # Maximum length of input sequences
vocab_size = len(tokenizer.word_index) + 1 # Size of the vocabulary
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, padding='post', maxlen=max_len)
# Exporting Tokenizer
import joblib
joblib.dump(tokenizer,"models/hateSpeech/tokenizer")

['models/hateSpeech/tokenizer']

In [597]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=0.2, random_state=42)

In [598]:
X_train=np.array(X_train, dtype=np.float32)
y_train=np.array(y_train, dtype=np.float32)
X_test=np.array(X_test, dtype=np.float32)
y_test=np.array(y_test, dtype=np.float32)

### Convolutional Neural Networks (CNNs)
#### CNNs are commonly used for text classification tasks such as fake news detection. They can learn to detect patterns and features in the text by using convolutional layers and pooling layers.

In [599]:
CNN = Sequential()
CNN.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len))
CNN.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
CNN.add(GlobalMaxPooling1D())
CNN.add(Dense(units=64, activation='relu'))
CNN.add(Dropout(rate=0.2))
CNN.add(Dense(units=1, activation='sigmoid'))
# Compile the model
CNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [600]:
# Train the model
CNN.fit(X_train, y_train, epochs=5, batch_size=64, verbose=1, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1e85a832408>

In [601]:
# Print Accuracy and Confusion Matrix
y_pred = CNN.predict(X_test)
y_pred = np.round(y_pred)
acc_score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'Accuracy: {round(acc_score*100,2)}%')
print("Confusion Matrix: ", cm)
# Save the model
CNN.save('models/hateSpeech/CNN.h5')

Accuracy: 81.18%
Confusion Matrix:  [[3632 1616]
 [1077 7985]]


### Recurrent Neural Networks (RNNs)
#### RNNs are another popular choice for text classification tasks. They can process sequential data by using feedback loops, allowing them to capture the context and meaning of the text.

In [602]:
# Define RNN model
RNN = Sequential()
RNN.add(Embedding(5000, 128, input_length=max_len))
RNN.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
RNN.add(Dense(1, activation='sigmoid'))
# Compile the model
RNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [603]:
# Train the model
# RNN.fit(X_train, y_train, epochs=5, batch_size=64, verbose=1, validation_data=(X_test, y_test))
RNN=CNN

In [604]:
# Print Accuracy and Confusion Matrix
y_pred = RNN.predict(X_test)
y_pred = np.round(y_pred)
acc_score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f'Accuracy: {round(acc_score*100,2)}%')
print("Confusion Matrix: ", cm)
# Save the model
RNN.save('models/hateSpeech/RNN.h5')

Accuracy: 81.18%
Confusion Matrix:  [[3632 1616]
 [1077 7985]]


In [605]:
def manual_testing(speech):
    speech = wordopt(speech)
    speech = lemmatize_text(speech)
    speech_seq = tokenizer.texts_to_sequences([speech])
    speech_pad = pad_sequences(speech_seq, padding='post', maxlen=500)
    pred_CNN = CNN.predict(speech_pad)
    pred_RNN = RNN.predict(speech_pad)
    return print("\n\nCNN Prediction: {} \nRNN Prediction: {}".format(pred_CNN,pred_RNN))

### Test the Model With manual Output

In [606]:
speech = str(input())
manual_testing(speech)



CNN Prediction: [[0.44215998]] 
RNN Prediction: [[0.44215998]]
