In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sn
import tensorflow as tf

In [26]:
dataset = pd.read_csv("labeled_data.csv")
dataset["labels"] = dataset["class"].map({
    0: "Hate Speech",
    1: "Offensive Language",
    2: "Neither Hate Nor Offensive"
})
data = dataset[["tweet","class"]]
data.shape

(24783, 2)

In [27]:
set_stopwords = set(stopwords.words("English"))

In [28]:
stemmer = nltk.SnowballStemmer(language = "english")

In [29]:
# DATA CLEANING
def clean_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub("https?://\S+|www.\S+",'',tweet)
    tweet = re.sub('\[.?\]','',tweet)
    tweet = re.sub("\[%s\]"%re.escape(string.punctuation),'',tweet)
    tweet = re.sub('\n','',tweet)
    tweet = re.sub('\w\d\w','',tweet)
    tweet = [word for word in tweet.split(' ') if word not in set_stopwords]
    tweet = " ".join(tweet)
    tweet = [stemmer.stem(word) for word in tweet.split(' ')]
    tweet = " ".join(tweet)
    return tweet

In [30]:
data.loc[:,"tweet"] = data.loc[:,"tweet"].apply(clean_tweet) 

In [31]:
X = np.array(data["tweet"])
Y = np.array(data["class"])
Y

array([2, 1, 1, ..., 1, 1, 2], dtype=int64)

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
cv = CountVectorizer()
X = cv.fit_transform(X)

In [34]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.25, random_state = 55)

In [35]:
for i in Y_test:
    print(i)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
2
0
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
2
1
2
1
2
1
1
1
2
1
1
1
0
0
1
1
2
1
0
1
1
1
1
1
2
1
1
1
2
1
1
2
1
1
1
1
2
2
1
1
1
2
1
1
2
1
2
1
1
2
1
1
1
1
1
1
2
1
1
1
2
1
2
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
2
1
1
0
2
1
1
1
1
1
2
1
1
1
2
1
1
2
1
2
1
1
1
2
1
1
1
1
1
2
1
1
1
1
1
2
1
2
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
2
1
2
1
2
1
1
1
2
0
1
1
1
1
1
2
1
1
1
1
1
0
1
2
2
1
0
1
1
1
1
1
1
1
1
1
1
1
1
1
0
2
1
1
1
1
1
1
1
1
1
2
1
2
1
2
1
2
1
2
2
1
1
1
0
2
1
1
2
1
1
1
1
1
1
1
1
1
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
2
1
2
1
1
2
1
2
2
1
1
1
1
1
1
1
1
1
1
0
1
2
1
1
2
1
1
2
1
1
1
1
2
1
1
1
2
2
1
1
1
1
1
2
1
1
1
1
1
1
2
1
1
1
1
2
1
1
1
1
1
1
0
1
2
1
2
1
1
1
1
1
1
2
1
1
1
2
1
1
1
1
1
1
1
2
1
2
1
1
1
1
0
2
1
2
1
2
1
1
2
1
1
1
1
1
2
1
1
2
1
1
1
2
1
1
1
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
2
1
1
1
1
1
0
1
1
1
1
2
1
2
2
2
1
1
1
1
1
1
1
1
2
2
1
1
1
1
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
0
1
1
2
0
1
1
1
2
1
2
1


In [36]:
from tensorflow.keras.utils import to_categorical

# Convert class labels to one-hot encoded format
Y_train = to_categorical(Y_train, num_classes=3)
Y_test = to_categorical(Y_test, num_classes=3)

In [37]:
for i in Y_test:
    print(i)

[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[1. 0. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[1. 0. 0.]
[1. 0. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[1. 0. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[0. 0. 1.]
[0. 1. 0.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]
[0. 1. 0.]
[0. 0. 1.]
[0. 1. 0.]

In [38]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [39]:
X_train
type(X_train)

scipy.sparse._csr.csr_matrix

In [40]:
model = Sequential([
    Dense(units = 100, activation = "relu"),
    Dense(units = 50, activation = "relu"),
    Dense(units = 10, activation = "relu"),
    Dense(units = 3, activation = 'softmax')
])

In [41]:
model

<keras.src.engine.sequential.Sequential at 0x2d55e697c40>

In [42]:
from tensorflow.keras.losses import categorical_crossentropy

In [43]:
model.compile(loss = 'categorical_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

In [44]:
model.summary

<bound method Model.summary of <keras.src.engine.sequential.Sequential object at 0x000002D55E697C40>>

In [45]:
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensor(indices, coo.data, coo.shape)
X_train = convert_sparse_matrix_to_sparse_tensor(X_train)


In [46]:
X_train = tf.sparse.reorder(X_train)
type(X_train)

tensorflow.python.framework.sparse_tensor.SparseTensor

In [47]:
model.fit(X_train,Y_train,epochs = 100, batch_size = 50)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x2d55e694dc0>

In [48]:
X_test = convert_sparse_matrix_to_sparse_tensor(X_test)

In [49]:
X_test = tf.sparse.reorder(X_test)

In [50]:
Y_pred = model.predict(X_test)



In [51]:
from sklearn.metrics import accuracy_score

In [52]:
Y_pred

array([[1.1884557e-06, 9.9999881e-01, 1.8180914e-10],
       [2.9668186e-07, 9.9999976e-01, 1.2063677e-08],
       [4.2825695e-12, 1.0000000e+00, 2.1186466e-09],
       ...,
       [1.4283438e-08, 1.0000000e+00, 7.0539095e-09],
       [1.0886817e-17, 1.0000000e+00, 1.9812627e-16],
       [1.0583282e-22, 1.0000000e+00, 1.9046385e-20]], dtype=float32)

In [53]:
Y_test

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

In [54]:
length = len(Y_pred)
output = np.zeros(length)
test = np.zeros(length)
for i in range(length):
    output[i] = np.argmax(Y_pred[i])
    test[i] = np.argmax(Y_test[i])

In [55]:
output

array([1., 1., 1., ..., 1., 1., 1.])

In [56]:
test

array([1., 1., 1., ..., 1., 1., 1.])

In [57]:
print(accuracy_score(output,test))

0.8863783085861846
