In [83]:
import pandas as pd
import numpy as np
import spacy
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from tensorflow import keras


In [84]:
df = pd.read_csv('labeled_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [85]:
df.shape

(24783, 7)

In [86]:
df.isnull().sum()

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

In [87]:
df.columns

Index(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither',
       'class', 'tweet'],
      dtype='object')

In [88]:
#delete unwanted columns
df.drop(columns = ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'],inplace= True)

0 - hate speech
1 - offensive language
2 - neither

In [89]:
df.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [90]:
df.isnull().sum()

class    0
tweet    0
dtype: int64

In [91]:
df['tweet'].iloc[0]

"!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out..."

In [92]:
df['tweet'].iloc[1000]

'&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;"@betysweetcocker: That pussy is just....&#128561; imma assume she just had a baby like..the day before"'

In [93]:
#deleting unwanted sybols and numeric data
df['processed_tweet'] = df['tweet'].str.replace(r'[^a-zA-Z]', ' ',regex = True)

In [94]:
df.head()

Unnamed: 0,class,tweet,processed_tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldn t...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew boy dats cold tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT C G Anderson viva based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you...


In [95]:
df['processed_tweet'].iloc[1000]

'                                                                 betysweetcocker  That pussy is just              imma assume she just had a baby like  the day before '

In [96]:
#handling unwanted space
df['processed_tweet_2'] = df['processed_tweet'].str.replace(r'[\s]+', ' ',regex = True)

In [97]:
df['processed_tweet_2'].iloc[1000]

' betysweetcocker That pussy is just imma assume she just had a baby like the day before '

In [98]:
df.columns

Index(['class', 'tweet', 'processed_tweet', 'processed_tweet_2'], dtype='object')

In [99]:
#deleting unwanted columns
df.drop(columns=['tweet', 'processed_tweet'],inplace = True)

In [100]:
df.head()

Unnamed: 0,class,processed_tweet_2
0,2,RT mayasolovely As a woman you shouldn t comp...
1,1,RT mleew boy dats cold tyga dwn bad for cuffi...
2,1,RT UrKindOfBrand Dawg RT sbaby life You ever ...
3,1,RT C G Anderson viva based she look like a tr...
4,1,RT ShenikaRoberts The shit you hear about me ...


In [101]:
#NLP
nlp = spacy.load('en_core_web_sm')


In [102]:
# 6. LEMMATIZATION
def lemmatization(text):
  doc = nlp(text)
  lemmaList = [word.lemma_ for word in doc]
  return ' '.join(lemmaList)

In [103]:
df['lemma_tweet']  = df['processed_tweet_2'].apply(lemmatization)

In [104]:
df.head()

Unnamed: 0,class,processed_tweet_2,lemma_tweet
0,2,RT mayasolovely As a woman you shouldn t comp...,RT mayasolovely as a woman you shouldn t com...
1,1,RT mleew boy dats cold tyga dwn bad for cuffi...,RT mleew boy dat cold tyga dwn bad for cuffi...
2,1,RT UrKindOfBrand Dawg RT sbaby life You ever ...,RT UrKindOfBrand Dawg RT sbaby life you ever...
3,1,RT C G Anderson viva based she look like a tr...,RT C G Anderson viva base she look like a tr...
4,1,RT ShenikaRoberts The shit you hear about me ...,RT ShenikaRoberts the shit you hear about I ...


In [105]:
df['lemma_tweet'].iloc[1000]

'  betysweetcocker that pussy be just imma assume she just have a baby like the day before'

In [106]:
#removing stopwords
def remove_stopwords(text):
  doc = nlp(text)
  no_stopwords_list = [word.text for word in doc if not word.is_stop]
  return ' '.join(no_stopwords_list)



In [107]:
df['final_tweet'] = df['lemma_tweet'].apply(remove_stopwords)

In [108]:
df.head()

Unnamed: 0,class,processed_tweet_2,lemma_tweet,final_tweet
0,2,RT mayasolovely As a woman you shouldn t comp...,RT mayasolovely as a woman you shouldn t com...,RT mayasolovely woman shouldn t complain cl...
1,1,RT mleew boy dats cold tyga dwn bad for cuffi...,RT mleew boy dat cold tyga dwn bad for cuffi...,RT mleew boy dat cold tyga dwn bad cuffin d...
2,1,RT UrKindOfBrand Dawg RT sbaby life You ever ...,RT UrKindOfBrand Dawg RT sbaby life you ever...,RT UrKindOfBrand Dawg RT sbaby life fuck bi...
3,1,RT C G Anderson viva based she look like a tr...,RT C G Anderson viva base she look like a tr...,RT C G Anderson viva base look like tranny
4,1,RT ShenikaRoberts The shit you hear about me ...,RT ShenikaRoberts the shit you hear about I ...,RT ShenikaRoberts shit hear true faker bitc...


In [109]:
df['lemma_tweet'].iloc[1000]

'  betysweetcocker that pussy be just imma assume she just have a baby like the day before'

In [110]:
df['final_tweet'].iloc[1000]

'   betysweetcocker pussy imma assume baby like day'

In [111]:
df.head()

Unnamed: 0,class,processed_tweet_2,lemma_tweet,final_tweet
0,2,RT mayasolovely As a woman you shouldn t comp...,RT mayasolovely as a woman you shouldn t com...,RT mayasolovely woman shouldn t complain cl...
1,1,RT mleew boy dats cold tyga dwn bad for cuffi...,RT mleew boy dat cold tyga dwn bad for cuffi...,RT mleew boy dat cold tyga dwn bad cuffin d...
2,1,RT UrKindOfBrand Dawg RT sbaby life You ever ...,RT UrKindOfBrand Dawg RT sbaby life you ever...,RT UrKindOfBrand Dawg RT sbaby life fuck bi...
3,1,RT C G Anderson viva based she look like a tr...,RT C G Anderson viva base she look like a tr...,RT C G Anderson viva base look like tranny
4,1,RT ShenikaRoberts The shit you hear about me ...,RT ShenikaRoberts the shit you hear about I ...,RT ShenikaRoberts shit hear true faker bitc...


In [112]:
#one hot represtaniton
vocab_size = 10000
one_hot_rep = [one_hot(words,vocab_size) for words in df['final_tweet']]

In [113]:
for i in range(0,4):
    print(df['final_tweet'].iloc[i])

   RT mayasolovely woman shouldn t complain clean house amp man trash
   RT mleew boy dat cold tyga dwn bad cuffin dat hoe st place
   RT UrKindOfBrand Dawg RT sbaby life fuck bitch start cry confuse shit
   RT C G Anderson viva base look like tranny


In [114]:
for i in range(0,4):
    print(one_hot_rep[i])

[9323, 1265, 349, 2953, 1518, 8471, 178, 5167, 6599, 1664, 7232]
[9323, 6157, 8854, 621, 9146, 1729, 3702, 6729, 5033, 621, 9880, 942, 8342]
[9323, 2239, 249, 9323, 9859, 1690, 5521, 2921, 8228, 3340, 7488, 7629]
[9323, 3492, 6657, 501, 1211, 4809, 4143, 788, 8835]


In [126]:
#padding
sentence_length = 20
padded = pad_sequences(one_hot_rep, padding='pre', maxlen = sentence_length)
     

In [136]:
X.shape

(42543, 20)

In [127]:
for i in range(0,4):
    print(padded[i])

[   0    0    0    0    0    0    0    0    0 9323 1265  349 2953 1518
 8471  178 5167 6599 1664 7232]
[   0    0    0    0    0    0    0 9323 6157 8854  621 9146 1729 3702
 6729 5033  621 9880  942 8342]
[   0    0    0    0    0    0    0    0 9323 2239  249 9323 9859 1690
 5521 2921 8228 3340 7488 7629]
[   0    0    0    0    0    0    0    0    0    0    0 9323 3492 6657
  501 1211 4809 4143  788 8835]


Making model


In [128]:
X = np.array(padded)
y = np.array(df['class'])

In [129]:
df['class'].value_counts()

class
1    19190
2     4163
0     1430
Name: count, dtype: int64

In [130]:
smote = SMOTE(sampling_strategy = 'minority')
X,y = smote.fit_resample(X,y)

In [131]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [132]:
X.shape,X_train.shape,X_test.shape

((42543, 20), (34034, 20), (8509, 20))

In [137]:
sentence_length = X.shape[1]


In [138]:
dimension = 50

model = keras.Sequential([
    #embedding layer
    keras.layers.Embedding(vocab_size, dimension, input_length = sentence_length),
    # LSTM layer
    keras.layers.LSTM(100,return_sequences=True),
    keras.layers.LSTM(50, return_sequences=True),
    keras.layers.LSTM(50),
    #output layer
    keras.layers.Dense(3, activation = 'softmax')
])





In [139]:
#compilation
model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy'])

In [142]:
model.fit(X_train, y_train, epochs = 10, batch_size=32)


Epoch 1/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 14ms/step - accuracy: 0.8710 - loss: 0.3418
Epoch 2/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - accuracy: 0.9430 - loss: 0.1730
Epoch 3/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 14ms/step - accuracy: 0.9601 - loss: 0.1255
Epoch 4/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 14ms/step - accuracy: 0.9706 - loss: 0.0965
Epoch 5/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - accuracy: 0.9769 - loss: 0.0757
Epoch 6/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - accuracy: 0.9823 - loss: 0.0550
Epoch 7/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - accuracy: 0.9859 - loss: 0.0411
Epoch 8/10
[1m1064/1064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - accuracy: 0.9904 - loss: 0.0303
Epoch 9/

<keras.src.callbacks.history.History at 0x23d97a74bd0>

In [143]:

loss, accuracy = model.evaluate(X_test, y_test)
print(f'Model Accuracy : {accuracy * 100}')

[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8949 - loss: 0.6096
Model Accuracy : 89.49347734451294


In [145]:
pred = np.argmax(model.predict(X_test), axis = -1)


[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [146]:
y_test[:5]

array([1, 1, 0, 1, 2], dtype=int64)

In [147]:
pred[:5]

array([1, 1, 0, 1, 2], dtype=int64)

In [151]:
df['final_tweet'][0]

'   RT mayasolovely woman shouldn t complain clean house amp man trash'

Predcation function


In [149]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Label mapping
label_map = {
    0: "Hate Speech",
    1: "Offensive Language",
    2: "Neither"
}

def predict_text(model, text, vocab_size, sentence_length):
    # Step 1: One-hot encode
    encoded = one_hot(text, vocab_size)

    # Step 2: Pad
    padded = pad_sequences([encoded], maxlen=sentence_length, padding='pre')

    # Step 3: Predict
    pred = model.predict(padded)
    
    # Step 4: Convert softmax → class index
    predicted_class = np.argmax(pred)

    # Step 5: Return label & probabilities
    return label_map[predicted_class], pred


In [153]:
text = "RT mayasolovely woman shouldn t complain clean house amp man trash"

label, raw_output = predict_text(model, text, vocab_size, sentence_length)

print("Prediction:", label)
print("Raw Output:", raw_output)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
Prediction: Neither
Raw Output: [[5.3158361e-05 1.1137132e-04 9.9983549e-01]]
