In [None]:
!pip install tensorflow-text

**Import essential libraries, load the dataset, and remove unnecessary columns**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

# Load dataset
# Read the labelled tweets straight from GitHub and keep only the `class`
# label and the `tweet` text; the other five columns are not used below.
data = "https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv"
unused_columns = ['Unnamed: 0', "count", "hate_speech", "offensive_language", "neither"]
df = pd.read_csv(data).drop(columns=unused_columns)
df.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


**Describe the DataFrame grouped by the `class` attribute**

In [None]:
# Per-class summary of the tweet column (count / unique / top / freq).
# Label legend:
#   0 - hate speech
#   1 - offensive language
#   2 - neither
df.groupby(by='class').describe()

Unnamed: 0_level_0,tweet,tweet,tweet,tweet
Unnamed: 0_level_1,count,unique,top,freq
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1430,1430,"""@Blackman38Tide: @WhaleLookyHere @HowdyDowdy1...",1
1,19190,19190,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,4163,4163,!!! RT @mayasolovely: As a woman you shouldn't...,1


**Do preprocessing and apply it to the tweets in the DataFrame**

In [None]:
import re
def preprocess_text(text):
    """Normalise a raw tweet before it is handed to the BERT preprocessor.

    Steps, in order:
      1. remove links (``http`` up to the next whitespace);
      2. delete apostrophes so a contraction stays one token
         ("shouldn't" -> "shouldnt");
      3. replace every other non-word, non-space character with a space, so
         words separated only by punctuation are not glued together
         ("cold...tyga" -> "cold tyga", not "coldtyga");
      4. collapse runs of whitespace, strip both ends and lowercase.

    Note: inside this function the parameter ``text`` shadows the
    ``tensorflow_text as text`` module alias; the alias is not needed here.

    Args:
        text: the raw tweet as a ``str``.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    # Remove links
    text = re.sub(r'http\S+', '', text)
    # Drop apostrophes (straight and curly) without inserting a gap
    text = re.sub(r"['’]", '', text)
    # Turn the remaining punctuation into a separator instead of deleting it
    text = re.sub(r'[^\w\s]', ' ', text)
    # Squeeze the whitespace left behind by the removals and lowercase
    return re.sub(r'\s+', ' ', text).strip().lower()

# Replace each raw tweet with its cleaned form, then preview the first rows.
df["tweet"] = df["tweet"].apply(preprocess_text)
df.head()

Unnamed: 0,class,tweet
0,2,rt mayasolovely as a woman you shouldnt compl...
1,1,rt mleew17 boy dats coldtyga dwn bad for cuff...
2,1,rt urkindofbrand dawg rt 80sbaby4life you eve...
3,1,rt c_g_anderson viva_based she look like a tr...
4,1,rt shenikaroberts the shit you hear about me ...


**Balance the dataset using the `sample` function**

In [None]:
# Fixed seed so the down-sampled subset (and everything trained on it) is the
# same on every Restart & Run All.
BALANCE_SEED = 42

# Every class is down-sampled to the size of the smallest class, whichever
# class that happens to be.
minority_size = df['class'].value_counts().min()

# 0 - hate speech, 1 - offensive language, 2 - neither
df_hate = df[df['class'] == 0].sample(n=minority_size, random_state=BALANCE_SEED)
df_off = df[df['class'] == 1].sample(n=minority_size, random_state=BALANCE_SEED)
df_nei = df[df['class'] == 2].sample(n=minority_size, random_state=BALANCE_SEED)

# Stack the three equally sized samples back into a single frame
df = pd.concat([df_hate, df_off, df_nei])
df["class"].value_counts()

0    1430
1    1430
2    1430
Name: class, dtype: int64

**Split the Dataset into Train And Test**

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the stratified split, and every metric computed from it, is
# reproducible across runs.
SPLIT_SEED = 42

# Stratify on the label so each class keeps the same share in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['class'],
    stratify=df['class'],
    random_state=SPLIT_SEED,
)

# One-hot encode the integer labels (3 classes) to match the
# categorical cross-entropy loss used when the model is compiled.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=3)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=3)


**Download the BERT preprocessor and encoder**

In [None]:
# TF Hub handles for the uncased English BERT preprocessor and its matching
# 12-layer / 768-hidden / 12-head encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# NOTE(review): the layers are created without trainable=True, so presumably
# the encoder weights stay frozen during training — confirm this is intended.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

**Create a neural network architecture using the BERT model**

In [None]:
# Bert layers: raw string -> BERT preprocessing -> encoder outputs
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head on top of the encoder's pooled output:
# light dropout, then a 3-way softmax (one unit per class).
pooled = outputs['pooled_output']
dropped = tf.keras.layers.Dropout(rate=0.1, name="dropout")(pooled)
class_probs = tf.keras.layers.Dense(units=3, activation='softmax', name="output")(dropped)

# Tie the string input to the softmax output to form the final model
model = tf.keras.Model(inputs=[text_input], outputs=[class_probs])


**Compile the model with its loss and metrics, then train it**

In [None]:
# Categorical cross-entropy pairs with the one-hot labels and softmax output.
model.compile(
    loss="categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 20 epochs; the held-out split is used to report validation
# loss/accuracy after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7e526dbbeb90>

**Predict the Test Data using Trained model**

In [None]:
# One softmax vector (3 class scores) per held-out tweet.
y_predicted = model.predict(x=X_test)



**Convert the predicted probabilities to class labels**

In [None]:
# The position of the largest entry in each row is the class id (0, 1 or 2);
# the same reduction undoes the one-hot encoding of the true labels.
y_pred_labels = tf.argmax(input=y_predicted, axis=1).numpy()
y_true_labels = tf.argmax(input=y_test, axis=1).numpy()


**Create a Classification report for the predicted values**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Compute the three evaluation artefacts first, then print them in order.
accuracy = accuracy_score(y_true_labels, y_pred_labels)
report = classification_report(y_true_labels, y_pred_labels)
matrix = confusion_matrix(y_true_labels, y_pred_labels)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("Confusion Matrix:\n", matrix)

Accuracy: 0.6150978564771669

Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.55      0.55       357
           1       0.67      0.51      0.58       358
           2       0.63      0.78      0.70       358

    accuracy                           0.62      1073
   macro avg       0.62      0.62      0.61      1073
weighted avg       0.62      0.62      0.61      1073

Confusion Matrix:
 [[198  69  90]
 [104 182  72]
 [ 58  20 280]]


**Create a new text sample and classify it with the trained model**

In [None]:
# Run a single hand-written sample through the same cleaning step used for
# the training tweets, then through the trained model.
test = ["wtf that bitch is so rude on me"]
test = list(map(preprocess_text, test))
new_test = model.predict(test)
label = tf.argmax(new_test, axis=1).numpy()[0]

# Class id -> printed name; any id other than 0 or 1 falls back to "Neither".
label_names = {0: "Hate Speech", 1: "Offensive Language"}
print(label_names.get(int(label), "Neither"))


Offensive Language
