In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot
%matplotlib inline
plt.style.use('ggplot')

from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import re
import tensorflow as tf


# Tweet-aware tokenizer from NLTK (not referenced by any of the cells visible below).
from nltk.tokenize import TweetTokenizer

import warnings
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

In [None]:
# Read the combined, model-ready CSV from the working directory and show
# its (rows, columns) as the cell output.
combined_csv_path = './model_ready_combined.csv'
dataset = pd.read_csv(combined_csv_path)
dataset.shape

In [None]:
# Remove rows containing any missing value first, then exact duplicate rows,
# and display the resulting (rows, columns).
dataset = dataset.dropna().drop_duplicates()
dataset.shape

In [None]:
# Preview the first five rows of the cleaned frame.
dataset.head(n=5)

### Class values
+ label == 0 - Not Hateful / Non-Risky
+ label == 1 - Potentially Risky
+ label == 2 - Hateful / Risky

In [None]:
# Display names for the three classes, ordered so that label_lst[label]
# is the name of integer class `label`.
label_lst = [
    "Non-Risky",          # label 0
    "Potentially Risky",  # label 1
    "Risky",              # label 2
]

# Hybrid Model (BERT+LSTM)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout, GlobalMaxPooling1D, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split

# Load data
train = pd.read_csv('./train-data.csv')
test = pd.read_csv('./test-data.csv')

# Clean each split once. The training split needs rows with missing values
# removed as well: a NaN in 'clean training' or 'label' would otherwise reach
# the tokenizer / one-hot encoding further down. (The earlier version ran
# drop_duplicates on train twice and dropna on test twice, and never dropped
# missing values from train.)
train = train.dropna().drop_duplicates()
# NOTE(review): the test split is only stripped of missing values here, as
# before; a later cell in this notebook also de-duplicates it. Confirm which
# one is intended.
test = test.dropna()


# Pretrained tokenizer and encoder, both from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')


# Tokenize text data.
# Both splits are encoded with the same options: every text is padded or
# truncated to exactly MAX_LEN ids, attention masks are returned, and token
# type ids are not.
MAX_LEN = 128
encode_options = dict(
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
)
X_train_tokenized = tokenizer.batch_encode_plus(train['clean training'].tolist(), **encode_options)
X_test_tokenized = tokenizer.batch_encode_plus(test['clean training'].tolist(), **encode_options)

# Create input tensors
# NOTE(review): attention_masks is built from the full training encoding but is
# not passed to the model in the cells shown here, and after the split below it
# no longer lines up row-for-row with X_train.
attention_masks = np.array(X_train_tokenized['attention_mask'])
X_train = np.array(X_train_tokenized['input_ids'])
X_test = np.array(X_test_tokenized['input_ids'])

y_train = np.array(train['label'])
y_test = np.array(test['label'])

# Hold out 10% of the training data as a validation set, stratified by label.
# The fixed random_state makes the split (and therefore early stopping and the
# reported validation metrics) reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

# Define input layer for BERT model: one row of MAX_LEN token ids per example.
input_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")

# Connect tokenizer output to BERT model. Index 0 selects the first model
# output (the per-token hidden states, per the transformers documentation).
# NOTE(review): only input ids are fed in; no attention mask is supplied, so
# padded positions are not masked out inside BERT.
bert_output = bert_model(input_layer)[0]

# Define LSTM layer.
# The BiLSTM consumes the token sequence directly so that it actually recurs
# over the MAX_LEN time steps. The earlier Reshape((1, -1)) collapsed the whole
# sequence into a single time step, which left the LSTM nothing to iterate over
# and made its input dimension MAX_LEN times larger than necessary. This now
# matches the DistilBERT model defined later in the notebook.
lstm_layer = Bidirectional(LSTM(128, return_sequences=True))(bert_output)

# Define pooling layer: element-wise max over the time axis.
pooling_layer = GlobalMaxPooling1D()(lstm_layer)

# Define output layer: softmax over the three classes.
output_layer = Dense(3, activation='softmax')(pooling_layer)

# Define the model
model_1 = Model(inputs=[input_layer], outputs=[output_layer])

# Compile the model. categorical_crossentropy expects one-hot targets, which is
# why the labels are passed through to_categorical at fit/evaluate time.
model_1.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=1e-5), metrics=['accuracy'])


# Train model
# Stop once validation accuracy has failed to improve for 2 consecutive epochs
# and roll the weights back to the best epoch seen.
earlyStop = EarlyStopping(monitor='val_accuracy', patience=2,
                        verbose=1, mode='max', restore_best_weights=True)

# 16 is the batch size every other training/evaluation cell in this notebook
# uses (and describes as "the same value as used in training"); the previous
# value of 1600 contradicted that and is far larger than is practical for
# fine-tuning BERT at MAX_LEN tokens.
batch_size = 16

# num_classes is pinned to 3 so the one-hot width always matches the Dense(3)
# output, even if a split happens not to contain the highest label.
history = model_1.fit(
    x=X_train,
    y=tf.keras.utils.to_categorical(y_train, num_classes=3),
    validation_data=(X_val, tf.keras.utils.to_categorical(y_val, num_classes=3)),
    epochs=5,
    batch_size=batch_size,
    callbacks=[earlyStop]
)

# Evaluate model on test set

loss, accuracy = model_1.evaluate([X_test], tf.keras.utils.to_categorical(y_test, num_classes=3), batch_size=batch_size)
print('Test accuracy:', accuracy)


In [None]:
from sklearn.metrics import classification_report

# Score model_1 on the test split; labels are one-hot encoded to match the
# categorical_crossentropy loss the model was compiled with.
batch_size = 16
y_test_onehot = tf.keras.utils.to_categorical(y_test, num_classes=3)
loss, accuracy = model_1.evaluate([X_test], y_test_onehot, batch_size=batch_size)
print('Test accuracy:', accuracy)

# Class probabilities -> predicted class index (arg-max over the class axis).
y_pred = model_1.predict(X_test).argmax(axis=1)

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_pred)
print(f"\nClassification Report :\n{report}")

In [None]:
# Classify a single example sentence with model_1.
text = 'This is a hateful and offensive message.'

# Encode exactly like the training data: pad/truncate to MAX_LEN, no token
# type ids; the tokenizer returns TensorFlow tensors here.
single_text_options = {
    'max_length': MAX_LEN,
    'padding': 'max_length',
    'truncation': True,
    'return_token_type_ids': False,
    'return_tensors': 'tf',
}
text_tokenized = tokenizer.encode_plus(text, **single_text_options)

# The model takes the ids as a NumPy array wrapped in a one-element input list.
input_ids = text_tokenized['input_ids'].numpy()
prediction = model_1.predict([input_ids])

# Index of the highest-scoring class.
print('Prediction:', np.argmax(prediction))

In [None]:
# Persist the trained model to an .h5 file in the current working directory.
# NOTE(review): the file name says "Bart" although the model is built on BERT;
# it is left unchanged because it is a runtime path.
model_1.save("DL_model_Bart_Lstm.h5")

In [None]:
# Save a second copy without the optimizer state (include_optimizer=False).
# A file with this name is what the loading cell below reads back, which is
# why that cell compiles the model again after loading.
model_1.save("DL_model_Bart_Lstm_2.hdf5", include_optimizer=False)

## How to load the model: pass TFBertModel via `custom_objects` and re-compile with an optimizer

In [None]:
from transformers import BertTokenizer, TFBertModel
# Use the tensorflow.keras namespace, the same one the model was built and
# saved with, rather than mixing in the standalone `keras` package.
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam

# Reload the optimizer-free copy written by model_1.save(...) above. The path is
# relative, exactly like the save call, so the file is found in the same working
# directory it was written to. TFBertModel must be supplied through
# custom_objects because it is not a built-in Keras layer.
loaded_model = load_model("DL_model_Bart_Lstm_2.hdf5", custom_objects={'TFBertModel': TFBertModel})

# Compile the model.
# The file was saved with include_optimizer=False, so an optimizer, loss and
# metrics have to be attached again before evaluate() can be used.
# `learning_rate` is the supported argument name (`lr` is deprecated) and
# matches how the optimizer is configured for training in this notebook.
optimizer = Adam(learning_rate=1e-5)
loss = 'categorical_crossentropy'
loaded_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])



In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertModel

# Load the test split and discard rows with missing values.
test = pd.read_csv('/kaggle/input/hatefull-and-offensive-language/test-data.csv').dropna()


# Define BERT tokenizer and model (both from the 'bert-base-uncased' checkpoint).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')


# Tokenize text data: pad/truncate every text to MAX_LEN ids, return attention
# masks, omit token type ids.
MAX_LEN = 128
test_texts = test['clean training'].tolist()
X_test_tokenized = tokenizer.batch_encode_plus(
    test_texts,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
)

# Create input tensors: token ids as features, integer labels as targets.
# attention_masks is materialised as well, although the cells shown here do not
# feed it to the model.
X_test = np.array(X_test_tokenized['input_ids'])
attention_masks = np.array(X_test_tokenized['attention_mask'])
y_test = np.array(test['label'])

# Batch size used by the evaluation cells that follow.
batch_size = 16

In [None]:
# Evaluate the reloaded model on the test set.
# This section demonstrates loading the saved model, so the checks run against
# `loaded_model` (created and re-compiled in the loading cell above) rather than
# the in-memory `model_1`, which only exists if the training cells were run in
# the same kernel session.
import tensorflow as tf  # needed for to_categorical; the reload cells do not import it
from sklearn.metrics import classification_report

batch_size = 16  # same batch size as the other evaluation cells
loss, accuracy = loaded_model.evaluate([X_test], tf.keras.utils.to_categorical(y_test, num_classes=3), batch_size=batch_size)
print('Test accuracy:', accuracy)

# Class probabilities -> predicted class index.
y_pred = loaded_model.predict(X_test)

y_pred = np.argmax(y_pred, axis=1)

# Print Classification report
print(f"\nClassification Report :\n{classification_report(y_test, y_pred)}")

In [None]:
from transformers import BertTokenizer, TFBertModel

# Tokenizer and encoder from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

MAX_LEN = 128

# Classify one sentence with the reloaded model.
text = 'I do not like differently abled'
single_text_options = {
    'max_length': MAX_LEN,
    'padding': 'max_length',
    'truncation': True,
    'return_token_type_ids': False,
    'return_tensors': 'tf',
}
text_tokenized = tokenizer.encode_plus(text, **single_text_options)

# Ids go in as a NumPy array inside a one-element input list.
input_ids = text_tokenized['input_ids'].numpy()
prediction = loaded_model.predict([input_ids])

# Raw class scores first, then the index of the highest one.
print(prediction)
print('Prediction:', np.argmax(prediction))

In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics

# Tabulate true labels against predicted labels, then render the table as a plot.
confusion_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix)
cm_display.plot()
plt.show()

# DistilBert

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout, GlobalMaxPooling1D, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

# Load data
train = pd.read_csv('/kaggle/input/hatefull-and-offensive-language/train-data.csv')
test = pd.read_csv('/kaggle/input/hatefull-and-offensive-language/test-data.csv')

# Clean each split once. The training split also needs rows with missing values
# removed, otherwise a NaN in 'clean training' or 'label' reaches the tokenizer
# or the loss. (The earlier version ran drop_duplicates on train twice and
# dropna on test twice, and never dropped missing values from train.)
train = train.dropna().drop_duplicates()
# NOTE(review): the test split is only stripped of missing values here, as
# before; the reload cell further down also de-duplicates it. Confirm which one
# is intended.
test = test.dropna()

# Pretrained tokenizer and encoder, both from the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Tokenize text data.
# Both splits share one set of options: pad/truncate to MAX_LEN ids, return
# attention masks, omit token type ids.
MAX_LEN = 128
encode_options = dict(
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
)
X_train_tokenized = tokenizer.batch_encode_plus(train['clean training'].tolist(), **encode_options)
X_test_tokenized = tokenizer.batch_encode_plus(test['clean training'].tolist(), **encode_options)

# Features: token-id matrices for the train and test splits.
X_train, X_test = (
    np.array(X_train_tokenized['input_ids']),
    np.array(X_test_tokenized['input_ids']),
)
# Kept for reference; the model built below is fed input ids only.
attention_masks = np.array(X_train_tokenized['attention_mask'])

# Targets: integer class labels taken from the 'label' column.
y_train, y_test = np.array(train['label']), np.array(test['label'])

# Architecture: token ids -> DistilBERT -> BiLSTM over the token sequence
# -> max over time -> 3-way softmax.

# One row of MAX_LEN token ids per example.
input_layer = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_ids")

# Index 0 picks the first output of the encoder.
# NOTE(review): no attention mask is passed, so padded positions are not masked.
bert_output = bert_model(input_layer)[0]

# Bidirectional LSTM that keeps an output for every time step.
sequence_encoder = Bidirectional(LSTM(128, return_sequences=True))
lstm_layer = sequence_encoder(bert_output)

# Collapse the time axis by taking the per-feature maximum.
pooling_layer = GlobalMaxPooling1D()(lstm_layer)

# Class probabilities.
classifier_head = Dense(3, activation='softmax')
output_layer = classifier_head(pooling_layer)

# Assemble and compile. The sparse loss takes integer labels directly, so no
# one-hot encoding is needed for this model.
model = Model(inputs=[input_layer], outputs=[output_layer])
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-5),
    metrics=['accuracy'],
)

# One batch size for both fitting and the test-set evaluation.
batch_size = 16

# Train for up to 5 epochs with 20% of the training arrays held out for
# validation; stop as soon as validation loss fails to improve for one epoch
# and restore the best weights.
stop_on_val_loss = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history = model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=batch_size,
    callbacks=[stop_on_val_loss],
)

# Score the trained model on the test split.
loss, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test accuracy:', accuracy)

# Classify a single example sentence with the freshly trained model.
text = 'This is a hateful tweet'
text_tokenized = tokenizer.encode_plus(
    text,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
)

# Wrap the id list in an outer list so the array is a batch of one row.
text_input = np.array([text_tokenized['input_ids']])
prediction = model.predict(text_input)

# Index of the highest-scoring class.
predicted_label = np.argmax(prediction)
print('Predicted label:', predicted_label)


In [None]:

from sklearn.metrics import classification_report

# Class probabilities for the test split, reduced to the arg-max class index.
y_pred = model.predict(X_test).argmax(axis=1)

# Per-class precision / recall / F1.
report = classification_report(y_test, y_pred)
print(f"\nClassification Report :\n{report}")

In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics

# Count (true label, predicted label) pairs and draw the resulting table.
confusion_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix)
cm_display.plot()
plt.show()

In [None]:
# Persist the trained DistilBERT+LSTM model to an .h5 file in the current
# working directory; a file with this name is what the loading cell below reads.
model.save("DL_model_DistilBert_Lstm.h5")

## How to load the model : TFDistilBertModel and include optimizer

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout, GlobalMaxPooling1D, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

# Load the test split, drop exact duplicate rows, then rows with missing values.
test = (
    pd.read_csv('/kaggle/input/hatefull-and-offensive-language/test-data.csv')
    .drop_duplicates()
    .dropna()
)

# Define BERT tokenizer and model ('distilbert-base-uncased' checkpoint).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Tokenize text data: pad/truncate every text to MAX_LEN ids, no token type ids.
MAX_LEN = 128
test_texts = test['clean training'].tolist()
X_test_tokenized = tokenizer.batch_encode_plus(
    test_texts,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
)

# Create input tensors: token ids as features, integer labels as targets.
# attention_masks is materialised too, although the cells shown here do not
# pass it to the model.
X_test = np.array(X_test_tokenized['input_ids'])
attention_masks = np.array(X_test_tokenized['attention_mask'])
y_test = np.array(test['label'])

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertModel
# Load through tensorflow.keras, the same namespace the model was built and
# saved with, instead of the standalone `keras` package.
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report

batch_size = 16  # same batch size that was used for training

# Reload the model written by model.save(...) above. The path is relative,
# exactly like the save call, so the file is read from the working directory it
# was written to. TFDistilBertModel is supplied through custom_objects because
# it is not a built-in Keras layer.
loaded_model = load_model("DL_model_DistilBert_Lstm.h5", custom_objects={'TFDistilBertModel': TFDistilBertModel})

# Evaluate model on test set and report the accuracy, as the other evaluation
# cells in this notebook do (it was previously computed but never shown).
loss, accuracy = loaded_model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test accuracy:', accuracy)

# Get predicted labels: class probabilities -> arg-max class index.
y_pred = loaded_model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)

# Print Classification report
print(f"\nClassification Report :\n{classification_report(y_test, y_pred)}")

In [None]:
# Import the classes this cell actually uses. (It previously imported
# BertTokenizer / TFBertModel while calling the DistilBERT classes, so it only
# ran when an earlier cell had already imported them.)
from transformers import DistilBertTokenizer, TFDistilBertModel

# Define DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

MAX_LEN = 128

# Make prediction on user input
text = 'I do not like differently abled'
# Encode like the training data: pad/truncate to MAX_LEN, no token type ids;
# the tokenizer returns TensorFlow tensors here.
text_tokenized = tokenizer.encode_plus(
    text,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
    return_tensors='tf'
)
# The reloaded model takes the ids as a NumPy array inside a one-element list.
prediction = loaded_model.predict([text_tokenized['input_ids'].numpy() ])
# Raw class scores, then the index of the highest one.
print(prediction)
print('Prediction:', np.argmax(prediction))

In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics

# Cross-tabulate the test labels with the reloaded model's predictions and plot it.
confusion_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix)
cm_display.plot()
plt.show()