## Pre-processing steps:

1. Converted to lowercase
2. Removed all stopwords except 'you' and 'not'
3. Removed all non-alphanumeric characters (punctuation and symbols)
4. Corrected contracted words
5. Converted emojis to their sentiment

## Models:
1. GRU
2. BGRU

Two-fold cross-validation was applied to both models.


In [None]:
# Packages

import pandas as pd
import numpy as np

from numpy.random import seed
# from tensorflow import set_random_seed
import tensorflow as tf
import random as rn
import os
from nltk import tokenize
import nltk
# Download the Punkt tokenizer data; presumably required by
# tokenize.sent_tokenize, which the HAN preprocessing path calls.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the three pre-cleaned comment datasets as pandas DataFrames.
train, test_labelled, test_unlabelled = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_cleaned_no_punkt.csv',
        'test_labelled_cleaned_no_punkt.csv',
        'test_unlabelled_cleaned_no_punkt.csv',
    )
)

In [None]:
# Collapse the six per-category flags into a single binary target `mal`
# (True when at least one category is set) and replace missing comments.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for frame in (train, test_labelled):
    # Skip the collapse when this cell has already run on the frame, so
    # re-executing the cell does not fail on the dropped label columns.
    already_collapsed = (
        'mal' in frame.columns
        and not any(col in frame.columns for col in label_cols)
    )
    if not already_collapsed:
        frame['mal'] = frame[label_cols].sum(axis=1) >= 1
        frame.drop(columns=label_cols, inplace=True)
    # Assign back instead of `frame.comment_text.fillna(..., inplace=True)`:
    # the chained in-place form emits a FutureWarning and stops working in
    # pandas 3.0.
    frame['comment_text'] = frame['comment_text'].fillna("empty")

test_unlabelled['comment_text'] = test_unlabelled['comment_text'].fillna("empty")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train.comment_text.fillna("empty", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_labelled.comment_text.fillna("empty", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

In [None]:
# Re-split train and test: hold out 29% of each labelled source (stratified
# on `mal`), then pool the pieces so both sets get a similar distribution.
from sklearn.model_selection import train_test_split

rs = 42

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    train.drop(columns='mal'),
    train['mal'],
    stratify=train['mal'],
    test_size=0.29,
    random_state=rs,
)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    test_labelled.drop(columns='mal'),
    test_labelled['mal'],
    stratify=test_labelled['mal'],
    test_size=0.29,
    random_state=rs,
)

# Pooled training texts / labels.
X = np.concatenate([X_train1['comment_text'], X_train2['comment_text']])
y = np.concatenate([y_train1, y_train2])

# Pooled held-out texts / labels.
X_test = np.concatenate([X_test1['comment_text'], X_test2['comment_text']])
y_test = np.concatenate([y_test1, y_test2])

In [None]:
# Model / training hyper-parameters.
max_features = 40000   # vocabulary size kept by the tokenizer
maxlen       = 400     # padded sequence length for the GRU models
dropout_rate = 0.25
rs           = 42      # random seed shared by every RNG below
epochs       = 4
batch_size   = 256
embed_dim    = 50
rec_units    = 150

# Sentence-level limits used by the HAN / psHAN preprocessing paths.
max_sen_len    = 100
max_sent_amount = 4

# Seed every RNG in play so runs are repeatable.
seed(rs)                 # numpy
rn.seed(rs)              # Python's `random`
tf.random.set_seed(rs)   # TensorFlow (TF2 replacement for set_random_seed)

# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
# only affects subprocesses, not the already-running kernel.
os.environ['PYTHONHASHSEED']=str(rs)

In [None]:
#models.train_model(X=X, y=y, mtype = 'GRU', cv=True, nfolds=5, epochs=4, cv_models_path=cv_models_path, train=False, rs=rs)

In [None]:
import keras
from keras.layers import Embedding, SpatialDropout1D
from keras.layers import Dense, Input, GRU, LSTM
from keras.layers import Bidirectional, Dropout, GlobalMaxPool1D
from tensorflow.keras.layers import GRU  # Handles both CPU and GPU
from keras.layers import Conv1D, GlobalMaxPooling1D, TimeDistributed
from keras.layers import Dense, Embedding, Input

from keras.models import Model, Sequential
from keras.optimizers import RMSprop
import keras.backend as K
# from keras.engine.topology import Layer, InputSpec
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import InputSpec
from keras.models import load_model

# from keras.preprocessing import text, sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras import initializers as initializers, regularizers, constraints

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# NOTE(review): this repeats the nltk import / download done in the first
# code cell; the captured output shows the package is already up to date.
from nltk import tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import os

# Folder that receives the weight files; created if it is missing.
cv_models_path = 'saved_models'
os.makedirs(name=cv_models_path, exist_ok=True)

In [None]:
def gru_keras(max_features, maxlen, bidirectional, dropout_rate, embed_dim, rec_units,mtype='GRU', reduction = None):
    """Build and compile a (bi)directional GRU binary classifier.

    Architecture: Input(maxlen) -> trainable Embedding -> SpatialDropout1D
    -> GRU (optionally wrapped in Bidirectional) -> optional global pooling
    -> Dense(1, sigmoid).

    Args:
        max_features: Size of the embedding vocabulary.
        maxlen: Length of the (padded) input sequences.
        bidirectional: Wrap the GRU in ``Bidirectional`` when truthy.
        dropout_rate: Rate passed to ``SpatialDropout1D``.
        embed_dim: Embedding output dimension.
        rec_units: Number of GRU units.
        mtype: Recurrent cell type; only ``'GRU'`` is implemented.
        reduction: ``None`` (or any falsy value) to use the GRU's last
            state, ``'average'`` for global average pooling over time, or
            ``'maximum'`` for global max pooling over time.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy,
        ``RMSprop(clipvalue=1)`` and the ``'acc'`` metric.

    Raises:
        ValueError: If ``mtype`` or ``reduction`` is not supported. Without
            this check the Dense layer would be applied to an un-reduced
            3-D tensor and the model would be silently malformed.
    """
    if mtype != 'GRU':
        raise ValueError(f"Unsupported mtype {mtype!r}; only 'GRU' is implemented.")
    if reduction and reduction not in ('average', 'maximum'):
        raise ValueError(
            f"Unsupported reduction {reduction!r}; expected None, 'average' or 'maximum'."
        )

    # `K.backend` is a function: it has to be called, otherwise the
    # comparison is always False and the session is never cleared.
    if K.backend() == 'tensorflow':
        K.clear_session()

    input_layer = Input(shape=(maxlen,))
    x = Embedding(max_features, output_dim=embed_dim, trainable=True)(input_layer)
    x = SpatialDropout1D(dropout_rate)(x)

    # Pooling needs the whole output sequence; otherwise keep the last state.
    recurrent = GRU(units=rec_units, return_sequences=bool(reduction))
    if bidirectional:
        recurrent = Bidirectional(recurrent)
    x = recurrent(x)

    if reduction == 'average':
        x = GlobalAveragePooling1D()(x)
    elif reduction == 'maximum':
        x = GlobalMaxPool1D()(x)

    output_layer = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy',
                  # Choose either clipnorm or clipvalue, but not both
                  optimizer=RMSprop(clipvalue=1),  # Or optimizer=RMSprop(clipnorm=1)
                  metrics=['acc'])
    return model

In [None]:
def dl_model(model_type='BGRU', max_features=40000, embed_dim=50, rec_units=150, dropout_rate=0.25, maxlen=400, max_sent_len=100, max_sent_amount=4):
    """Return a compiled model for the requested architecture.

    Args:
        model_type: ``'GRU'`` (unidirectional) or ``'BGRU'`` (bidirectional).
        max_features: Vocabulary size forwarded to ``gru_keras``.
        embed_dim: Embedding dimension forwarded to ``gru_keras``.
        rec_units: Number of recurrent units forwarded to ``gru_keras``.
        dropout_rate: Dropout rate forwarded to ``gru_keras``.
        maxlen: Input sequence length forwarded to ``gru_keras``.
        max_sent_len: Accepted for call compatibility; unused by GRU/BGRU.
        max_sent_amount: Accepted for call compatibility; unused by GRU/BGRU.

    Returns:
        The compiled model produced by ``gru_keras``.

    Raises:
        ValueError: If ``model_type`` is not ``'GRU'`` or ``'BGRU'``. The
            function used to return ``None`` in that case, which only
            failed later with an unrelated AttributeError.
    """
    if model_type not in ('GRU', 'BGRU'):
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'GRU' or 'BGRU'."
        )

    return gru_keras(max_features=max_features, maxlen=maxlen,
                     bidirectional=(model_type == 'BGRU'), mtype='GRU',
                     dropout_rate=dropout_rate, embed_dim=embed_dim, rec_units=rec_units)


In [None]:
def _encode_han(texts, tokenizer, max_sent_amount, max_sen_len):
    """Encode each text as a (max_sent_amount, max_sen_len) matrix of token ids."""
    encoded = []
    for i, value in enumerate(texts):
        if i % 10000 == 0:
            print(i)
        text = value.strip().lower()
        sentences = list(tokenizer.texts_to_sequences(tokenize.sent_tokenize(text)))
        # Keep at most `max_sent_amount` sentences and fill the remainder with
        # single-zero sentences (also covers texts without any sentence).
        sentences = sentences[:max_sent_amount]
        sentences.extend([[0]] * (max_sent_amount - len(sentences)))
        encoded.append(pad_sequences(sentences, maxlen=max_sen_len))
    return np.array(encoded)


def _encode_texts(texts, tokenizer, mtype, maxlen, max_sen_len, max_sent_amount,
                  pshan_padding='pre'):
    """Turn raw texts into the padded integer arrays expected by `mtype`."""
    if mtype == 'HAN':
        return _encode_han(texts, tokenizer, max_sent_amount, max_sen_len)
    if mtype == 'psHAN':
        flat = pad_sequences(tokenizer.texts_to_sequences(texts),
                             maxlen=max_sen_len * max_sent_amount,
                             padding=pshan_padding)
        return np.array([line.reshape(max_sent_amount, max_sen_len) for line in flat])
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)


def _fit_or_load(model, X_train, y_train, weights_path, train, batch_size, epochs):
    """Fit with balanced class weights and save, or load previously saved weights."""
    if not train:
        model.load_weights(weights_path)
        return

    from sklearn.utils import class_weight

    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train),
        y=y_train
    )
    # np.unique sorts the classes, so index 0/1 lines up with False/True.
    class_weights_dict = dict(enumerate(class_weights))

    model.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        shuffle=True,
        verbose=1,
        class_weight=class_weights_dict
    )
    model.save_weights(weights_path)


def _threshold_scores(y_true, probs, threshold):
    """Return (precision, recall, f1) after binarising `probs` at `threshold`."""
    preds = (probs >= threshold).astype(int).ravel()
    return (precision_score(y_true, preds),
            recall_score(y_true, preds),
            f1_score(y_true, preds))


def train_model(X, y, mtype, cv, epochs,
                train, X_test=None, y_test=None, nfolds=None,
                rs=42, max_features=40000, maxlen=400, dropout_rate=0.25,
                rec_units=150, embed_dim=50, batch_size=256, max_sen_len=100,
                max_sent_amount=4, threshold=0.3, cv_models_path='saved_models'):
    """Train (or reload) a model and print its evaluation metrics.

    With ``cv=True`` a stratified ``nfolds``-fold cross-validation is run on
    ``X``/``y``: each fold fits its own tokenizer on the training part,
    trains with balanced class weights (or loads the fold's weights), and
    prints precision/recall/F1 at ``threshold`` plus average precision and
    ROC AUC, followed by the mean and standard deviation over the folds.

    With ``cv=False`` the model is trained on all of ``X``/``y`` and
    evaluated on ``X_test``/``y_test``.

    Args:
        X: Array of raw texts (indexed with integer index arrays in CV mode).
        y: Binary labels aligned with ``X``.
        mtype: Model type given to ``dl_model``; also selects preprocessing
            (``'HAN'``, ``'psHAN'`` or plain padded sequences).
        cv: Run cross-validation when truthy, else a train/test evaluation.
        epochs: Number of training epochs.
        train: Fit and save weights when truthy, else load saved weights.
        X_test, y_test: Held-out data; required when ``cv`` is falsy.
        nfolds: Number of folds; required when ``cv`` is truthy.
        rs: Random state for ``StratifiedKFold``.
        max_features, maxlen, dropout_rate, rec_units, embed_dim: Tokenizer
            and model hyper-parameters.
        batch_size: Batch size for fitting and predicting.
        max_sen_len, max_sent_amount: Sentence length / count used by the
            HAN and psHAN preprocessing.
        threshold: Probability cut-off for the positive class.
        cv_models_path: Directory holding the weight files. Fold weights
            are ``{mtype}_fold_{c}.weights.h5``; the non-CV model uses
            ``{mtype}_full_model.weights.h5``.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``nfolds`` is missing in CV mode or ``X_test`` /
            ``y_test`` is missing in non-CV mode.
    """
    if cv and nfolds is None:
        raise ValueError('nfolds must be provided when cv=True')
    if not cv and (X_test is None or y_test is None):
        raise ValueError('X_test and y_test must be provided when cv=False')

    if cv:
        kf = StratifiedKFold(n_splits=nfolds, random_state=rs, shuffle=True)
        auc = []
        roc = []
        fscore_ = []

        for c, (train_index, val_index) in enumerate(kf.split(X, y)):

            print(f' fold {c}')

            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = np.asarray(y[train_index]), np.asarray(y[val_index])

            # The tokenizer is fit on the fold's training part only, so the
            # validation texts never influence the vocabulary.
            tokenizer = Tokenizer(num_words=max_features)
            tokenizer.fit_on_texts(X_train)

            X_train = _encode_texts(X_train, tokenizer, mtype, maxlen,
                                    max_sen_len, max_sent_amount)
            X_val = _encode_texts(X_val, tokenizer, mtype, maxlen,
                                  max_sen_len, max_sent_amount)

            model = dl_model(model_type=mtype, max_features=max_features,
                             maxlen=maxlen, dropout_rate=dropout_rate, embed_dim=embed_dim,
                             rec_units=rec_units, max_sent_len=max_sen_len,
                             max_sent_amount=max_sent_amount)

            print('Fitting')
            _fit_or_load(model, X_train, y_train,
                         f'{cv_models_path}/{mtype}_fold_{c}.weights.h5',
                         train, batch_size, epochs)

            probs = model.predict(X_val, batch_size=batch_size, verbose=1)

            precision, recall, fscore = _threshold_scores(y_val, probs, threshold)
            print(f' {threshold} fold {c} precision {round(precision, 3)} recall {round(recall, 3)} fscore {round(fscore,3)}')

            auc_f = average_precision_score(y_val, probs)
            roc_f = roc_auc_score(y_val, probs)
            auc.append(auc_f)
            roc.append(roc_f)
            fscore_.append(fscore)
            print(f'fold {c} average precision {round(auc_f, 3)}')
            print(f'fold {c} roc auc {round(roc_f, 3)}')

            # Release the fold's model before the next one is built.
            del model
            K.clear_session()

        print(f'PR-C {round(np.array(auc).mean(), 3)}')
        print(f'ROC AUC {round(np.array(roc).mean(), 3)}')
        print(f'FScore {round(np.array(fscore_).mean(), 3)}')

        print(f'PR-C std {round(np.array(auc).std(), 3)}')
        print(f'ROC AUC std {round(np.array(roc).std(), 3)}')
        print(f'FScore std {round(np.array(fscore_).std(), 3)}')
    else:
        tokenizer = Tokenizer(num_words=max_features, oov_token='unknown')
        tokenizer.fit_on_texts(X)

        # NOTE(review): this branch pads psHAN input with padding='post'
        # while the CV branch uses the default; kept as in the original.
        X_train = _encode_texts(X, tokenizer, mtype, maxlen,
                                max_sen_len, max_sent_amount, pshan_padding='post')
        X_test = _encode_texts(X_test, tokenizer, mtype, maxlen,
                               max_sen_len, max_sent_amount, pshan_padding='post')

        y_train = np.array(y)
        y_test = np.array(y_test)

        model = dl_model(model_type=mtype, max_features=max_features,
                         maxlen=maxlen, dropout_rate=dropout_rate, embed_dim=embed_dim,
                         rec_units=rec_units, max_sent_len=max_sen_len,
                         max_sent_amount=max_sent_amount)

        print('Fitting')

        # There is no fold index here; the old `{mtype}_fold_{c}` name raised
        # NameError because `c` only exists inside the CV loop.
        _fit_or_load(model, X_train, y_train,
                     f'{cv_models_path}/{mtype}_full_model.weights.h5',
                     train, batch_size, epochs)

        probs = model.predict(X_test, batch_size=batch_size, verbose=1)
        auc_f = average_precision_score(y_test, probs)
        roc_f = roc_auc_score(y_test, probs)
        _, _, fscore = _threshold_scores(y_test, probs, threshold)

        print('_________________________________')
        print(f'PR-C is {round(auc_f,3)}')
        print('_________________________________\n')

        print('_________________________________')
        print(f'ROC AUC is {round(roc_f,3)}')
        print('_________________________________')

        print('_________________________________')
        print(f'FScore is {round(fscore,3)}')
        print('_________________________________\n')

In [None]:
# Two-fold cross-validated run of the unidirectional GRU.
# train=True fits each fold and saves its weights under cv_models_path.
train_model(
    X, y,
    mtype='GRU', cv=True, epochs=4, train=True,
    nfolds=2, rs=42, cv_models_path=cv_models_path,
)

 fold 0
Fitting
Epoch 1/4
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m947s[0m 3s/step - acc: 0.9024 - loss: 0.3061
Epoch 2/4
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m935s[0m 3s/step - acc: 0.9442 - loss: 0.1668
Epoch 3/4
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m993s[0m 3s/step - acc: 0.9511 - loss: 0.1375
Epoch 4/4
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1002s[0m 3s/step - acc: 0.9569 - loss: 0.1212
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 803ms/step
 0.3 fold 0 precision 0.63 recall 0.837 fscore 0.719
fold 0 average precision 0.829
fold 0 roc auc 0.961
 fold 1
Fitting
Epoch 1/4
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m943s[0m 3s/step - acc: 0.9017 - loss: 0.3092
Epoch 2/4
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m969s[0m 3s/step - acc: 0.9434 - loss: 0.1698
Epoch 3/4
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m965s[0m 3s/step - acc: 

In [None]:
# `classification_report` is otherwise only imported by a later cell, so
# import it here to keep this cell runnable in top-to-bottom order.
from sklearn.metrics import classification_report

mtype = 'GRU'
cv_models_path = 'saved_models'
nfolds = 2
max_features = 40000
maxlen = 400
batch_size = 256
threshold = 0.3
random_state = 42

# Same splitter settings as the training run, so every fold's validation
# part matches the one its saved weights were evaluated on.
kf = StratifiedKFold(n_splits=nfolds, random_state=random_state, shuffle=True)

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"\n--- Fold {fold+1} ---")

    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Re-create the fold's tokenizer (fit on its training part) and
    # preprocess the validation texts the same way as during training.
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(X_train)
    X_val_seq = tokenizer.texts_to_sequences(X_val)
    X_val_pad = pad_sequences(X_val_seq, maxlen=maxlen)

    # Rebuild the architecture with the hyper-parameters used for training
    # (train_model's defaults) so the saved weights fit.
    model = dl_model(
        model_type=mtype,
        max_features=max_features,
        maxlen=maxlen,
        dropout_rate=0.25,
        embed_dim=50,
        rec_units=150,
        max_sent_len=100,
        max_sent_amount=4
    )

    # Weight files are numbered from 0, the printed fold header from 1.
    weight_file = os.path.join(cv_models_path, f"{mtype}_fold_{fold}.weights.h5")
    model.load_weights(weight_file)

    # Predict probabilities and apply the decision threshold.
    probs = model.predict(X_val_pad, batch_size=batch_size, verbose=1)
    preds = (probs >= threshold).astype(int).flatten()

    print(classification_report(y_val, preds, digits=2))

    # Release the model before the next fold, as train_model does.
    del model
    K.clear_session()


--- Fold 1 ---


  saveable.load_own_variables(weights_store.get(inner_path))


[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 943ms/step
              precision    recall  f1-score   support

       False       0.98      0.94      0.96     71384
        True       0.63      0.84      0.72      7976

    accuracy                           0.93     79360
   macro avg       0.81      0.89      0.84     79360
weighted avg       0.95      0.93      0.94     79360


--- Fold 2 ---


  saveable.load_own_variables(weights_store.get(inner_path))


[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 851ms/step
              precision    recall  f1-score   support

       False       0.97      0.97      0.97     71383
        True       0.74      0.77      0.75      7976

    accuracy                           0.95     79359
   macro avg       0.86      0.87      0.86     79359
weighted avg       0.95      0.95      0.95     79359



In [None]:
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    average_precision_score, roc_auc_score,
    classification_report
)

def train_model(X, y, mtype, cv, epochs,
                train, X_test=None, y_test=None, nfolds=None,
                rs=42, max_features=40000, maxlen=400, dropout_rate=0.25,
                rec_units=150, embed_dim=50, batch_size=256, max_sen_len=100,
                max_sent_amount=4, threshold=0.3, cv_models_path='saved_models'):
    """Train (or reload) a GRU-style model and print evaluation metrics.

    With ``cv=True`` a stratified ``nfolds``-fold cross-validation is run on
    ``X``/``y``; each fold fits its own tokenizer, trains or loads
    ``{mtype}_fold_{c}.weights.h5`` and prints a per-fold report followed by
    a mean ± std summary. With ``cv=False`` the model is fit on all of
    ``X``/``y`` (weights file ``{mtype}_full_model.weights.h5``) and
    evaluated on ``X_test``/``y_test``.

    Args:
        X: Array of raw texts (indexed with integer index arrays in CV mode).
        y: Binary labels aligned with ``X``.
        mtype: Model type forwarded to ``dl_model``.
        cv: Run cross-validation when truthy, else a train/test evaluation.
        epochs: Number of training epochs.
        train: Fit and save weights when truthy, else load saved weights.
        X_test, y_test: Held-out data; required when ``cv`` is falsy.
        nfolds: Number of folds; required when ``cv`` is truthy.
        rs: Random state for ``StratifiedKFold``.
        max_features, maxlen, dropout_rate, rec_units, embed_dim: Tokenizer
            and model hyper-parameters.
        batch_size: Batch size for fitting and predicting.
        max_sen_len, max_sent_amount: Forwarded to ``dl_model``.
        threshold: Probability cut-off for the positive class.
        cv_models_path: Directory holding the weight files.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``nfolds`` is missing in CV mode or ``X_test`` /
            ``y_test`` is missing in non-CV mode.
    """
    # Fail early with a clear message instead of an opaque error deep inside
    # scikit-learn or the tokenizer.
    if cv and nfolds is None:
        raise ValueError('nfolds must be provided when cv=True')
    if not cv and (X_test is None or y_test is None):
        raise ValueError('X_test and y_test must be provided when cv=False')

    if cv:
        kf = StratifiedKFold(n_splits=nfolds, random_state=rs, shuffle=True)
        auc = []
        roc = []
        fscore_ = []

        for c, (train_index, val_index) in enumerate(kf.split(X, y)):
            print(f'\n======== Fold {c} ========')

            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            # Fit the vocabulary on the training part of the fold only.
            tokenizer = Tokenizer(num_words=max_features)
            tokenizer.fit_on_texts(X_train)

            # === Preprocessing ===
            X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
            X_val = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=maxlen)

            # === Build model ===
            model = dl_model(
                model_type=mtype, max_features=max_features, maxlen=maxlen,
                dropout_rate=dropout_rate, embed_dim=embed_dim,
                rec_units=rec_units, max_sent_len=max_sen_len, max_sent_amount=max_sent_amount
            )

            print('Fitting')
            weights_path = f'{cv_models_path}/{mtype}_fold_{c}.weights.h5'
            if train:
                model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=1)
                model.save_weights(weights_path)
            else:
                model.load_weights(weights_path)

            probs = model.predict(X_val, batch_size=batch_size, verbose=1)
            preds = (probs >= threshold).astype(int)

            # === Metrics ===
            precision = precision_score(y_val, preds)
            recall = recall_score(y_val, preds)
            fscore = f1_score(y_val, preds)
            auc_f = average_precision_score(y_val, probs)
            roc_f = roc_auc_score(y_val, probs)

            auc.append(auc_f)
            roc.append(roc_f)
            fscore_.append(fscore)

            print(f'Precision: {precision:.3f} | Recall: {recall:.3f} | F1-score: {fscore:.3f}')
            print(f'Avg Precision (PR-C): {auc_f:.3f} | ROC AUC: {roc_f:.3f}')

            print('\nClassification Report:')
            print(classification_report(y_val, preds, digits=2))

            # Release the fold's model before the next one is built.
            del model
            K.clear_session()

        print('\n====== Cross-Validation Summary ======')
        print(f'Avg PR-C: {np.mean(auc):.3f} ± {np.std(auc):.3f}')
        print(f'Avg ROC AUC: {np.mean(roc):.3f} ± {np.std(roc):.3f}')
        print(f'Avg F1 Score: {np.mean(fscore_):.3f} ± {np.std(fscore_):.3f}')

    else:
        # ==== Non-CV Mode ====
        tokenizer = Tokenizer(num_words=max_features, oov_token='unknown')
        tokenizer.fit_on_texts(X)

        X_train = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=maxlen)
        X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

        y_train = np.array(y)
        y_test = np.array(y_test)

        model = dl_model(
            model_type=mtype, max_features=max_features, maxlen=maxlen,
            dropout_rate=dropout_rate, embed_dim=embed_dim,
            rec_units=rec_units, max_sent_len=max_sen_len, max_sent_amount=max_sent_amount
        )

        print('Fitting')
        weights_path = f'{cv_models_path}/{mtype}_full_model.weights.h5'
        if train:
            model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=1)
            model.save_weights(weights_path)
        else:
            model.load_weights(weights_path)

        probs = model.predict(X_test, batch_size=batch_size, verbose=1)
        preds = (probs >= threshold).astype(int)

        precision = precision_score(y_test, preds)
        recall = recall_score(y_test, preds)
        fscore = f1_score(y_test, preds)
        auc_f = average_precision_score(y_test, probs)
        roc_f = roc_auc_score(y_test, probs)

        print('_________________________________')
        print(f'PR-C is {round(auc_f,3)}')
        print(f'ROC AUC is {round(roc_f,3)}')
        print(f'F1-score is {round(fscore,3)}')
        print('_________________________________')

        print('\nClassification Report:')
        print(classification_report(y_test, preds, digits=2))


In [None]:
# Two-fold cross-validated GRU run; train=True fits each fold and saves
# its weights under 'saved_models'.
train_model(
    X, y,
    mtype='GRU', cv=True, epochs=4, train=True,
    nfolds=2, rs=42, cv_models_path='saved_models',
)

## BGRU

In [None]:
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    average_precision_score, roc_auc_score,
    classification_report
)

def train_model(X, y, mtype, cv, epochs,
                train, X_test=None, y_test=None, nfolds=None,
                rs=42, max_features=40000, maxlen=400, dropout_rate=0.25,
                rec_units=150, embed_dim=50, batch_size=256, max_sen_len=100,
                max_sent_amount=4, threshold=0.3, cv_models_path='saved_models'):
    """Train (or reload) a GRU-style model and print evaluation metrics.

    With ``cv=True`` a stratified ``nfolds``-fold cross-validation is run on
    ``X``/``y``; each fold fits its own tokenizer, trains or loads
    ``{mtype}_fold_{c}.weights.h5`` and prints a per-fold report followed by
    a mean ± std summary. With ``cv=False`` the model is fit on all of
    ``X``/``y`` (weights file ``{mtype}_full_model.weights.h5``) and
    evaluated on ``X_test``/``y_test``.

    Args:
        X: Array of raw texts (indexed with integer index arrays in CV mode).
        y: Binary labels aligned with ``X``.
        mtype: Model type forwarded to ``dl_model``.
        cv: Run cross-validation when truthy, else a train/test evaluation.
        epochs: Number of training epochs.
        train: Fit and save weights when truthy, else load saved weights.
        X_test, y_test: Held-out data; required when ``cv`` is falsy.
        nfolds: Number of folds; required when ``cv`` is truthy.
        rs: Random state for ``StratifiedKFold``.
        max_features, maxlen, dropout_rate, rec_units, embed_dim: Tokenizer
            and model hyper-parameters.
        batch_size: Batch size for fitting and predicting.
        max_sen_len, max_sent_amount: Forwarded to ``dl_model``.
        threshold: Probability cut-off for the positive class.
        cv_models_path: Directory holding the weight files.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``nfolds`` is missing in CV mode or ``X_test`` /
            ``y_test`` is missing in non-CV mode.
    """
    # Fail early with a clear message instead of an opaque error deep inside
    # scikit-learn or the tokenizer.
    if cv and nfolds is None:
        raise ValueError('nfolds must be provided when cv=True')
    if not cv and (X_test is None or y_test is None):
        raise ValueError('X_test and y_test must be provided when cv=False')

    if cv:
        kf = StratifiedKFold(n_splits=nfolds, random_state=rs, shuffle=True)
        auc = []
        roc = []
        fscore_ = []

        for c, (train_index, val_index) in enumerate(kf.split(X, y)):
            print(f'\n======== Fold {c} ========')

            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            # Fit the vocabulary on the training part of the fold only.
            tokenizer = Tokenizer(num_words=max_features)
            tokenizer.fit_on_texts(X_train)

            # === Preprocessing ===
            X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
            X_val = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=maxlen)

            # === Build model ===
            model = dl_model(
                model_type=mtype, max_features=max_features, maxlen=maxlen,
                dropout_rate=dropout_rate, embed_dim=embed_dim,
                rec_units=rec_units, max_sent_len=max_sen_len, max_sent_amount=max_sent_amount
            )

            print('Fitting')
            weights_path = f'{cv_models_path}/{mtype}_fold_{c}.weights.h5'
            if train:
                model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=1)
                model.save_weights(weights_path)
            else:
                model.load_weights(weights_path)

            probs = model.predict(X_val, batch_size=batch_size, verbose=1)
            preds = (probs >= threshold).astype(int)

            # === Metrics ===
            precision = precision_score(y_val, preds)
            recall = recall_score(y_val, preds)
            fscore = f1_score(y_val, preds)
            auc_f = average_precision_score(y_val, probs)
            roc_f = roc_auc_score(y_val, probs)

            auc.append(auc_f)
            roc.append(roc_f)
            fscore_.append(fscore)

            print(f'Precision: {precision:.3f} | Recall: {recall:.3f} | F1-score: {fscore:.3f}')
            print(f'Avg Precision (PR-C): {auc_f:.3f} | ROC AUC: {roc_f:.3f}')

            print('\nClassification Report:')
            print(classification_report(y_val, preds, digits=2))

            # Release the fold's model before the next one is built.
            del model
            K.clear_session()

        print('\n====== Cross-Validation Summary ======')
        print(f'Avg PR-C: {np.mean(auc):.3f} ± {np.std(auc):.3f}')
        print(f'Avg ROC AUC: {np.mean(roc):.3f} ± {np.std(roc):.3f}')
        print(f'Avg F1 Score: {np.mean(fscore_):.3f} ± {np.std(fscore_):.3f}')

    else:
        # ==== Non-CV Mode ====
        tokenizer = Tokenizer(num_words=max_features, oov_token='unknown')
        tokenizer.fit_on_texts(X)

        X_train = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=maxlen)
        X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

        y_train = np.array(y)
        y_test = np.array(y_test)

        model = dl_model(
            model_type=mtype, max_features=max_features, maxlen=maxlen,
            dropout_rate=dropout_rate, embed_dim=embed_dim,
            rec_units=rec_units, max_sent_len=max_sen_len, max_sent_amount=max_sent_amount
        )

        print('Fitting')
        weights_path = f'{cv_models_path}/{mtype}_full_model.weights.h5'
        if train:
            model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=1)
            model.save_weights(weights_path)
        else:
            model.load_weights(weights_path)

        probs = model.predict(X_test, batch_size=batch_size, verbose=1)
        preds = (probs >= threshold).astype(int)

        precision = precision_score(y_test, preds)
        recall = recall_score(y_test, preds)
        fscore = f1_score(y_test, preds)
        auc_f = average_precision_score(y_test, probs)
        roc_f = roc_auc_score(y_test, probs)

        print('_________________________________')
        print(f'PR-C is {round(auc_f,3)}')
        print(f'ROC AUC is {round(roc_f,3)}')
        print(f'F1-score is {round(fscore,3)}')
        print('_________________________________')

        print('\nClassification Report:')
        print(classification_report(y_test, preds, digits=2))


In [None]:
# Two-fold cross-validated run of the bidirectional GRU; train=True fits
# each fold and saves its weights under 'saved_models'.
train_model(
    X, y,
    mtype='BGRU', cv=True, epochs=4, train=True,
    nfolds=2, rs=42, cv_models_path='saved_models',
)




Fitting
Epoch 1/4
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1931s[0m 6s/step - acc: 0.9030 - loss: 0.3168
Epoch 2/4
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1949s[0m 6s/step - acc: 0.9455 - loss: 0.1593
Epoch 3/4
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1997s[0m 6s/step - acc: 0.9531 - loss: 0.1302
Epoch 4/4
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2005s[0m 6s/step - acc: 0.9591 - loss: 0.1133
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m456s[0m 1s/step
Precision: 0.786 | Recall: 0.714 | F1-score: 0.748
Avg Precision (PR-C): 0.824 | ROC AUC: 0.956

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.98      0.97     71384
        True       0.79      0.71      0.75      7976

    accuracy                           0.95     79360
   macro avg       0.88      0.85      0.86     79360
weighted avg       0.95      0.95      0.95     79360


Fit

In [None]:
import os

# Show which weight files currently exist in the checkpoint directory.
saved_model_files = os.listdir("saved_models")
print(saved_model_files)


['GRU_fold_0.weights.h5', 'GRU_fold_1.weights.h5']


In [None]:
# The six per-category toxicity flags that are collapsed into one binary
# target column, ``mal``.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for frame in (train, test_labelled):
    # A comment counts as malicious when at least one category flag is set.
    frame['mal'] = frame[label_columns].sum(axis=1) >= 1
    frame.drop(label_columns, axis=1, inplace=True)

# Replace missing comments with the placeholder "empty". The result is
# assigned back to the column: calling ``fillna(..., inplace=True)`` on the
# attribute-accessed column operates on an intermediate object, which pandas
# warns about and which stops updating the frame in pandas 3.0.
for frame in (train, test_labelled, test_unlabelled):
    frame['comment_text'] = frame['comment_text'].fillna("empty")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train.comment_text.fillna("empty", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_labelled.comment_text.fillna("empty", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

In [None]:
# Re-split both labelled sources with the same stratified 71/29 split and
# pool the pieces, so the training and held-out sets get a similar label
# distribution.
from sklearn.model_selection import train_test_split

rs = 42

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    train.drop('mal', axis=1),
    train.mal,
    test_size=0.29,
    random_state=rs,
    stratify=train.mal,
)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    test_labelled.drop('mal', axis=1),
    test_labelled.mal,
    test_size=0.29,
    random_state=rs,
    stratify=test_labelled.mal,
)

# Training pool: the larger part of each source.
X = np.concatenate([X_train1.comment_text, X_train2.comment_text])
y = np.concatenate([y_train1, y_train2])

# Held-out pool: the remaining 29% of each source.
X_test = np.concatenate([X_test1.comment_text, X_test2.comment_text])
y_test = np.concatenate([y_test1, y_test2])

In [None]:
pip install transformers



In [None]:
pip install tensorflow



In [None]:
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
import traitlets
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer
from sklearn.metrics import roc_auc_score
from tensorflow.keras import backend as K
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import precision_score, recall_score, f1_score

warnings.simplefilter("ignore")

In [None]:
#!pip install -U tensorflow --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m644.9/644.9 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.5/24.5 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m114.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m108.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.5/72.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Use whichever tf.distribute strategy is currently active; the model cells
# below create their variables inside ``strategy.scope()``.
strategy = tf.distribute.get_strategy()

In [None]:
import tensorflow as tf

# Report the TensorFlow release and the GPU devices it can see.
tf_version = tf.__version__
print("TensorFlow version:", tf_version)
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU is available:", gpu_devices)


TensorFlow version: 2.18.0
GPU is available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
from transformers import TFBertModel, BertTokenizer

# Load the pretrained BERT encoder and wrap it in the classifier head, both
# created inside the distribution-strategy scope.
# NOTE(review): ``build_model`` is defined in a later cell, so that cell has to
# be executed before this one. The output recorded for this cell ends in a
# ValueError raised by the encoder call (a KerasTensor is rejected for
# ``attention_mask``) under the TensorFlow 2.18 runtime reported above.
with strategy.scope():
    transformer_layer = transformers.TFBertModel.from_pretrained('bert-base-uncased')
    model = build_model(transformer=transformer_layer, max_len=400)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

ValueError: Exception encountered when calling layer 'tf_bert_model_3' (type TFBertModel).

Data of type <class 'keras.src.backend.common.keras_tensor.KerasTensor'> is not allowed only (<class 'tensorflow.python.framework.tensor.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for attention_mask.

Call arguments received by layer 'tf_bert_model_3' (type TFBertModel):
  • input_ids=<KerasTensor shape=(None, 400), dtype=int32, sparse=False, name=input_ids>
  • attention_mask=<KerasTensor shape=(None, 400), dtype=int32, sparse=False, name=attention_mask>
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False

In [None]:
# def build_model(transformer, max_len=512):
#     input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
#     sequence_output = transformer(input_word_ids)[0]
#     cls_token = sequence_output[:, 0, :]
#     out = tf.keras.layers.Dense(1, activation='sigmoid')(cls_token)

#     model = tf.keras.Model(inputs=input_word_ids, outputs=out)
#     model.compile(loss='binary_crossentropy',
#                   optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
#                   metrics=[tf.keras.metrics.AUC()])
#     return model
def build_model(transformer, max_len=512):
    """Attach a single-unit sigmoid head to a BERT-style encoder.

    Args:
        transformer: Callable encoder that accepts ``input_ids`` and
            ``attention_mask`` keyword arguments; element 0 of its result is
            sliced as ``[:, 0, :]`` to take the first-position vector.
        max_len: Fixed length of both integer input sequences.

    Returns:
        A compiled Keras model taking ``[input_ids, attention_mask]`` and
        producing one sigmoid output per example (binary cross-entropy loss,
        Adam at 3e-5, AUC metric).
    """
    keras = tf.keras

    # Symbolic int32 inputs; the layer names match the keys used when feeding
    # dict-structured datasets.
    token_ids = keras.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    mask = keras.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    # NOTE(review): the notebook output recorded for the cells calling this
    # function shows this encoder call raising a ValueError about KerasTensor
    # inputs under TensorFlow 2.18 - confirm the Keras/transformers pairing.
    encoder_outputs = transformer(input_ids=token_ids, attention_mask=mask)
    sequence_states = encoder_outputs[0]
    first_token_state = sequence_states[:, 0, :]  # CLS position

    probability = keras.layers.Dense(1, activation='sigmoid')(first_token_state)

    classifier = keras.models.Model(inputs=[token_ids, mask], outputs=probability)
    classifier.compile(
        optimizer=keras.optimizers.Adam(learning_rate=3e-5),
        loss='binary_crossentropy',
        metrics=[keras.metrics.AUC()],
    )
    return classifier



In [None]:
# Fetch the reference BERT tokenizer and write its files to the working
# directory so the vocabulary can be read back from disk.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.save_pretrained('.')

# Rebuild the tokenizer with the `tokenizers` WordPiece implementation from
# the saved vocabulary file.
vocab_file = "vocab.txt"
fast_tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=True)
fast_tokenizer

Tokenizer(vocabulary_size=30522, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)

In [None]:
# def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
#     tokenizer.enable_truncation(max_length=maxlen)
#     tokenizer.enable_padding(length=maxlen)
#     all_ids = []

#     for i in tqdm(range(0, len(texts), chunk_size)):
#         text_chunk = texts[i:i+chunk_size].tolist()
#         encs = tokenizer.encode_batch(text_chunk)
#         all_ids.extend([enc.ids for enc in encs])

#     return np.array(all_ids)
# def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
#     tokenizer.enable_truncation(max_length=maxlen)
#     tokenizer.enable_padding(length=maxlen)

#     input_ids = []
#     attention_masks = []

#     for i in tqdm(range(0, len(texts), chunk_size)):
#         text_chunk = texts[i:i+chunk_size].tolist()
#         encs = tokenizer.encode_batch(text_chunk)
#         input_ids.extend([enc.ids for enc in encs])
#         attention_masks.extend([enc.attention_mask for enc in encs])

#     return np.array(input_ids), np.array(attention_masks)

def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Tokenize ``texts`` chunk by chunk into fixed-length ids and masks.

    Args:
        texts: Sliceable collection of strings. NumPy arrays and pandas
            Series are converted with ``tolist()``; plain lists and tuples
            are accepted as well.
        tokenizer: Object exposing ``enable_truncation``, ``enable_padding``
            and ``encode_batch`` (for example ``BertWordPieceTokenizer``).
            Its truncation and padding settings are changed in place.
        chunk_size: Number of texts handed to each ``encode_batch`` call.
        maxlen: Length passed to both the truncation and padding settings.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of NumPy arrays built from the
        ``ids`` and ``attention_mask`` of every encoding, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)

    input_ids = []
    attention_masks = []

    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[start:start + chunk_size]
        # Arrays and Series expose ``tolist``; plain sequences do not, so
        # fall back to ``list`` instead of failing with AttributeError.
        chunk = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)
        for enc in tokenizer.encode_batch(chunk):
            input_ids.append(enc.ids)
            attention_masks.append(enc.attention_mask)

    return np.array(input_ids), np.array(attention_masks)


In [None]:
# Five stratified, shuffled folds with a fixed seed.
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Per-fold score accumulators: average precision, ROC AUC and F1.
auc = []
roc = []
fsc = []

TRAIN = True  # or False if loading weights
cv_models_path = "./bert_models"

In [None]:
import os

# Directory that holds the per-fold BERT weight files; created on first use
# and left untouched when it already exists.
cv_models_path = "./bert_models"
os.makedirs(cv_models_path, exist_ok=True)

In [None]:
# Stratified cross-validation of the BERT classifier. For every fold the
# texts are encoded, a fresh model is built, trained (or its saved weights
# are loaded), and average precision, ROC AUC and F1 are recorded.
import gc  # imported here so the per-fold cleanup does not depend on an earlier cell

# Fold-invariant settings.
bert_max_len = 400      # token length used for encoding and for the model inputs
bert_batch_size = 64
strategy = tf.distribute.get_strategy()

for c, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f' fold {c}')
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    X_train_ids, X_train_masks = fast_encode(X_train, fast_tokenizer, maxlen=bert_max_len)
    X_val_ids, X_val_masks = fast_encode(X_val, fast_tokenizer, maxlen=bert_max_len)

    # Convert to TensorFlow tensors
    X_train_ids = tf.convert_to_tensor(X_train_ids, dtype=tf.int32)
    X_train_masks = tf.convert_to_tensor(X_train_masks, dtype=tf.int32)
    X_val_ids = tf.convert_to_tensor(X_val_ids, dtype=tf.int32)
    X_val_masks = tf.convert_to_tensor(X_val_masks, dtype=tf.int32)

    # Dict keys match the names of the model's two Input layers.
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices(({
            "input_ids": X_train_ids,
            "attention_mask": X_train_masks
        }, y_train))
        .shuffle(2048)
        .batch(bert_batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices(({
            "input_ids": X_val_ids,
            "attention_mask": X_val_masks
        }, y_val))
        .batch(bert_batch_size)
        .cache()
        .prefetch(tf.data.AUTOTUNE)
    )

    # A fresh encoder and head per fold so no weights leak between folds.
    with strategy.scope():
        transformer_layer = transformers.TFBertModel.from_pretrained('bert-base-uncased')
        model = build_model(transformer=transformer_layer, max_len=bert_max_len)

    # To freeze the encoder, set model.layers[1].trainable = False here.
    model.summary()

    print('Fitting')

    # Keras 3 only accepts weight files whose name ends in ".weights.h5";
    # this also matches the naming used for the GRU checkpoints.
    weights_path = f'{cv_models_path}/BERT_full_fold_{c}.weights.h5'
    if TRAIN:
        history = model.fit(train_dataset, epochs=1)
        model.save_weights(weights_path)
    else:
        model.load_weights(weights_path)

    # The dataset is already batched, so no batch_size is passed to predict.
    # The (n, 1) sigmoid output is flattened to match the 1-D labels.
    probs = model.predict(valid_dataset, verbose=1).ravel()
    auc_f = average_precision_score(y_val, probs)
    auc.append(auc_f)
    roc_f = roc_auc_score(y_val, probs)
    roc.append(roc_f)
    print(f' average precision {auc_f}')
    print(f' roc auc {roc_f}')

    # Hard class predictions at a low cut-off.
    threshold = 0.1
    probs_class = (probs >= threshold).astype(int)
    precision = precision_score(y_val, probs_class)
    recall = recall_score(y_val, probs_class)
    fscore = f1_score(y_val, probs_class)

    fsc.append(fscore)
    print(f' precision {precision}')
    print(f' recall {recall}')
    print(f' fscore {fscore}')

    # Release the model and the Keras graph state before the next fold.
    del model
    K.clear_session()
    gc.collect()

 fold 0


  0%|          | 0/496 [00:00<?, ?it/s]

  0%|          | 0/124 [00:00<?, ?it/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

ValueError: Exception encountered when calling layer 'tf_bert_model_6' (type TFBertModel).

Data of type <class 'keras.src.backend.common.keras_tensor.KerasTensor'> is not allowed only (<class 'tensorflow.python.framework.tensor.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for attention_mask.

Call arguments received by layer 'tf_bert_model_6' (type TFBertModel):
  • input_ids=<KerasTensor shape=(None, 400), dtype=int32, sparse=False, name=input_ids>
  • attention_mask=<KerasTensor shape=(None, 400), dtype=int32, sparse=False, name=attention_mask>
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False

In [None]:
from datasets import Dataset
import pandas as pd

# Pair every text with its label, one frame per split.
df_train = pd.DataFrame(dict(text=X, label=y))
df_test = pd.DataFrame(dict(text=X_test, label=y_test))

# Wrap both frames as Hugging Face Datasets.
train_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (df_train, df_test)
)

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(example):
    """Tokenize the ``text`` field, padding or truncating to 512 tokens."""
    encoding_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(example["text"], **encoding_options)


# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/158719 [00:00<?, ? examples/s]

Map:   0%|          | 0/64830 [00:00<?, ? examples/s]

In [None]:
# The raw text is not needed once the token columns exist.
raw_text_columns = ['text']
train_dataset = train_dataset.remove_columns(raw_text_columns)
test_dataset = test_dataset.remove_columns(raw_text_columns)


In [None]:
from datasets import Value, Sequence, Features

# Target schema: each token-level column is a sequence of int64 values and
# the label is a single int64.
token_columns = ('input_ids', 'attention_mask', 'token_type_ids')
schema = {column: Sequence(Value(dtype='int64')) for column in token_columns}
schema['label'] = Value(dtype='int64')
features = Features(schema)

train_dataset = train_dataset.cast(features)
test_dataset = test_dataset.cast(features)


Casting the dataset:   0%|          | 0/158719 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/64830 [00:00<?, ? examples/s]

In [None]:
# Return the model inputs and the label as PyTorch tensors when rows are read.
torch_columns = ['input_ids', 'attention_mask', 'token_type_ids', 'label']
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=torch_columns)


In [None]:
# Cast the label column of both DataFrames to integers.
# NOTE(review): ``train_dataset`` and ``test_dataset`` were already created
# from these frames in an earlier cell, so this cast only changes the
# DataFrames, not the existing Datasets - confirm whether it is still needed.
df_train["label"] = df_train["label"].astype(int)
df_test["label"] = df_test["label"].astype(int)


In [None]:
# Inspect the column schema after the cast.
print(train_dataset.features)

{'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'label': Value(dtype='int64', id=None)}


In [None]:
# Read one formatted row as a smoke test. In the recorded run this raised the
# NumPy 2 "Unable to avoid copy" ValueError shown in the output below.
print(train_dataset[0])


ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.

In [None]:
from transformers import BertForSequenceClassification

# Two output logits (one per class) rather than a single score, so that
# predictions can be taken with an argmax over the logits.
checkpoint_name = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: 3 epochs, batches of 8 per device, learning
# rate 2e-5 with weight decay 0.01; checkpoints go to ./results and logs to
# ./logs.
training_options = dict(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
)
training_args = TrainingArguments(**training_options)



from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall from logits and labels.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``
        (precision/recall/F1 use binary averaging).
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    prf = precision_recall_fscore_support(labels, predicted, average='binary')
    accuracy = accuracy_score(labels, predicted)
    return dict(accuracy=accuracy, f1=prf[2], precision=prf[0], recall=prf[1])

In [None]:
# Check which transformers release is installed
# (upgrade with: pip install --upgrade transformers).
import transformers
print(transformers.__version__)

4.51.3


In [None]:
# Wire the model, arguments, both dataset splits and the metric function into
# a Trainer, then start fine-tuning.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

trainer.train()



ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.

In [None]:
# Evaluate on the eval_dataset the Trainer was constructed with and show the
# resulting metrics dict.
metrics = trainer.evaluate()
print(metrics)


In [None]:
# 1. Data prep: pair texts with labels and wrap each split as a HF Dataset.
import pandas as pd
from datasets import Dataset

df_train = pd.DataFrame({'text': X, 'label': y})
df_test = pd.DataFrame({'text': X_test, 'label': y_test})

train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# 2. Tokenization
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(example):
    """Tokenize the ``text`` field, padding or truncating to 512 tokens."""
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)


train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# 3. Remove the raw text column (only the token columns and label remain).
train_dataset = train_dataset.remove_columns(['text'])
test_dataset = test_dataset.remove_columns(['text'])

# 4. Cast to explicit int64 column types.
from datasets import Features, Sequence, Value

features = Features({
    'input_ids': Sequence(Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'label': Value(dtype='int64')
})

train_dataset = train_dataset.cast(features)
test_dataset = test_dataset.cast(features)

# 5. Tensor format. The model loaded below is the PyTorch
#    BertForSequenceClassification and is trained with the HF Trainer, so the
#    datasets must yield torch tensors rather than TensorFlow ones.
model_columns = ['input_ids', 'attention_mask', 'token_type_ids', 'label']
train_dataset.set_format(type='torch', columns=model_columns)
test_dataset.set_format(type='torch', columns=model_columns)

# 6. Model
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2  # Two classes: hate / not hate
)

# 7. Metrics: ``compute_metrics`` is defined in the next cell.



Map:   0%|          | 0/158719 [00:00<?, ? examples/s]

Map:   0%|          | 0/64830 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/158719 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/64830 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Score argmax predictions against integer-cast labels.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``labels`` must support
            ``astype``; the predicted class is the argmax of ``logits``
            along axis 1.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``
        (precision/recall/F1 use binary averaging).
    """
    raw_scores, raw_labels = eval_pred
    # Work on int64 labels regardless of the dtype that was passed in.
    targets = raw_labels.astype(np.int64)
    predicted = np.argmax(raw_scores, axis=1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        targets, predicted, average='binary'
    )
    accuracy = accuracy_score(targets, predicted)
    return dict(accuracy=accuracy, f1=f1, precision=precision, recall=recall)

In [None]:
# 8. Training configuration
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): per the TrainingArguments documentation, do_eval on its
    # own does not schedule evaluation during training; set ``eval_strategy``
    # if periodic evaluation is wanted. Evaluation can still be run
    # explicitly with ``trainer.evaluate()``.
    do_eval=True,
    save_steps=500,  # checkpoint frequency, in optimizer steps
    logging_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=8,  # library default; 16 was also tried
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
)

# 9. Trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # ``tokenizer=`` is deprecated in the installed transformers release
    # (4.51.3); ``processing_class`` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# 10. Train
trainer.train()

  trainer = Trainer(


ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.

In [None]:
import transformers
import inspect
from transformers import TrainingArguments

# Print where TrainingArguments is actually loaded from, to rule out a stale
# or shadowed transformers installation.
diagnostics = (
    ("Transformers version:", lambda: transformers.__version__),
    ("TrainingArguments module:", lambda: TrainingArguments.__module__),
    ("TrainingArguments file:", lambda: inspect.getfile(TrainingArguments)),
)
for label, read_value in diagnostics:
    print(label, read_value())


Transformers version: 4.51.3
TrainingArguments module: transformers.training_args
TrainingArguments file: /usr/local/lib/python3.11/dist-packages/transformers/training_args.py


In [None]:
# !pip uninstall transformers -y
# !pip install transformers==4.51.3 --no-cache-dir --force-reinstall


In [None]:
# End-to-end fine-tuning of BERT for the binary "mal" label with the
# Hugging Face Trainer (PyTorch).

# 1. Data prep: pair texts with labels and wrap each split as a HF Dataset.
import pandas as pd
from datasets import Dataset

df_train = pd.DataFrame({'text': X, 'label': y})
df_test = pd.DataFrame({'text': X_test, 'label': y_test})

train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# 2. Tokenization
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(example):
    """Tokenize the ``text`` field, padding or truncating to 512 tokens."""
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)


train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# 3. Remove the raw text column (only the token columns and label remain).
train_dataset = train_dataset.remove_columns(['text'])
test_dataset = test_dataset.remove_columns(['text'])

# 4. Cast to explicit int64 column types.
from datasets import Features, Sequence, Value

features = Features({
    'input_ids': Sequence(Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'label': Value(dtype='int64')
})

train_dataset = train_dataset.cast(features)
test_dataset = test_dataset.cast(features)

# 5. Return PyTorch tensors for the model inputs and the label.
model_columns = ['input_ids', 'attention_mask', 'token_type_ids', 'label']
train_dataset.set_format(type='torch', columns=model_columns)
test_dataset.set_format(type='torch', columns=model_columns)

# 6. Model
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2  # Two classes: hate / not hate
)

# 7. Metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall from logits and labels.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``labels`` must support
            ``astype``; the predicted class is the argmax of ``logits``
            along axis 1.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``
        (precision/recall/F1 use binary averaging).
    """
    logits, labels = eval_pred
    # Ensure labels are also in the correct type
    labels = labels.astype(np.int64)
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


# 8. Training configuration
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): per the TrainingArguments documentation, do_eval on its
    # own does not schedule evaluation during training; set ``eval_strategy``
    # if periodic evaluation is wanted. Evaluation can still be run
    # explicitly with ``trainer.evaluate()``.
    do_eval=True,
    save_steps=500,  # checkpoint frequency, in optimizer steps
    logging_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=8,  # library default; 16 was also tried
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
)

# 9. Trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # ``tokenizer=`` is deprecated in the installed transformers release
    # (4.51.3); ``processing_class`` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# 10. Train
trainer.train()

Map:   0%|          | 0/158719 [00:00<?, ? examples/s]

Map:   0%|          | 0/64830 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/158719 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/64830 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.