In [1]:
# General
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP & Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# TensorFlow & Keras (RNN)
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Hugging Face (BERT)
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import create_optimizer


In [2]:
# Example dataset
data = {
    "text": [
        "I hate this person",
        "Love everyone",
        "This is awful",
        "Such a nice day",
        "You are stupid",
        "Peace and love",
        "I despise this",
        "Amazing work",
        "Horrible experience",
        "Feeling happy today"
    ],
    "label": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # 1=hate, 0=not hate
}

df = pd.DataFrame(data)
print(df.head())

# Encode labels (if needed)
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])


                 text  label
0  I hate this person      1
1       Love everyone      0
2       This is awful      1
3     Such a nice day      0
4      You are stupid      1


In [3]:
# Tokenization
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, padding='post', maxlen=10)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, df['label'], test_size=0.2, random_state=42
)


In [4]:
vocab_size = 5000
embedding_dim = 64
max_len = 10

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train
history = model.fit(X_train, y_train, epochs=10, batch_size=2, validation_data=(X_test, y_test))




Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 72ms/step - accuracy: 0.4500 - loss: 0.6943 - val_accuracy: 0.5000 - val_loss: 0.6938
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5000 - loss: 0.6925 - val_accuracy: 0.5000 - val_loss: 0.6934
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5500 - loss: 0.6972 - val_accuracy: 0.5000 - val_loss: 0.6932
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.3167 - loss: 0.7043 - val_accuracy: 0.5000 - val_loss: 0.6929
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 1.0000 - loss: 0.6820 - val_accuracy: 0.5000 - val_loss: 0.6929
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5500 - loss: 0.6898 - val_accuracy: 0.5000 - val_loss: 0.6927
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━

In [5]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("F1 Score:", f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
F1 Score: 0.0
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

[[1 0]
 [1 0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
# If dataset is imbalanced, use class weights
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))

# Add Early Stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


# Train RNN with class weights and early stopping
history = model.fit(X_train, y_train, epochs=10, batch_size=2, validation_data=(X_test, y_test), class_weight=class_weights_dict, callbacks=[early_stopping])

Epoch 1/10


ValueError: Cannot take the length of shape with unknown rank.

In [11]:
# Load tokenizer & model
bert_model_name = "bert-base-uncased"
tokenizer_bert = BertTokenizer.from_pretrained(bert_model_name)
model_bert = TFBertForSequenceClassification.from_pretrained(bert_model_name, num_labels=2, from_pt=True)

# Encode texts
X = list(df['text'])
y = df['label'].values
inputs = tokenizer_bert(X, return_tensors="tf", padding=True, truncation=True, max_length=32)

# Train-test split
train_size = int(0.8*len(df))
X_train_inputs = {k:v[:train_size] for k,v in inputs.items()}
X_test_inputs = {k:v[train_size:] for k,v in inputs.items()}
y_train, y_test = y[:train_size], y[train_size:]

# Compile BERT
optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=10)
model_bert.compile(optimizer=optimizer, loss=model_bert.compute_loss, metrics=['accuracy'])

# Fine-tune
model_bert.fit(X_train_inputs, y_train, epochs=2, batch_size=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2


AttributeError: in user code:

    File "/usr/local/lib/python3.12/dist-packages/tf_keras/src/engine/training.py", line 1398, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_tf_utils.py", line 1572, in compute_loss  *
        return super().compute_loss(*args, **kwargs)
    File "/usr/local/lib/python3.12/dist-packages/tf_keras/src/engine/training.py", line 1206, in compute_loss  **
        return self.compiled_loss(
    File "/usr/local/lib/python3.12/dist-packages/tf_keras/src/engine/compile_utils.py", line 275, in __call__
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    File "/usr/local/lib/python3.12/dist-packages/tf_keras/src/engine/compile_utils.py", line 854, in match_dtype_and_rank
        if (y_t.dtype.is_floating and y_p.dtype.is_floating) or (

    AttributeError: 'NoneType' object has no attribute 'dtype'


In [10]:
y_pred_bert = np.argmax(model_bert.predict(X_test_inputs).logits, axis=1)
print("F1 Score:", f1_score(y_test, y_pred_bert))
print(classification_report(y_test, y_pred_bert))


F1 Score: 0.0
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
