In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Tunable settings for this cell.
LABEL_TO_ID = {'ham': 0, 'spam': 1}
TEST_FRACTION = 0.2
SPLIT_RANDOM_STATE = 42
PRETRAINED_NAME = 'bert-base-uncased'

# Step 1: Sample Data
# Two parallel lists: data['label'][i] is the class of data['text'][i].
data = {
    'label': ['ham', 'spam', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham',
              'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham'],
    'text': [
        "Hi there, how are you?",
        "WINNER!! You've won a free ticket. Text WIN to 12345",
        "Are we still on for dinner tonight?",
        "Congratulations! You've been selected for a prize.",
        "Don't forget our meeting tomorrow.",
        "You have been chosen to receive a free gift!",
        "Can you send me the report?",
        "Call me when you get this.",
        "Get your free ringtone now!",
        "Looking forward to our trip.",
        "Free entry in 2 a weekly competition! Text WIN to 80086 now",
        "Thanks for your help today.",
        "Happy birthday! Have a great one.",
        "You've won $1000 cash!",
        "See you at the game later.",
        "Claim your reward at spamoffers.com",
        "Lunch at 1 PM works for me.",
        "Can we talk later?",
        "URGENT! Act now to claim your prize.",
        "Hey, let’s catch up soon!"
    ]
}

# Build the frame and encode labels in one expression so re-running the cell
# never maps an already-encoded column (which would turn every label into NaN).
df = pd.DataFrame(data).assign(label=lambda frame: frame['label'].map(LABEL_TO_ID))
assert df['label'].notna().all(), "found a label that is not in LABEL_TO_ID"

# Step 2: Train/Test Split before tokenizing
# With only 20 rows (12 ham / 8 spam) a plain random 4-row test set can easily
# end up with a single class; stratifying keeps the class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=TEST_FRACTION,
    random_state=SPLIT_RANDOM_STATE,
    stratify=df['label'],
)
assert len(X_train) + len(X_test) == len(df)

# Step 3: Tokenize using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)

# padding=True pads each split to the longest sequence within that split.
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, return_tensors="tf")
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, return_tensors="tf")




In [13]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification

# Hyperparameters gathered in one place so they are easy to find and tune.
TRAIN_SEED = 42
BATCH_SIZE = 4
LEARNING_RATE = 2e-5
NUM_EPOCHS = 2

# The classification head is newly initialised (not loaded from the checkpoint),
# so seed TensorFlow's global RNG to make runs from a fresh kernel as repeatable
# as possible.
tf.random.set_seed(TRAIN_SEED)

# Step 4: Create TensorFlow dataset
# Shuffle the training examples (reshuffled every epoch by default) so the
# model does not see the exact same batches in the same order each epoch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), list(y_train)))
    .shuffle(buffer_size=len(y_train), seed=TRAIN_SEED)
    .batch(BATCH_SIZE)
)
# The evaluation split is left in its original order.
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), list(y_test))).batch(BATCH_SIZE)

# Step 5: Load BERT model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Step 6: Compile
# The model outputs raw logits, hence from_logits=True on the loss.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Step 7: Train
model.fit(train_dataset, validation_data=test_dataset, epochs=NUM_EPOCHS)


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1f43eabf7f0>

In [15]:
def predict_spam(text):
    """Classify a message as spam or normal with the fine-tuned model.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: The message to classify.

    Returns:
        "its a Spam message " when the highest-scoring class is 1,
        otherwise "its a normal message".
    """
    encoded = tokenizer(text, truncation=True, padding=True, return_tensors="tf")
    logits = model(**encoded).logits
    # Pick the class with the largest logit for the first row of the batch.
    predicted_class = tf.argmax(logits, axis=1).numpy()[0]
    if predicted_class == 1:
        return "its a Spam message "
    return "its a normal message"

# Try it out on one spam-like and one ordinary message
for sample_message in ("You won a lottery! Claim now", "Hey, just checking in."):
    print(predict_spam(sample_message))


its a Spam message 
its a normal message
