In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# The FCC environment preloads the dataset as `sms_data`. On a fresh kernel
# where that name is absent, fall back to reading the tab-separated file so
# the notebook still runs top to bottom (Restart Kernel -> Run All).
if 'sms_data' not in globals():
    sms_data = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'message'])

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical across runs.
train_data, test_data = train_test_split(sms_data, test_size=0.2, random_state=42)

In [None]:
# --- Labels -----------------------------------------------------------------
# The encoder is fitted on the training labels and then reused, unchanged,
# for the test labels.
le = LabelEncoder()
train_labels = le.fit_transform(train_data['label'])
test_labels = le.transform(test_data['label'])

# --- Text -------------------------------------------------------------------
# Every sequence is padded/truncated to this many tokens.
max_len = 50

# The vocabulary is built from the training messages only; words outside it
# are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_data['message'])

# Convert both splits to integer sequences with the fitted tokenizer.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split['message'])
    for split in (train_data, test_data)
)

# Shared padding options: pad and truncate at the end of each sequence.
_pad_kwargs = dict(maxlen=max_len, padding='post', truncating='post')
train_padded = pad_sequences(train_sequences, **_pad_kwargs)
test_padded = pad_sequences(test_sequences, **_pad_kwargs)

In [None]:
# Small averaged-embedding classifier: embed each token, average the
# embeddings over the sequence, then a dense hidden layer and a single
# sigmoid unit for the binary output.
model = Sequential([
    # input_dim=5000 mirrors the Tokenizer's num_words cap. The deprecated
    # `input_length` argument is omitted: Keras 3 warns about it, and
    # GlobalAveragePooling1D does not need a fixed sequence length.
    Embedding(input_dim=5000, output_dim=16),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): validation_data is the test split, so the per-epoch val_*
# metrics are measured on the same rows as any later test evaluation.
# Consider carving a separate validation set out of the training data.
model.fit(train_padded, train_labels, epochs=10, validation_data=(test_padded, test_labels), verbose=2)

In [None]:
def predict_message(message):
    """Classify a single SMS message as spam or ham.

    The message is tokenized with the notebook's fitted ``tokenizer`` and
    padded/truncated to ``max_len`` tokens (post-padding, post-truncation)
    before being passed to ``model``.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    list
        ``[prob, label]`` where ``prob`` is the model output converted to a
        Python float and ``label`` is ``'spam'`` when ``prob >= 0.5``,
        otherwise ``'ham'``.
    """
    seq = tokenizer.texts_to_sequences([message])
    padded = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    # verbose=0 suppresses the progress bar Keras otherwise prints on every
    # predict call, which clutters the output when classifying many messages.
    prob = float(model.predict(padded, verbose=0)[0][0])
    label = 'spam' if prob >= 0.5 else 'ham'
    return [prob, label]

In [None]:
# Try the classifier on a sample message; the cell displays the returned
# [prob, label] list.
predict_message('Congratulations! You won a free ticket!')