In [None]:
# Third-party imports, gathered in one place so a fresh kernel can run top to bottom.
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers
# Tokenizer and pad_sequences are Keras text-preprocessing utilities; scikit-learn
# does not provide them, so importing them from sklearn raises ImportError.
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the dataset.
# NOTE: 'path_to_your_dataset.csv' is a placeholder -- point it at the real CSV file.
# The following cells read the 'label' and 'message' columns from this frame.
url = 'path_to_your_dataset.csv'
data = pd.read_csv(url)

# Display the first few rows of the dataset
data.head()


In [None]:
# Encode the string labels as integers. LabelEncoder numbers classes in sorted
# order, so (assuming the column holds 'ham'/'spam') this yields ham=0, spam=1.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Features are the raw message texts; targets are the encoded labels
X, y = data['message'], data['label']

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Build the vocabulary from the training texts only
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

# Turn every message into a list of word indices
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad sequences to a uniform length of max_length so they can be batched
max_length = 100
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (X_train_sequences, X_test_sequences)
)


In [None]:
# Seed TensorFlow's global RNG so weight initialisation is repeatable between runs
# (the train/test split is already pinned with random_state=42).
tf.random.set_seed(42)

# Define the model: embedding -> average over the sequence -> dense head
model = keras.Sequential([
    # input_dim must cover every index the tokenizer can emit, so it is read from
    # the tokenizer instead of repeating the literal vocabulary size.
    # `input_length` is omitted: it is deprecated in recent Keras releases and the
    # pooling layer below does not need a fixed sequence length.
    layers.Embedding(input_dim=tokenizer.num_words, output_dim=128),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation='relu'),
    # Single sigmoid unit: the output is the predicted probability of class 1
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; the held-out split is passed as validation data, so its loss and
# accuracy are reported after every epoch.
history = model.fit(
    X_train_padded,
    y_train,
    epochs=5,
    validation_data=(X_test_padded, y_test),
    verbose=1,
)


In [None]:
def predict_message(message):
    """Classify a single text message as 'ham' or 'spam'.

    The message is encoded with the notebook-level ``tokenizer``, padded to
    ``max_length`` and scored by the notebook-level ``model``.

    Args:
        message: The raw message text to classify.

    Returns:
        A two-element list ``[likelihood, classification]`` where ``likelihood``
        is the model's output as a plain Python float and ``classification`` is
        ``'spam'`` when the likelihood is above 0.5, otherwise ``'ham'``.
    """
    # Tokenize and pad the input message (wrapped in a list: the tokenizer
    # expects a batch of texts)
    message_seq = tokenizer.texts_to_sequences([message])
    message_padded = pad_sequences(message_seq, maxlen=max_length)

    # verbose=0 keeps Keras from printing a progress bar for every single-message call
    prediction = model.predict(message_padded, verbose=0)

    # The output holds one row with one value; convert it to a built-in float so
    # callers get a plain number rather than a NumPy scalar.
    likelihood = float(prediction[0][0])
    classification = 'spam' if likelihood > 0.5 else 'ham'

    return [likelihood, classification]


In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven hand-labelled messages.

  Each message is passed to predict_message and the second element of the
  result is compared with the expected label. A pass message is printed only
  if every prediction matches; otherwise a "keep trying" message is printed.
  Nothing is returned.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected labels, in the same order as test_messages
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # prediction[1] is the 'ham'/'spam' label; the likelihood in prediction[0] is not checked
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()

# Try the classifier on a couple of ad-hoc example messages
for sample_message in (
    "Congratulations! You've won a $1000 gift card.",
    "Hey, are we still on for the meeting tomorrow?",
):
    print(predict_message(sample_message))
