# In [None]:
"""
Spam vs Ham Classifier using Neural Networks (Keras)

Author: Snesha Aamalepatil
Description:
    A simple neural network built using Keras and TensorFlow to classify SMS messages
    as spam or ham (not spam). Text preprocessing uses CountVectorizer, and the
    model uses Dense layers with dropout regularization.

Dataset:
    SMS Spam Collection Dataset from UCI Repository or equivalent.
    Path in script: /content/archive (6).zip

License: MIT (or any other license you prefer)
"""


# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import scikit-learn and Keras components
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical

# Read the SMS spam collection directly from the zipped CSV archive.
dataset_path = '/content/archive (6).zip'
data = pd.read_csv(dataset_path, encoding='latin-1')

# Print a summary of the loaded frame (columns, dtypes, non-null counts)
data.info()

# Keep only the first two columns; the rest of the script reads them by name
# as 'v1' (label) and 'v2' (message)
data = data.iloc[:, 0:2]
data.head()  # preview of the first rows; the result is not stored

# Pull the raw message text (features) and the 'ham'/'spam' strings (targets)
messages = data["v2"].tolist()
labels = data["v1"].tolist()

# Hold out 20% of the messages for testing; the fixed seed keeps the split
# reproducible between runs
x_train, x_test, y_train, y_test = train_test_split(
    messages, labels, test_size=0.2, random_state=42
)

# Sanity check: sizes of the four splits
print(len(x_train), len(x_test), len(y_train), len(y_test))

# Bag-of-words features: the vocabulary is learned from the training messages
# only and then reused unchanged for the test messages
vectorizer = CountVectorizer()
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

# Encode the string labels as integers: 'spam' -> 1, anything else -> 0
y_train = [1 if label == 'spam' else 0 for label in y_train]
y_test = [1 if label == 'spam' else 0 for label in y_test]

# One-hot encode the labels (column 0 = ham, column 1 = spam).
# num_classes is pinned to 2 so both arrays always have two columns, even if
# one split happens to contain a single class; otherwise the width would be
# inferred from the largest label present.
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

# Build a small feed-forward network on top of the bag-of-words features:
# two tanh hidden layers, each followed by dropout for regularization.
model = Sequential()
model.add(Dense(32, activation='tanh', input_shape=(x_train_vectorized.shape[1],)))
model.add(Dropout(0.3))  # Regularization
model.add(Dense(16, activation='tanh'))
model.add(Dropout(0.3))
# The labels are one-hot encoded over two mutually exclusive classes
# (ham, spam), so the output layer uses softmax: the two outputs then form a
# probability distribution instead of two independent sigmoid scores.
model.add(Dense(2, activation='softmax'))

# Show model architecture
model.summary()

# Compile the model; categorical cross-entropy is the loss that pairs with
# one-hot targets and a softmax output.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# Sort the sparse matrix's indices in place before handing it to Keras
# (per the original note: ensures indexing is correct for sparse tensors).
x_train_vectorized.sort_indices()
history = model.fit(
    x_train_vectorized,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_test_vectorized, y_test),
)

# Draw the per-epoch loss curves recorded by model.fit: training loss first,
# then validation loss, so the legend entries below line up with the curves.
for curve_key in ('loss', 'val_loss'):
    plt.plot(history.history[curve_key])

plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Predict a new sample message
new_message = "let us go out"
# Reuse the fitted vectorizer so the message is mapped onto the training vocabulary
new_message_vectorized = vectorizer.transform([new_message])

# Run the model once; the result has one row per message and one column per class.
class_scores = model.predict(new_message_vectorized)

# Labels were encoded as ham -> 0 and spam -> 1 before one-hot encoding, so
# column 1 holds the spam score. (Column 0 is the ham score; thresholding it
# would invert the decision.)
prediction = class_scores[0][1]

if prediction > 0.5:
    print("Predicted Spam")
else:
    print("Predicted Ham")

# Display prediction probabilities ([ham, spam] for the sample message)
print(class_scores)
