<a href="https://colab.research.google.com/github/PRAGHATHAPRIYA2589/step/blob/Day3/Day_3_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import accuracy_score

In [4]:
# Load dataset
dataset_path = "/content/Spam-Classification.csv"
data = pd.read_csv(dataset_path)

In [5]:
# Rename the columns for consistency
data.rename(columns={"CLASS": "label", "SMS": "message"}, inplace=True)

# One-hot encode labels (spam = 1, ham = 0)
data = pd.get_dummies(data, columns=["label"], prefix=["label"])

# Check the columns after encoding
print(data.columns)

Index(['message', 'label_ham', 'label_spam'], dtype='object')


In [6]:
# Preprocess the messages
messages = data['message']
labels = data[['label_ham', 'label_spam']]  # Check the correct names after one-hot encoding

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(messages, labels, test_size=0.2, random_state=42)

In [7]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad the sequences to the same length
max_length = 100
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding="post", truncating="post")
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding="post", truncating="post")

In [12]:
# Build the LSTM model
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=max_length),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='sigmoid')  # Softmax for multi-class classification (ham or spam)
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [14]:
# Train the model
model.fit(X_train_padded, y_train, epochs=5, batch_size=32, validation_data=(X_test_padded, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Function to predict spam or ham
def predict_message(message):
    sequence = tokenizer.texts_to_sequences([message])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding="post", truncating="post")
    prediction = model.predict(padded_sequence)[0]
    return "Spam" if prediction[1] > prediction[0] else "Ham"

# Test the prediction function with a sample message
sample_message = "Congratulations! You've won a $1000 Walmart gift card. Call now!"
result = predict_message(sample_message)
print(f"The message '{sample_message}' is classified as: {result}")

Epoch 1/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step - accuracy: 0.5098 - loss: 0.6933 - val_accuracy: 0.4767 - val_loss: 0.6937
Epoch 2/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 99ms/step - accuracy: 0.4936 - loss: 0.6935 - val_accuracy: 0.4767 - val_loss: 0.6938
Epoch 3/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - accuracy: 0.5277 - loss: 0.6928 - val_accuracy: 0.4767 - val_loss: 0.6941
Epoch 4/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - accuracy: 0.5062 - loss: 0.6928 - val_accuracy: 0.4767 - val_loss: 0.6935
Epoch 5/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - accuracy: 0.5442 - loss: 0.6913 - val_accuracy: 0.5233 - val_loss: 0.6923
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5477 - loss: 0.6912
Test Accuracy: 52.33%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/ste