In [8]:
import pandas as pd

# Load the dataset
data = pd.read_csv('spam.csv', encoding='latin-1')

# Check the column names to ensure they are as expected
print(data.columns)

# Drop unnecessary columns (Unnamed columns)
data = data[['class', 'message']]

# Rename the columns for clarity
data.columns = ['label', 'text']

# Map labels to binary values (ham = 0, spam = 1)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Inspect the cleaned dataset
print(data.head())


Index(['class', 'message', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')
   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

# Initialize the Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Convert text to sequences (tokens)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding sequences to ensure equal length
max_len = 100  # You can adjust this depending on your dataset
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Check the shape of the processed data
print("Training data shape:", X_train_pad.shape)
print("Testing data shape:", X_test_pad.shape)


Training data shape: (4457, 100)
Testing data shape: (1115, 100)


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define the LSTM model
model = Sequential()

# Embedding layer: Maps words to vectors of fixed size
model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_len))

# LSTM layer: Processes the sequential data
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))

# Fully connected layer: For binary classification (ham vs spam)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Summary of the model
model.summary()




In [11]:
# Train the model
history = model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Test Accuracy:", accuracy)


Epoch 1/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 151ms/step - accuracy: 0.8632 - loss: 0.4661 - val_accuracy: 0.8655 - val_loss: 0.4048
Epoch 2/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 143ms/step - accuracy: 0.8588 - loss: 0.4120 - val_accuracy: 0.8655 - val_loss: 0.3993
Epoch 3/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 142ms/step - accuracy: 0.8588 - loss: 0.4101 - val_accuracy: 0.8655 - val_loss: 0.3977
Epoch 4/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 138ms/step - accuracy: 0.8745 - loss: 0.3787 - val_accuracy: 0.8655 - val_loss: 0.3957
Epoch 5/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 137ms/step - accuracy: 0.8633 - loss: 0.4008 - val_accuracy: 0.8655 - val_loss: 0.3956
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.8760 - loss: 0.3757
Test Accuracy: 0.865470826625824


In [12]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions
y_pred = model.predict(X_test_pad)
y_pred = (y_pred > 0.5).astype(int)  # Thresholding at 0.5 for binary classification

# Classification report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 42ms/step
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       965
           1       0.00      0.00      0.00       150

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.87      0.80      1115

[[965   0]
 [150   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
