In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
try:
    data = pd.read_csv("new mail dataset.csv", encoding='latin-1')
except UnicodeDecodeError:
    import chardet
    with open("new mail dataset.csv", 'rb') as rawdata:
        result = chardet.detect(rawdata.read(100000))
    data = pd.read_csv("new mail dataset.csv", encoding=result['encoding'])

In [None]:
data.shape

(6046, 2)

In [None]:
data = data.drop_duplicates(keep='first')

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['Category'] = le.fit_transform(data['Category'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Category'] = le.fit_transform(data['Category'])


In [None]:
from sklearn.model_selection import train_test_split
x = data['Message']
y = data['Category']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Create the tokenizer
tokenizer = Tokenizer(num_words=10000)  # Limit to the top 10,000 words

# Fit the tokenizer on the training set
tokenizer.fit_on_texts(x_train)  # Fit on training data only

# Convert the text to sequences of integers for train and test
X_train_sequences = tokenizer.texts_to_sequences(x_train)
X_test_sequences = tokenizer.texts_to_sequences(x_test)


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad the sequences to the same length
maxlen = 100  # Maximum length of sequence (100 words)
X_train_padded = pad_sequences(X_train_sequences, maxlen=maxlen)
X_test_padded = pad_sequences(X_test_sequences, maxlen=maxlen)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=maxlen))  # Embedding layer
model.add(LSTM(64))  # LSTM layer
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

model.summary()




In [None]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model on the training data
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 82ms/step - accuracy: 0.8522 - loss: 0.4090 - val_accuracy: 0.9679 - val_loss: 0.1082
Epoch 2/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 91ms/step - accuracy: 0.9876 - loss: 0.0537 - val_accuracy: 0.9744 - val_loss: 0.0840
Epoch 3/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 95ms/step - accuracy: 0.9930 - loss: 0.0279 - val_accuracy: 0.9731 - val_loss: 0.0777
Epoch 4/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 84ms/step - accuracy: 0.9986 - loss: 0.0074 - val_accuracy: 0.9782 - val_loss: 0.0774
Epoch 5/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 76ms/step - accuracy: 0.9991 - loss: 0.0055 - val_accuracy: 0.9782 - val_loss: 0.0857
Epoch 6/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 86ms/step - accuracy: 0.9994 - loss: 0.0046 - val_accuracy: 0.9782 - val_loss: 0.0911
Epoch 7/10
[1m98/98[0m [32m

In [None]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9821 - loss: 0.0984
Test Loss: 0.11707239598035812
Test Accuracy: 0.9796651005744934


In [None]:
new_email = ["Congratulations! You've won a $1000 gift card. Click here to claim now!"]
new_sequence = tokenizer.texts_to_sequences(new_email)
new_padded = pad_sequences(new_sequence, maxlen=maxlen)
prediction = model.predict(new_padded)
if prediction >= 0.5:
    print("This email is spam.")
else:
    print("This email is not spam.")
print("Prediction probability is ",prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
This email is spam.
Prediction probability is  [[0.99983954]]


In [None]:
# Save the model
model.save('spam_classifier_model.h5')  # You can choose a different filename if you prefer



In [None]:
with open('tokenizer.json', 'w') as f:
    f.write(tokenizer.to_json())