# import useful libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
data=pd.read_csv('spam_ham_dataset.csv')

# Data cleaning

In [5]:

# Check for missing values
print(data.isnull().sum())

# Handle missing values (replace with empty string for text data)
data['text'].fillna('', inplace=True)

# Remove duplicates
data.drop_duplicates(inplace=True)

data.drop(['Unnamed: 0'],axis=1,inplace=True)

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64


In [59]:
data.head

<bound method NDFrame.head of      label                                               text  label_num
0      ham  Subject: enron methanol ; meter # : 988291\r\n...          0
1      ham  Subject: hpl nom for january 9 , 2001\r\n( see...          0
2      ham  Subject: neon retreat\r\nho ho ho , we ' re ar...          0
3     spam  Subject: photoshop , windows , office . cheap ...          1
4      ham  Subject: re : indian springs\r\nthis deal is t...          0
...    ...                                                ...        ...
5166   ham  Subject: put the 10 on the ft\r\nthe transport...          0
5167   ham  Subject: 3 / 4 / 2000 and following noms\r\nhp...          0
5168   ham  Subject: calpine daily gas nomination\r\n>\r\n...          0
5169   ham  Subject: industrial worksheets for august 2000...          0
5170  spam  Subject: important online banking alert\r\ndea...          1

[5171 rows x 3 columns]>

# Data Analysis

In [60]:
spam_count = data['label'].where(data['label'] == 'spam').count()
print(spam_count)
ham_count = data['label'].where(data['label'] == 'ham').count()
print(ham_count)

1499
3672


In [32]:
x=data['text']
y=data['label']

In [29]:
# Calculate the length of each text
text_lengths = [len(text.split()) for text in x]

# Calculate the average length
average_length = sum(text_lengths) / len(text_lengths)

print("Average text length:", average_length)

Average text length: 227.78360085089923


In [31]:
# Tokenize and pad the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)

train_sequences = tokenizer.texts_to_sequences(x)

max_len = 250 # choose an appropriate maximum length for your sequences
train_sequences = pad_sequences(train_sequences, maxlen=max_len)


In [37]:
y = np.array(y, dtype=np.float32)

In [40]:
# Split the data into training and test sets (80-20 split)
x_train, x_test, y_train, y_test = train_test_split(train_sequences, y, test_size=0.2, random_state=42)

# Bulding keras Model

In [41]:
# Build the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_len))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x24a7f9e7cd0>

# Evaluation of Accuracy Metrices

In [42]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# You can also use metrics like precision, recall, and F1-score using sklearn
from sklearn.metrics import classification_report, confusion_matrix
y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5).astype(int)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Test Loss: 0.10414271801710129, Test Accuracy: 0.9681159257888794
[[730  12]
 [ 21 272]]
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.98       742
         1.0       0.96      0.93      0.94       293

    accuracy                           0.97      1035
   macro avg       0.96      0.96      0.96      1035
weighted avg       0.97      0.97      0.97      1035



# Random Example

In [52]:
input_your_mail = ["PayPal Your access has been limited Dear Client, Our technical support and customer department has recently suspected activities in your account. Your Paypal account has been limited because we've noticed significant changes in your account activity. As Your payment processor, we need to understand these change better. We're always concerned about our customers security so please help us recover your account by following the link below. Restore Payment To PayPal Copyright Â© 1999-2020 PayPal. All rights reserved"]
# Print the content of the list
# for email_content in input_your_mail:
#     print(email_content)
input_sequences = tokenizer.texts_to_sequences(input_your_mail)
max_len = 250

# Pad sequences
input_data = pad_sequences(input_sequences, maxlen=max_len)

# Make predictions
y_pred = model.predict(input_data)
y_pred = (y_pred > 0.5).astype(int)

if y_pred==1:
    print("Spam Email")
else:
    print("Ham Email")

Spam Email
