In [3]:
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import joblib
print(tf.__version__)  # This should print the installed TensorFlow version

2.18.0


In [4]:
# Load the raw e-mail dataset from the working directory, decoding it as latin-1.
df = pd.read_csv(
    r'email.csv',
    encoding='latin-1',
)

In [5]:
# Encode the target column: ham -> 1, spam -> 0 (spam is the *negative* class here).
enc = {'ham': 1, 'spam': 0}

# Guard on the column so re-running this cell is a no-op instead of a KeyError
# (the 'Category' column is removed the first time the cell runs).
if 'Category' in df.columns:
    labels = df['Category'].map(enc)

    # Series.map yields NaN for any value missing from `enc`; a NaN label would
    # turn the binary cross-entropy loss into NaN, so such rows are removed
    # explicitly and reported rather than passed through silently.
    is_known = labels.notna()
    n_unknown = int((~is_known).sum())
    if n_unknown:
        print(f"Dropping {n_unknown} row(s) whose Category is not one of {sorted(enc)}")

    # Build a new frame instead of mutating in place: add the integer label
    # and drop the original text category.
    df = (
        df.loc[is_known]
        .assign(label=labels[is_known].astype('int64'))
        .drop(columns=['Category'])
    )

In [None]:
def _clean_defaults():
    """Return the shared (stop-word set, stemmer) pair, building it on first use."""
    cached = getattr(_clean_defaults, '_cached', None)
    if cached is None:
        # Loading the stop-word corpus is comparatively expensive, so do it once
        # rather than once per message.
        cached = (frozenset(stopwords.words('english')), PorterStemmer())
        _clean_defaults._cached = cached
    return cached


def clean(data, stop_words=None, stemmer=None):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps: lower-case, replace runs of non-word characters and runs of digits
    with a space, split on whitespace, drop stop words, stem what is left.

    Args:
        data: The text to clean. ``None`` and float NaN (missing values) yield
            an empty string; any other non-string value is converted with
            ``str()`` first.
        stop_words: Optional container of words to discard. Defaults to the
            NLTK English stop-word list.
        stemmer: Optional object exposing ``stem(word) -> str``. Defaults to
            an NLTK ``PorterStemmer``.

    Returns:
        The cleaned text as a single string ('' when nothing survives).
    """
    if not isinstance(data, str):
        # `data != data` is only true for NaN.
        if data is None or (isinstance(data, float) and data != data):
            return ''
        data = str(data)

    # Only touch the NLTK defaults when the caller did not supply replacements.
    if stop_words is None or stemmer is None:
        default_stop_words, default_stemmer = _clean_defaults()
        if stop_words is None:
            stop_words = default_stop_words
        if stemmer is None:
            stemmer = default_stemmer

    data = data.lower()
    data = re.sub(r'\W+', ' ', data)
    data = re.sub(r'\d+', ' ', data)

    # str.split() with no argument already collapses repeated whitespace and
    # ignores leading/trailing blanks, so no extra \s+ substitution or strip
    # is required before tokenising.
    tokens = data.split()
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

In [7]:
# Clean every message element-wise. The result overwrites the same column, so
# running this cell again would re-clean text that has already been cleaned.
df['Message'] = df['Message'].map(clean)

In [8]:
# Tokenization: learn a word index from the messages, then encode each message
# as a list of integer word ids. num_words=5000 limits the vocabulary size.
tokenizer = Tokenizer(num_words=5000)
cleaned_messages = df['Message']
tokenizer.fit_on_texts(cleaned_messages)
sequences = tokenizer.texts_to_sequences(cleaned_messages)

In [9]:
# Targets: the encoded label column as a NumPy array.
y = df['label'].to_numpy()

# Padding: pad/truncate every sequence to 100 tokens so all inputs have the same length.
X = pad_sequences(sequences, maxlen=100)

In [10]:
# Hold out 20% of the samples; the fixed random_state makes the split reproducible.
# stratify=y keeps the ham/spam ratio identical in the train and test partitions,
# so the held-out metrics are not skewed by an unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
    stratify=y,
)

In [11]:
# Stacked-LSTM binary classifier over padded word-id sequences.
# The input shape is declared with an explicit Input layer: Embedding's
# `input_length` argument is deprecated in Keras 3 (it only triggers a warning
# and is otherwise ignored), so it no longer fixes the sequence length.
model = Sequential([
    tf.keras.Input(shape=(100,)),              # 100 word ids per message (matches the padding length)
    Embedding(input_dim=5000, output_dim=32),  # Word embeddings; input_dim matches the tokenizer's num_words
    LSTM(64, return_sequences=True),           # First LSTM emits the full sequence for the next LSTM
    Dropout(0.5),                              # Regularisation between the recurrent layers
    LSTM(32),                                  # Second LSTM reduces the sequence to a single vector
    Dense(1, activation='sigmoid'),            # Binary classification output
])



In [12]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [16]:
# Train for 5 epochs in mini-batches of 32. The held-out split is passed as
# validation data, so the val_* metrics are computed on X_test / y_test.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=5,
)

Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 88ms/step - accuracy: 0.9987 - loss: 0.0056 - val_accuracy: 0.9812 - val_loss: 0.0886
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 89ms/step - accuracy: 0.9991 - loss: 0.0027 - val_accuracy: 0.9803 - val_loss: 0.0982
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 81ms/step - accuracy: 0.9990 - loss: 0.0021 - val_accuracy: 0.9686 - val_loss: 0.1537
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 80ms/step - accuracy: 0.9974 - loss: 0.0073 - val_accuracy: 0.9821 - val_loss: 0.0857
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 79ms/step - accuracy: 1.0000 - loss: 0.0011 - val_accuracy: 0.9794 - val_loss: 0.0870


In [17]:
# Persist the per-epoch metrics dict (not the History object itself) to disk.
joblib.dump(value=history.history, filename='training_history.pkl')

['training_history.pkl']

In [18]:
# Save the trained network in the native .keras format.
model.save('my_model.keras')

# Save the fitted tokenizer next to it so new text can be encoded with the same word index.
joblib.dump(value=tokenizer, filename='tokenizer.pkl')

['tokenizer.pkl']

In [19]:
# Show which metric series were recorded during training.
# Relies on `history` still being the object returned by model.fit: a later
# cell rebinds `history` to the plain dict loaded from 'training_history.pkl',
# after which `.history` no longer exists.
print(history.history.keys())


dict_keys(['accuracy', 'loss', 'val_accuracy', 'val_loss'])


In [None]:
# Round-trip check: reload the saved tokenizer and confirm what type comes back.
# joblib.load unpickles the file, so only load artifacts you produced or trust.
tokenizer = joblib.load(filename='tokenizer.pkl')
print(type(tokenizer))


<class 'keras.src.legacy.preprocessing.text.Tokenizer'>


In [None]:
# Round-trip check: reload the saved training metrics.
# Note: this rebinds `history` from the object returned by model.fit to the
# plain dict that was saved, so `history.history` is unavailable afterwards.
history = joblib.load(filename='training_history.pkl')
print(type(history))


<class 'dict'>


In [22]:
# Round-trip check: reload the saved network and show its architecture.
model = tf.keras.models.load_model('my_model.keras')

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

  saveable.load_own_variables(weights_store.get(inner_path))


None
