In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline
import warnings 
# Silence every warning for the whole session.
# NOTE(review): this also hides deprecation warnings from pandas/Keras that
# may point at real problems — consider narrowing the filter.
warnings.filterwarnings( "ignore")

In [None]:
# For Train test split
from sklearn.model_selection import train_test_split

In [None]:
# For pre-processing of text
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, GlobalAveragePooling1D, Flatten, SpatialDropout1D

In [None]:
# Load the raw message dataset from the Excel workbook in the working directory.
df=pd.read_excel('spam.xlsx')

In [None]:
# Peek at the first rows of the raw frame to see which columns are present.
df.head()

In [None]:
# Keep only the label and text columns and give them descriptive names.
df = df[['v1', 'v2']].rename(columns={'v1': 'Category', 'v2': 'Content'})
df.head()

In [None]:
# Join every message into one long string for the word cloud.
# A space separator keeps the last word of one message from fusing with the
# first word of the next; str() guards against non-text cells.
spam_text = df['Content'].tolist()
spam_text_final = ' '.join(str(message) for message in spam_text)
# Preview only the beginning; the full string is far too long to display.
spam_text_final[:500]

In [None]:
# Build the word cloud from the concatenated message text.
# Despite its name, `ham_msg_cloud` is generated from all messages in
# `spam_text_final`, not only the ham ones.
ham_msg_cloud = WordCloud(
    width=520,
    height=260,
    stopwords=STOPWORDS,
    max_font_size=50,
    background_color='green',
    colormap='magma',
).generate(spam_text_final)

In [None]:
# Render the word cloud on a large, axis-free canvas.
fig, ax = plt.subplots(figsize=(16, 10))
ax.imshow(ham_msg_cloud, interpolation='bicubic')
ax.axis('off')
plt.show()

In [None]:
# Inspect the label column.
df['Category']

In [None]:
# Column dtypes and non-null counts of the two-column frame.
df.info()

In [None]:
# Share of each label as a fraction of all rows (normalize=True), to gauge class imbalance.
df['Category'].value_counts(normalize=True)

In [None]:
# Bar chart of how many messages fall into each category.
sns.countplot(data=df, x='Category')

In [None]:
# Work on a copy so the feature engineering below leaves `df` untouched.
new_df=df.copy()

In [None]:
# Character count of each message. Cast to str first so that non-text cells
# (e.g. numbers read from the spreadsheet) get a length instead of NaN.
new_df['Text_Length'] = new_df['Content'].astype(str).str.len()

In [None]:
# Encode the target as integers: ham -> 0, spam -> 1.
label_map = {'ham': 0, 'spam': 1}
new_df['Content_Type'] = new_df['Category'].map(label_map)
# Plain array of labels, later passed to train_test_split.
content_label = new_df['Content_Type'].to_numpy()
new_df.head()

In [None]:
# Force every message to str so the text-processing steps below can call
# string methods (split/lower) on each value.
new_df['Content'] = new_df['Content'].astype(str)

In [None]:
# Check dtypes and non-null counts after adding the derived columns.
new_df.info()

In [None]:
# Lower-case every token and collapse any run of whitespace to a single space.
new_df['Content'] = new_df['Content'].apply(
    lambda text: " ".join(token.lower() for token in text.split())
)
new_df['Content'].head()

In [None]:
# Strip punctuation: remove every character that is neither a word character
# (\w: letters, digits, underscore) nor whitespace (\s); `[^...]` negates the set.
# regex=True must be passed explicitly: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, which would silently
# leave the text unchanged. The raw string avoids invalid-escape warnings.
new_df['Content'] = new_df['Content'].str.replace(r'[^\w\s]', '', regex=True)
new_df['Content'].head()

In [None]:
# Uncomment to install NLTK if it is missing from the kernel's environment:
# %pip install nltk

In [None]:
# Stopwords Removal
import nltk
from nltk.corpus import stopwords

# The corpus must be available locally, otherwise stopwords.words() raises
# LookupError on a fresh environment.
nltk.download('stopwords', quiet=True)
stop = stopwords.words('english')
# Set lookup makes the per-token membership test O(1) instead of a list scan.
stop_lookup = set(stop)
new_df['Content'] = new_df['Content'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_lookup)
)
new_df['Content'].head()

In [None]:
# Twenty most frequent tokens across the cleaned corpus.
all_tokens = ' '.join(new_df['Content']).split()
freq = pd.Series(all_tokens).value_counts().head(20)
freq

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the WordNet corpus used by WordNetLemmatizer.
# NOTE(review): some NLTK releases also need 'omw-1.4' for lemmatization — confirm for the installed version.
nltk.download('wordnet')

In [None]:
# Replace each token by its WordNet lemma and store the result back in the
# column. Without the assignment the lemmatized text is discarded and the
# model is trained on the un-lemmatized messages.
lt = WordNetLemmatizer()
new_df['Content'] = new_df['Content'].apply(
    lambda text: " ".join(lt.lemmatize(word) for word in text.split())
)
new_df['Content'].head()

In [None]:
# Hold out 25% of the messages for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    new_df['Content'],
    content_label,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Tokenizing and padding configuration
vocab_size = 500          # passed as num_words to the Tokenizer
oov_token_1 = '<OOV>'     # placeholder for out-of-vocabulary words
max_len = 50              # sequences are padded / truncated to 50 tokens
padding_type = 'post'     # pad at the end of short sequences
trunc_type = 'post'       # cut the end of long sequences

In [None]:
# Materialize the training texts as a plain list of str for the tokenizer.
x_train = list(map(str, x_train))

In [None]:
# Word-level tokenizer limited to `vocab_size` words, with a dedicated
# out-of-vocabulary token. It is fitted on the training texts only.
tokenizer = Tokenizer(oov_token=oov_token_1, num_words=vocab_size, char_level=False)
tokenizer.fit_on_texts(x_train)

In [None]:
# Preview a few training texts instead of dumping the whole list into the output.
x_train[:5]

In [None]:
# Word -> integer id mapping built by fit_on_texts, and its size.
word_index=tokenizer.word_index
total_words = len(word_index)
total_words

In [None]:
# Show only the first 20 entries; the full mapping is too long to display.
dict(list(word_index.items())[:20])

In [None]:
# Turn the training texts into integer id sequences, then pad/truncate them
# so every row has exactly `max_len` entries.
training_sequences = tokenizer.texts_to_sequences(x_train)
pad_options = {'maxlen': max_len, 'padding': padding_type, 'truncating': trunc_type}
training_padded = pad_sequences(training_sequences, **pad_options)

In [None]:
# Preview the first few id sequences rather than printing all of them.
training_sequences[:5]

In [None]:
# Padded training matrix produced by pad_sequences (one row per message).
training_padded

In [None]:
# Apply the tokenizer fitted on the training texts to the test texts and pad
# them with the same settings as the training set.
testing_sequences = tokenizer.texts_to_sequences(x_test)
testing_padded = pad_sequences(
    testing_sequences,
    truncating=trunc_type,
    padding=padding_type,
    maxlen=max_len,
)

In [None]:
# Preview the first few test id sequences rather than printing all of them.
testing_sequences[:5]

In [None]:
# Hyper-parameters for the dense (pooling) model
drop_value = 0.2      # dropout rate after the hidden Dense layer
embedding_dim = 12    # size of each word-embedding vector
vocab_size = 500      # same value as in the tokenizer configuration
##n_dense = 5

In [None]:
# Embedding -> average pooling -> small dense classifier with a sigmoid output.
# Despite the variable name, this stack contains no recurrent layer.
Model_rnn = Sequential([
    # every word id maps to a dense vector of `embedding_dim` values
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dropout(drop_value),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Print the layer overview, then configure training for binary classification.
Model_rnn.summary()
Model_rnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Sanity check on the shape of the training labels before fitting.
y_train.shape

In [None]:
num_epochs = 50
# Stop once val_loss has not improved for 3 consecutive epochs and roll the
# model back to the weights of the best epoch, so the evaluation cells that
# follow do not score the slightly over-trained final epochs.
# NOTE(review): the test split doubles as validation data here, so the
# reported test metrics are optimistic — consider a separate validation split.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = Model_rnn.fit(training_padded,
                        y_train,
                        epochs=num_epochs,
                        validation_data=(testing_padded, y_test),
                        callbacks=[early_stop],
                        verbose=2)

In [None]:
# Loss and accuracy (the compiled metric) of the dense model on the held-out set.
Model_rnn.evaluate(testing_padded,y_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the dense model on the TRAINING data.
target_names = ['Class 0 (Negative)', 'Class 1 (Positive)']  # 0 = ham, 1 = spam
threshold = 0.5

# Predicted probabilities, turned into hard 0/1 labels at the threshold.
y_probs = Model_rnn.predict(training_padded)
y_pred = np.where(y_probs >= threshold, 1, 0)

report = classification_report(y_train, y_pred, target_names=target_names)
print("Classification Report:", report, sep="\n")

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the dense model on the TEST data.
target_names = ['Class 0 (Negative)', 'Class 1 (Positive)']  # 0 = ham, 1 = spam
threshold = 0.5

# Predicted probabilities, turned into hard 0/1 labels at the threshold.
y_probs = Model_rnn.predict(testing_padded)
y_pred = np.where(y_probs >= threshold, 1, 0)

report = classification_report(y_test, y_pred, target_names=target_names)
print("Classification Report:", report, sep="\n")

In [None]:
# LSTM classifier: embedding -> spatial dropout -> single LSTM -> dropout -> sigmoid.
drop_lstm = 0.2   # rate shared by both dropout layers
n_lstm = 128      # number of LSTM units
model_lstm = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    SpatialDropout1D(drop_lstm),
    # return_sequences=False: only the last time step's output is passed on
    LSTM(n_lstm, return_sequences=False),
    Dropout(drop_lstm),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Layer-by-layer overview of the LSTM model.
model_lstm.summary()

In [None]:
# Same training configuration as the dense model: Adam + binary cross-entropy.
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
num_epochs = 10
# Stop once val_loss has not improved for 2 consecutive epochs and roll the
# model back to the weights of the best epoch, so the evaluation cells that
# follow do not score the slightly over-trained final epochs.
# NOTE(review): the test split doubles as validation data here, so the
# reported test metrics are optimistic — consider a separate validation split.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history = model_lstm.fit(training_padded,
                         y_train,
                         epochs=num_epochs,
                         validation_data=(testing_padded, y_test),
                         callbacks=[early_stop],
                         verbose=2)

In [None]:
# Loss and accuracy (the compiled metric) of the LSTM model on the held-out set.
model_lstm.evaluate(testing_padded,y_test)

In [None]:
# Score the LSTM model on both splits with a larger batch size.
# The result names say "dense", but the model evaluated here is `model_lstm`.
eval_options = {'verbose': 2, 'batch_size': 256}
train_dense_results = model_lstm.evaluate(training_padded, np.asarray(y_train), **eval_options)
valid_dense_results = model_lstm.evaluate(testing_padded, np.asarray(y_test), **eval_options)
