**SPAM EMAIL CLASSIFICATION**

In [2]:
import pandas as pd

In [3]:
df=pd.read_csv("/content/data/email.csv")
df.sample(4)

Unnamed: 0,Category,Message
321,ham,"Merry Christmas to you too babe, i love ya *ki..."
1645,ham,India have to take lead:)
4144,spam,In The Simpsons Movie released in July 2007 na...
747,ham,"I promise to take good care of you, princess. ..."


In [4]:
df.isna().any().any()

False

In [5]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4
"{""mode"":""full""",1,1,isActive:false},1


In [6]:
df["Category"].value_counts()

ham               4825
spam               747
{"mode":"full"       1
Name: Category, dtype: int64

**SAMPLING OUR IMBALANCED DATASET**

In [7]:
df_spam = df[df["Category"]=="spam"]
df_spam.shape

(747, 2)

In [8]:
df_ham = df[df["Category"]=="ham"]
df_ham.shape

(4825, 2)

Downsampling our the ham_data(column) to match up with the spam dataset..

In [9]:
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 2)

In [10]:
df_balanced = pd.concat([df_spam, df_ham_downsampled])
df_balanced.shape

(1494, 2)

In [11]:
df_balanced["Category"].value_counts()

spam    747
ham     747
Name: Category, dtype: int64

In [12]:
df_balanced.sample(4)

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5112,spam,December only! Had your mobile 11mths+? You ar...
2265,ham,Ok . . now i am in bus. . If i come soon i wil...
2724,ham,"Tunde, how are you doing. This is just wishing..."


In [13]:
df_balanced["spam"] = df_balanced["Category"].apply(lambda x:1 if x=="spam" else 0)
df_balanced.sample(4)

Unnamed: 0,Category,Message,spam
3564,spam,Auction round 4. The highest bid is now £54. N...,1
5077,ham,"Well, i'm glad you didn't find it totally disa...",0
2826,spam,Congratulations - Thanks to a good friend U ha...,1
619,ham,I come n pick ü up... Come out immediately aft...,0


In [14]:
columns_to_drop =["Category","spam"]

In [15]:
x_df = df_balanced.Message
x_df.head(2)

2    Free entry in 2 a wkly comp to win FA Cup fina...
5    FreeMsg Hey there darling it's been 3 week's n...
Name: Message, dtype: object

In [16]:
y_df = df_balanced.spam
y_df.head(1)

2    1
Name: spam, dtype: int64

In [None]:
!pip install transformers

*Data Splitting*

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x_df, y_df,test_size= 0.2,random_state=42,stratify=df_balanced["spam"])

*Data preprocessing(Data_cleaning and Stop_word removal)*

In [None]:

import re
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
# nltk.download('stopwords')
nltk.download('punkt')
def preprocess_text(text):
    # Define necessary symbols commonly found in spam messages
    necessary_symbols = ['$', '%', '*', '@', '#']

    # Remove unnecessary symbols
    text = re.sub(r'[^\w\s{}]'.format(''.join(necessary_symbols)), '', text)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Reconstruct the text
    processed_text = ' '.join(filtered_tokens)

    return processed_text
# Apply preprocessing function to a Pandas Series containing email text
x_train_preprocessed = x_train.apply(preprocess_text)

In [None]:
x_train.head(2)

In [None]:
x_train_preprocessed.head(2)

In [23]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Assuming x_train_preprocessed is a list of texts
def embed_texts(texts, batch_size=8):
    model.eval()  # Put model in evaluation mode
    embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=256)

        with torch.no_grad():
            outputs = model(**tokens)
        batch_embeddings = outputs.last_hidden_state[:,0,:].numpy()  # Get CLS token embeddings for the batch
        embeddings.append(batch_embeddings)

    # Concatenate all batch embeddings
    embeddings = np.concatenate(embeddings, axis=0)
    return embeddings

# Example usage
# Convert x_train_preprocessed to a list if it's not already
x_train_list = list(x_train_preprocessed)
embedded_data = embed_texts(x_train_list, batch_size=8)


In [26]:
print(len(embedded_data))
print(embedded_data.shape)
print(type(embedded_data))

1195
(1195, 768)
<class 'numpy.ndarray'>


In [28]:
embedded_data[:4]

array([[-0.21681614,  0.17266676,  0.4239767 , ..., -0.17581615,
         0.09801457,  0.42288333],
       [-0.17533849,  0.05348707,  0.01884171, ..., -0.28377557,
        -0.14435261,  0.65693724],
       [ 0.13556391, -0.31114104,  0.09395669, ..., -0.14489728,
        -0.17871536,  0.31393197],
       [-0.30670094, -0.04176201,  0.1720631 , ..., -0.3626571 ,
        -0.04933013,  0.42002165]], dtype=float32)

In [29]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad sequences to a fixed length
max_sequence_length =1200 # Adjust as needed based on the maximum length of your email texts
X = pad_sequences(embedded_data, maxlen=max_sequence_length, dtype='float32', padding='post')

# Convert labels to numpy array
y = np.array(y_train)

print(y.shape)

(1195, 1200)


In [33]:

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# Assuming you have labels for your emails stored in a variable y_train

# Convert embedded data to numpy arrays
# X = np.array([embedding.numpy() for embedding in embedded_data])
# y = np.array(y_train)
print(X.shape)
print(y.shape)
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
n_features = 120  # 1200 features divided by 10 time steps
n_steps = 10  # Choose based on your data's nature

X_train = X_train.reshape((X_train.shape[0], n_steps, n_features))
X_val = X_val.reshape((X_val.shape[0], n_steps, n_features))
print(X_train.shape)
print(X_val.reshape)

# Define LSTM model architecture
model = Sequential([
    LSTM(128, input_shape=X_train.shape[1:]),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)

(1195, 1200)
(1195,)
(956, 10, 120)
<built-in method reshape of numpy.ndarray object at 0x799990638570>
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation Loss: 0.1696687936782837
Validation Accuracy: 0.9372385144233704
