In [66]:
import pandas as pd
import numpy as np
import nltk # natural language toolkit -> tokenization, stopword removal , stemming wa lemmitization,
import torch
import torch.nn as nn
import torch.optim as optim
import string

from torch.utils.data import Dataset, DataLoader
# TfIDF use garxum
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [67]:
nltk.download('punkt') # punkt is required for tokenization
nltk.download('punkt_tab')
nltk.download('stopwords') # stopwords are required for stopword removal
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sushil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Sushil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sushil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sushil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [68]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [69]:
data = pd.read_csv('../1Classes/spam.csv',encoding='latin-1')

In [70]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [71]:

data = data[['text', 'label']]
data.columns = ['text', 'labels']
data.head()


Unnamed: 0,text,labels
0,Subject: enron methanol ; meter # : 988291\r\n...,ham
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",ham
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",ham
3,"Subject: photoshop , windows , office . cheap ...",spam
4,Subject: re : indian springs\r\nthis deal is t...,ham


In [None]:
def preprocess_text(text):
  text = text.lower()
  # punctuation hatauney
  text = "".join(char for char in text if char not in string.punctuation)
  # tokenization
  tokens = word_tokenize(text)
  # remove stopwords
  stop_words = set(stopwords.words('english'))
  tokens = [word for word in tokens if word not in stop_words]
  # lemmatization
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(word) for word in tokens]
  return " ".join(tokens)

data['cleaned_text'] = data['text'].apply(preprocess_text)



data['labels'] = data['labels'].map({"ham":0, "spam":1})
data.head()

# 2) Feature Extraction (Tf-IDF -> ngram, bagofwords)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['cleaned_text']).toarray()
y = data['labels'].values

# Mathi numpy ma convert vaisakyo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


class EmailDataset(Dataset):
  def __init__(self, X, y):
    self.X = torch.tensor(X, dtype=torch.float32)
    self.y = torch.tensor(y, dtype=torch.float32)

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]


train_dataset = EmailDataset(X_train, y_train)
test_dataset = EmailDataset(X_test, y_test)


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


class SpamClassifier(nn.Module):
  def __init__(self, num_inputs):
    super().__init__()
    self.fc1 = nn.Linear(num_inputs, 128)
    self.fc2 = nn.Linear(128, 64)
    self.fc3 = nn.Linear(64, 1)

    self.dropout = nn.Dropout(0.5)
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    x = self.relu(self.fc1(x))
    x = self.relu(self.fc2(x))
    x = self.dropout(x)
    x = self.sigmoid(self.fc3(x))
    return x


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SpamClassifier(num_inputs=X_train.shape[1]).to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
model.train()
for epoch in range(1, num_epochs+1):
  total_loss = 0.0
  for inputs, labels in train_loader:
    inputs, labels = inputs.to(device), labels.to(device)
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels.unsqueeze(1))
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  avg_loss = total_loss / len(train_loader)
  print(f"Epoch [{epoch}/{num_epochs}], Loss: {avg_loss:.4f}")




model.eval()
correct = 0
total = 0
with torch.no_grad():
  for inputs, labels in test_loader:
    inputs, labels = inputs.to(device), labels.to(device)
    outputs = model(inputs)
    predicted = (outputs >= 0.5).float()
    total += labels.size(0)
    correct += (predicted == labels.unsqueeze(1)).sum().item()

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")


def predict_email(test_email):
  model.eval()
  preprocessed_email = preprocess_text(test_email)
  email_vector = vectorizer.transform([preprocessed_email]).toarray()
  email_tensor = torch.tensor(email_vector, dtype=torch.float32).to(device)
  output = model(email_tensor)
  predicted_label = (output >= 0.5).float().item()

  if predicted_label == 1:
    print("Email is spam")
  else:
    print("Email is not spam")







Epoch [1/10], Loss: 0.3253
Epoch [2/10], Loss: 0.0299
Epoch [3/10], Loss: 0.0106
Epoch [4/10], Loss: 0.0037
Epoch [5/10], Loss: 0.0025
Epoch [6/10], Loss: 0.0021
Epoch [7/10], Loss: 0.0023
Epoch [8/10], Loss: 0.0014
Epoch [9/10], Loss: 0.0015
Epoch [10/10], Loss: 0.0012
Test Accuracy: 0.9836


In [73]:
test_email = "Congratulation you've won a free trip"
result=predict_email(test_email)

Email is not spam
