In [None]:
import nltk
nltk.download('punkt')
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize

# Read the SMS Spam Collection CSV, decoding the file as latin-1
df = pd.read_csv("spam.csv", encoding='latin-1')


# Preprocessing
# Encode the string labels in column 'v1' as integers (0 for ham, 1 for spam)
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['v1'])

# Tokenization: turn every message in column 'v2' into a list of NLTK tokens
df['tokenized_text'] = df['v2'].apply(word_tokenize)

# Train-test split: hold out 20% of the rows, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    df['tokenized_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


def _join_tokens(token_lists):
    """Return one space-joined string per token list, in input order."""
    return [' '.join(tokens) for tokens in token_lists]


# Vectorization: the vocabulary is learned from the training split only and
# then re-used unchanged to encode the test split
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(_join_tokens(X_train))
X_test_vec = vectorizer.transform(_join_tokens(X_test))

# Convert data to PyTorch tensors: dense float32 count features ...
X_train_torch = torch.tensor(X_train_vec.toarray(), dtype=torch.float32)
X_test_torch = torch.tensor(X_test_vec.toarray(), dtype=torch.float32)
# ... and int64 class indices for the targets
y_train_torch = torch.tensor(y_train.values, dtype=torch.long)
y_test_torch = torch.tensor(y_test.values, dtype=torch.long)

# Define the model architecture
class SpamClassifier(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    The network applies ``Linear -> ReLU -> Linear`` and returns the raw
    output of the second linear layer (no softmax is applied here).

    Args:
        input_size: Number of features in each input row.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output scores produced per input row.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the per-class scores for the batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Build the network, the loss function and the optimizer
hidden_size = 128
num_classes = 2
# The input width is the number of columns in the training feature tensor
input_size = X_train_torch.size(1)
model = SpamClassifier(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
num_epochs = 10
batch_size = 32
# TensorDataset indexes both tensors in lockstep and yields (features, label)
# pairs, without materialising a Python list of per-row tuples.
train_dataset = torch.utils.data.TensorDataset(X_train_torch, y_train_torch)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Make sure the model is in training mode even if this cell is re-run after
# an evaluation step switched it to eval mode.
model.train()

for epoch in range(num_epochs):
    # Sum of per-sample losses over the epoch. The criterion returns the
    # batch mean, so each batch is weighted by its size to stay exact when
    # the final batch is smaller than batch_size.
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)

    # Report the mean loss over the whole epoch rather than the loss of
    # whichever shuffled mini-batch happened to come last.
    epoch_loss = running_loss / len(train_dataset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

# Evaluation
# Switch to inference mode so that any train-only layers (dropout, batch norm)
# added to the model later behave correctly; it is a no-op for plain
# Linear/ReLU layers.
model.eval()
with torch.no_grad():
    # Score the whole test set in one forward pass; gradients are not needed.
    outputs = model(X_test_torch)
    # The predicted class is the index of the largest score in each row.
    _, predicted = torch.max(outputs, 1)

# Convert once to NumPy for the scikit-learn metric functions.
y_true = y_test_torch.numpy()
y_pred = predicted.numpy()
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(classification_report(y_true, y_pred))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Epoch [1/10], Loss: 0.0757
Epoch [2/10], Loss: 0.0157
Epoch [3/10], Loss: 0.0053
Epoch [4/10], Loss: 0.0031
Epoch [5/10], Loss: 0.0007
Epoch [6/10], Loss: 0.0020
Epoch [7/10], Loss: 0.0003
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0001
Epoch [10/10], Loss: 0.0000
Accuracy: 0.9794
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.86      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

