In [1]:
import torch

# Report the CUDA devices torch can see, then choose the compute device.
print("Number of GPU: ", torch.cuda.device_count())

# torch.cuda.get_device_name() raises when no CUDA device is usable, which
# would abort this cell before the CPU fallback below is ever reached.
# Query the name only when CUDA is actually available.
if torch.cuda.is_available():
    print("GPU Name: ", torch.cuda.get_device_name())
else:
    print("GPU Name: ", "N/A (CUDA not available)")


# Select CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 5060
Using device: cuda


In [2]:
import kagglehub

# Fetch the latest version of the spam e-mail dataset through kagglehub and
# keep the returned location of the downloaded files in `path`.
dataset_handle = "jackksoncsie/spam-email-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/jackksoncsie/spam-email-dataset?dataset_version_number=1...


100%|█████████████████████████████████████████████████████████████████████████████| 2.86M/2.86M [00:01<00:00, 2.10MB/s]

Extracting files...
Path to dataset files: C:\Users\usEr\.cache\kagglehub\datasets\jackksoncsie\spam-email-dataset\versions\1





In [64]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_score, recall_score, f1_score


from pathlib import Path

# Load the dataset and build stratified train / validation / test splits.
#
# The CSV is resolved relative to `path` (the directory returned by
# kagglehub.dataset_download in the download cell) instead of a hard-coded,
# machine-specific absolute path, so the notebook runs on any machine.
data = pd.read_csv(Path(path) / "emails.csv")
data = data[['text', 'spam']]  # keep only the text column and its label

# Encode the `spam` column as integer class ids; the fitted encoder is kept
# so the same mapping can be reused later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['spam'])
x = data['text']

# First hold out 30% of the rows, then cut that hold-out in half, giving a
# 70% / 15% / 15% train / validation / test split. Both splits are stratified
# on the label and use a fixed seed so the partition is reproducible.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Character-level TF-IDF over 3- to 6-character n-grams. The settings are
# gathered in one mapping so they are easy to read and tweak.
tfidf_settings = dict(
    analyzer='char',
    ngram_range=(3, 6),
    min_df=3,
    max_df=0.95,
    max_features=100000,
    sublinear_tf=True,
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit on the training split only; the validation and test splits are merely
# transformed with the already-fitted vectorizer.
x_train_vec = vectorizer.fit_transform(x_train)
x_val_vec, x_test_vec = (
    vectorizer.transform(split) for split in (x_val, x_test)
)

# Per-class training weights: class 1 counts seven times as much as class 0.
class_weight_dict = {0: 1, 1: 7}

# L2-regularised logistic regression trained with the liblinear solver.
logreg_settings = dict(
    penalty='l2',
    C=1.5,
    solver='liblinear',
    class_weight=class_weight_dict,
    max_iter=4000,
)

# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(**logreg_settings).fit(x_train_vec, y_train)

# Accuracy of model.predict() on the training and validation splits.
train_pred, valid_pred = (
    model.predict(features) for features in (x_train_vec, x_val_vec)
)
train_accuracy = accuracy_score(y_train, train_pred)
valid_accuracy = accuracy_score(y_val, valid_pred)
print("Train Accuracy:", train_accuracy)
print("Valid Accuracy:", valid_accuracy)

# Decision threshold applied to the class-1 probability for both the
# validation and the test evaluation below.
VAL_THRESHOLD = 0.6


def _metrics_at_threshold(y_true, proba, threshold):
    """Binarize class-1 probabilities at `threshold` and score the result.

    Args:
        y_true: Ground-truth labels of the split.
        proba: Class-1 probabilities (column 1 of `predict_proba`).
        threshold: Probabilities greater than or equal to this value are
            predicted as class 1, everything else as class 0.

    Returns:
        A `(pred, metrics)` tuple: the integer prediction array and a dict
        holding accuracy, precision, recall and f1 for those predictions.
    """
    pred = (proba >= threshold).astype(int)
    metrics = {
        "accuracy": accuracy_score(y_true, pred),
        "precision": precision_score(y_true, pred),
        "recall": recall_score(y_true, pred),
        "f1": f1_score(y_true, pred)
    }
    return pred, metrics


def _print_metrics(split_name, metrics, threshold):
    """Print `metrics` one per line under a header naming the split.

    The header interpolates the threshold that was actually applied, so the
    printed label can never drift from the value used in the computation.
    """
    print(f"\n{split_name} Metrics (threshold = {threshold})")
    for k, v in metrics.items():
        print(f"{k.capitalize():<10}: {v}")


# Validation split, scored at VAL_THRESHOLD.
val_proba = model.predict_proba(x_val_vec)[:, 1]
val_pred, val_metrics = _metrics_at_threshold(y_val, val_proba, VAL_THRESHOLD)
_print_metrics("Validation", val_metrics, VAL_THRESHOLD)

# Test split, scored at the same threshold.
test_proba = model.predict_proba(x_test_vec)[:, 1]
test_pred, test_metrics = _metrics_at_threshold(y_test, test_proba, VAL_THRESHOLD)
_print_metrics("Test", test_metrics, VAL_THRESHOLD)

print("\nConfusion Matrix (Test):\n", confusion_matrix(y_test, test_pred))



Train Accuracy: 0.9972561736093789
Valid Accuracy: 0.9883585564610011

Validation Metrics (threshold = 0.7)
Accuracy  : 0.989522700814901
Precision : 0.9666666666666667
Recall    : 0.9902439024390244
F1        : 0.9783132530120482

Test Metrics (threshold = 0.7)
Accuracy  : 0.9918604651162791
Precision : 0.9671361502347418
Recall    : 1.0
F1        : 0.9832935560859188

Confusion Matrix (Test):
 [[647   7]
 [  0 206]]


In [65]:
import joblib

# Persist the fitted model, vectorizer and label encoder together with the
# decision threshold, each into its own joblib file in the working directory.
artifacts = (
    (model, "spam_logreg_model.joblib"),
    (vectorizer, "spam_tfidf_vectorizer.joblib"),
    (label_encoder, "spam_label_encoder.joblib"),
    (VAL_THRESHOLD, "spam_threshold.joblib"),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)

print("saved successfully")

saved successfully
