In [14]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import xgboost as xgb
import optuna
from google.colab import drive


# Load the data
data = pd.read_csv("/content/spam_email.csv")

# Preprocess the data
data['label'] = (data['label'] == 'Spam').astype(int)
data['text'] = data['text'].fillna('')

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Convert to DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_tfidf, label=y_train)
dtest = xgb.DMatrix(X_test_tfidf, label=y_test)

# Define the objective function for Optuna
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-2, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'tree_method': 'gpu_hist',  # Use GPU acceleration
        'gpu_id': 0
    }

    model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtest, 'test')], early_stopping_rounds=10, verbose_eval=False)
    preds = model.predict(dtest)
    return 1 - accuracy_score(y_test, preds > 0.5)

# Optimize hyperparameters using Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print('Number of finished trials:', len(study.trials))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

# Train the final model with the best parameters
best_params = study.best_params
best_model = xgb.train(best_params, dtrain, num_boost_round=100)

# Make predictions
train_preds = best_model.predict(dtrain)
test_preds = best_model.predict(dtest)

# Calculate accuracies
train_accuracy = accuracy_score(y_train, train_preds > 0.5)
test_accuracy = accuracy_score(y_test, test_preds > 0.5)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Function to classify new emails
def classify_email(email_text):
    email_tfidf = tfidf.transform([email_text])
    email_dmatrix = xgb.DMatrix(email_tfidf)
    prediction = best_model.predict(email_dmatrix)[0]
    return "Spam" if prediction > 0.5 else "Ham"

# Test the classifier with some example emails
example_emails = [
    "Get rich quick! Buy our amazing product now!",
    "Hi John, can we reschedule our meeting to next week?",
    "Congratulations! You've won a free iPhone. Click here to claim.",
    "Please find attached the report for Q2 sales figures."
]

for email in example_emails:
    print(f"Email: {email}")
    print(f"Classification: {classify_email(email)}\n")

[I 2024-09-05 01:36:47,116] A new study created in memory with name: no-name-d2367851-4d17-4cbe-973a-c19c629e67a5
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-2, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),

    E.g. tree_method = "hist", device = "cuda"

Parameters: { "n_estimators" } are not used.


    E.g. tree_method = "hist", device = "cuda"

[I 2024-09-05 01:36:51,803] Trial 0 finished with value: 0.047329189342549816 and parameters: {'max_depth': 7, 'learning_rate': 0.13363584613867385, 'n_estimators': 176, 'min_child_weight': 1, 'subsample': 0.7143566878736343, 'colsample_bytree': 0.8454838011324011}. Best is trial 0 with value: 0.047329189342549816.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-2, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),

    E.

Number of finished trials: 50
Best trial:
  Value: 0.03832761600165069
  Params: 
    max_depth: 7
    learning_rate: 0.29958224463052163
    n_estimators: 112
    min_child_weight: 3
    subsample: 0.6642838225971721
    colsample_bytree: 0.9861864474421846
Training Accuracy: 0.9725
Testing Accuracy: 0.9607
Email: Get rich quick! Buy our amazing product now!
Classification: Spam

Email: Hi John, can we reschedule our meeting to next week?
Classification: Ham

Email: Congratulations! You've won a free iPhone. Click here to claim.
Classification: Spam

Email: Please find attached the report for Q2 sales figures.
Classification: Ham



In [13]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Convert test_preds to binary format
test_preds_binary = (test_preds > 0.5).astype(int)

# Generate and print classification report
print("\nClassification Report:")
print(classification_report(y_test, test_preds_binary, target_names=['Ham', 'Spam']))

# Generate and print confusion matrix
cm = confusion_matrix(y_test, test_preds_binary)
print("\nConfusion Matrix:")
print(cm)


Classification Report:
              precision    recall  f1-score   support

         Ham       0.98      0.94      0.96     20467
        Spam       0.94      0.98      0.96     18304

    accuracy                           0.96     38771
   macro avg       0.96      0.96      0.96     38771
weighted avg       0.96      0.96      0.96     38771


Confusion Matrix:
[[19290  1177]
 [  400 17904]]


In [None]:
from google.colab import drive
drive.mount('/content/drive')