# Import necessary libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Load the dataset

In [4]:
df = pd.read_csv("F:/Data/spam.csv", encoding='ISO-8859-1')

# Data Processing

## 1. Encoding

In [7]:
label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['Category'])

## 2. Extract features and labels

In [9]:
X = df['Message']
y = df['Category']

## 3. Split the data into training and testing sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 4. Convert text data to TF-IDF features

In [13]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

## 5. Parameter Tuning

In [15]:
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale']
}

# Modelling

## Initialize the SVM model for fine-tuning

In [18]:
svm = SVC(random_state=42, probability=True)

## Set up GridSearchCV to find the best hyperparameters

In [20]:
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

## Perform the grid search on the training data

In [22]:
grid_search.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


## Get the best parameters from the grid search

In [24]:
best_params = grid_search.best_params_
print("Best Parameters Found:", best_params)

Best Parameters Found: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}


## Initialize the SVM model with fine-tuned parameters

In [26]:
svm_model = SVC(C=best_params['C'], kernel=best_params['kernel'], gamma=best_params['gamma'], random_state=42, probability=True)

## Train the SVM model

In [28]:
svm_model.fit(X_train_tfidf, y_train)

# Predict on the test set

In [30]:
y_pred_svm = svm_model.predict(X_test_tfidf)
y_pred_prob = svm_model.predict_proba(X_test_tfidf)[:, 1]  # Probability predictions for AUC-ROC

# Evaluate the model performance

In [32]:
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_precision = precision_score(y_test, y_pred_svm)
svm_recall = recall_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm)
svm_confusion_matrix = confusion_matrix(y_test, y_pred_svm)
svm_auc_roc = roc_auc_score(y_test, y_pred_prob)

# Display results

In [34]:
print(f"Accuracy: {svm_accuracy:.4f}")
print(f"Precision: {svm_precision:.4f}")
print(f"Recall: {svm_recall:.4f}")
print(f"F1-Score: {svm_f1:.4f}")
print(f"AUC-ROC: {svm_auc_roc:.4f}")
print("\nConfusion Matrix:\n", svm_confusion_matrix)

Accuracy: 0.9928
Precision: 0.9930
Recall: 0.9530
F1-Score: 0.9726
AUC-ROC: 0.9924

Confusion Matrix:
 [[965   1]
 [  7 142]]
