In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


In [2]:
# Load the SMS spam dataset (replace with your own dataset)
file = "smsspamcollection.zip"
df = pd.read_csv(file, compression='zip', sep='\t', names=["label", "message"])

In [3]:
# Data Preprocessing
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)

In [4]:
# Text Classification Pipeline (TF-IDF Vectorizer + Classifier)
models = {
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "Gradient Boosting": GradientBoostingClassifier()
    }

for name, model in models.items():
    pipeline = make_pipeline(TfidfVectorizer(), model)
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Evaluate the model
    print(f"\nResults for {name}:")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    # Classification Report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion Matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Cross-validation
    cv_scores = cross_val_score(pipeline, df['message'], df['label'], cv=5)
    print(f"Cross-validation Mean Accuracy: {cv_scores.mean():.2f}")


Results for Support Vector Machine:
Accuracy: 0.99
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       1.00      0.92      0.96       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115

Confusion Matrix:
[[966   0]
 [ 12 137]]
Cross-validation Mean Accuracy: 0.98

Results for Random Forest:
Accuracy: 0.98
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.83      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
[[966   0]
 [ 25 124]]
Cross-validation Mean Accuracy: 0.97

Results for Naive Bayes:
Accuracy: 0.97
Classifi

In [5]:
# Hyperparameter tuning with GridSearchCV (example with Support Vector Machine)
svm_pipeline = make_pipeline(TfidfVectorizer(), StandardScaler(with_mean=False), SVC())

param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': [0.01, 0.1, 1],
    'svc__kernel': ['linear', 'rbf']
}

grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"\nBest Hyperparameters: {best_params}")



Best Hyperparameters: {'svc__C': 0.1, 'svc__gamma': 0.01, 'svc__kernel': 'linear'}


In [6]:
# Use the best model for predictions
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
print("\nResults for Best Model:")
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Accuracy: {accuracy_best:.2f}")

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred_best))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_best))

# Cross-validation for the best model
cv_scores_best = cross_val_score(best_model, df['message'], df['label'], cv=5)
print(f"Cross-validation Mean Accuracy: {cv_scores_best.mean():.2f}")


Results for Best Model:
Accuracy: 0.98
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.89      0.94       149

    accuracy                           0.98      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
[[965   1]
 [ 16 133]]
Cross-validation Mean Accuracy: 0.98
