In [21]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV # GridSearchCV for tuning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import warnings
import mlflow
import mlflow.sklearn
import pickle

In [22]:
mlflow.sklearn.autolog()

In [23]:
NLTK_STOPWORDS = set(stopwords.words('english'))
NLTK_LEMMATIZER = WordNetLemmatizer()

def preprocess_text(text: str) -> str:
    if not isinstance(text, str): return ""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = [NLTK_LEMMATIZER.lemmatize(word)
             for word in text.split()
             if word not in NLTK_STOPWORDS and len(word) > 1]
    return ' '.join(words)

In [24]:
if __name__ == "__main__":
    print("Starting Hyperparameter Tuning...")

    df = pd.read_csv("spam_data.csv", encoding='latin-1')

    df = df[['sms', 'label']]
    df.dropna(subset=['sms', 'label'], inplace=True)
    df = df[df['sms'].apply(lambda x: isinstance(x, str))]
    df['label'] = pd.to_numeric(df['label'], errors='coerce').astype(int)

    df['processed_sms'] = df['sms'].apply(preprocess_text)

    X = df['processed_sms']
    y = df['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    
    pipeline = Pipeline([
        ('tfidf_vectorizer', TfidfVectorizer(ngram_range=(1, 2))), # Uses both unigrams and bigrams
        ('logistic_classifier', LogisticRegression(random_state=42, solver='liblinear', penalty='l2'))
    ])

    param_grid = {
        'tfidf_vectorizer__max_features': [7000, 10000, 15000], # Tuning number of features
        'logistic_classifier__C': [0.5, 1.0, 5.0] # Tuning regularization strength
    }

Starting Hyperparameter Tuning...


In [25]:
with mlflow.start_run():
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=5,            # 5-fold cross-validation
            scoring='f1',    # Optimize for F1-score
            n_jobs=-1,       # Use all available CPU cores
            verbose=0        # Keep output minimal during search
        )
    
        grid_search.fit(X_train, y_train)
    
        best_pipeline = grid_search.best_estimator_ # The best performing model from the search
        best_params = grid_search.best_params_ # The best hyperparameters found
        
        # Evaluate the Best Tuned Model
        tuned_predictions = best_pipeline.predict(X_test)

         # Save the model
        model_filename = "best_spam_classifier.pkl"
        with open(model_filename, 'wb') as file:
            pickle.dump(best_pipeline, file)
        print(f"\nModel saved as {model_filename}") # Confirmation message
    
        accuracy = accuracy_score(y_test, tuned_predictions)
        precision = precision_score(y_test, tuned_predictions)
        mlflow.log_metric("final_test_accuracy", accuracy)
        mlflow.log_metric("final_test_precision", precision)

        print("\nHyperparameter Tuning Results:")
        print(f"Optimal Hyperparameters: {best_params}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")

1. Set the MLFLOW_TRACKING_URI environment variable to the desired tracking URI. `export MLFLOW_TRACKING_URI=http://localhost:5000`
2. Set the tracking URI programmatically by calling `mlflow.set_tracking_uri`. `mlflow.set_tracking_uri('http://localhost:5000')`



Model saved as best_spam_classifier.pkl

Hyperparameter Tuning Results:
Optimal Hyperparameters: {'logistic_classifier__C': 5.0, 'tfidf_vectorizer__max_features': 10000}
Accuracy: 0.9821
Precision: 0.9821
