In [15]:
!pip install mlflow





[notice] A new release of pip is available: 23.2.1 -> 26.0.1
[notice] To update, run: C:\Users\sowmy\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


### **Import Libraries**

In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import average_precision_score



### **Set MLflow Tracking**

In [3]:
import mlflow
import mlflow.sklearn
import os

# Keep MLflow runs in a local, file-backed store next to the notebook.
tracking_dir = "mlruns"
os.makedirs(tracking_dir, exist_ok=True)
mlflow.set_tracking_uri("file:./mlruns")

# Select the experiment by name; the returned Experiment object is the
# cell's displayed value.
mlflow.set_experiment("Spam Detection Benchmark Models")


  return FileStore(store_uri, store_uri)
2026/02/14 23:29:01 INFO mlflow.tracking.fluent: Experiment with name 'Spam Detection Benchmark Models' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/sowmy/Downloads/SEM4/AML/Assignment2/mlruns/663893061351406484', creation_time=1771091941507, experiment_id='663893061351406484', last_update_time=1771091941507, lifecycle_stage='active', name='Spam Detection Benchmark Models', tags={}>

### **Load Data**

In [11]:
# Read the three pre-split datasets from CSV files in the working directory.
train_df, val_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "validation.csv", "test.csv")
)


### **Add Extra Text Features**

In [12]:
def add_text_features(df):
    """Return a copy of ``df`` with simple length-based features of ``Message``.

    Three integer columns are added (or overwritten if already present):

    * ``num_characters`` - length of the message in characters.
    * ``num_words`` - number of whitespace-separated tokens.
    * ``num_sentences`` - count of ``.``, ``!`` and ``?`` characters, used
      as a rough sentence count.

    Missing messages are treated as empty strings and non-string values are
    converted with ``str`` before being measured. The ``Message`` column
    itself is returned unchanged, and the caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Message`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the three feature columns.
    """
    # Normalise once instead of re-running fillna for every feature; the
    # str cast keeps numeric-looking messages from breaking the length ops.
    text = df["Message"].fillna("").astype(str)

    # assign() builds a new frame, so re-running the cell is idempotent and
    # the input frame keeps its original columns.
    return df.assign(
        num_characters=text.str.len(),
        num_words=text.str.split().str.len(),
        num_sentences=text.str.count(r"[.!?]"),
    )


In [13]:
# Derive the same count features for every split.
train_df, val_df, test_df = map(add_text_features, (train_df, val_df, test_df))


In [14]:
# Preview the first rows to sanity-check the newly added feature columns.
train_df.head(n=5)

Unnamed: 0,Label,Message,num_characters,num_words,num_sentences
0,0,ü come,6,2,0
1,0,forgot work today wan na chat thing ok drop te...,91,19,0
2,0,problem,7,1,0
3,0,think chennai well settl,24,4,0
4,0,yup havent want go yoga call book,33,7,0


### **TF-IDF Vectorization**


In [15]:
# Replace missing messages with empty strings and coerce everything to str
# in each split before vectorization.
for frame in (train_df, val_df, test_df):
    frame["Message"] = frame["Message"].fillna("").astype(str)

In [16]:
tfidf = TfidfVectorizer(max_features=3000)

# The vectorizer is fitted on the training messages only; validation and
# test messages are transformed with that already-fitted vectorizer.
X_train_text = tfidf.fit_transform(train_df["Message"])
X_val_text, X_test_text = (
    tfidf.transform(frame["Message"]) for frame in (val_df, test_df)
)

### **Combine Extra Features**

In [17]:
def combine_features(X_text, df, extra_cols=("num_characters", "num_words", "num_sentences")):
    """Append numeric columns from ``df`` to the right of a sparse text matrix.

    Parameters
    ----------
    X_text : scipy.sparse matrix
        Text features with one row per row of ``df``.
    df : pandas.DataFrame
        Frame holding the extra numeric feature columns.
    extra_cols : sequence of str, optional
        Names of the columns to append, in order. Defaults to the three
        count features used in this notebook.

    Returns
    -------
    scipy.sparse matrix in CSR format
        Shape ``(n_rows, X_text.shape[1] + len(extra_cols))``.

    Raises
    ------
    ValueError
        If ``X_text`` and ``df`` do not have the same number of rows.
    """
    # Cast explicitly so an object-dtype column cannot leak into the stack.
    extra = df[list(extra_cols)].to_numpy(dtype=float)

    if X_text.shape[0] != extra.shape[0]:
        raise ValueError(
            f"Row count mismatch: X_text has {X_text.shape[0]} rows, "
            f"df has {extra.shape[0]} rows"
        )

    # hstack defaults to COO, which cannot be row-indexed; ask for CSR so the
    # result can be sliced directly.
    return hstack([X_text, extra], format="csr")


In [19]:
# Final design matrices: TF-IDF text features plus the extra count columns.
X_train, X_val, X_test = (
    combine_features(text_matrix, frame)
    for text_matrix, frame in (
        (X_train_text, train_df),
        (X_val_text, val_df),
        (X_test_text, test_df),
    )
)

# Targets come from the "Label" column of each split.
y_train, y_val, y_test = (frame["Label"] for frame in (train_df, val_df, test_df))


### **Models**

In [20]:
# Candidate estimators, built individually and then collected by name.
nb_clf = MultinomialNB()

logreg_clf = LogisticRegression(
    C=10,
    max_iter=1000,
    class_weight="balanced",
    random_state=42,
)

forest_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)

# The keys double as the MLflow run name and registered-model name in the
# training loop.
models = {
    "MultinomialNB": nb_clf,
    "LogisticRegression": logreg_clf,
    "RandomForestClassifier": forest_clf,
}


### **MLflow Experiment Setup**

In [21]:
# Make this the active experiment for the runs started below.
experiment_name = "Assignment2_Model_Versioning"
mlflow.set_experiment(experiment_name)

2026/02/14 23:34:15 INFO mlflow.tracking.fluent: Experiment with name 'Assignment2_Model_Versioning' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/sowmy/Downloads/SEM4/AML/Assignment2/mlruns/309911367380612686', creation_time=1771092255996, experiment_id='309911367380612686', last_update_time=1771092255996, lifecycle_stage='active', name='Assignment2_Model_Versioning', tags={}>

### **Train, Track, and Register Models**

In [22]:
# Validation AUCPR per model name; used afterwards for model selection.
val_scores = {}

for model_name, model in models.items():
    # One MLflow run per candidate, named after the model.
    with mlflow.start_run(run_name=model_name) as run:
        model.fit(X_train, y_train)

        # Column 1 of predict_proba is taken as the positive-class score;
        # this assumes binary labels with the positive class ordered second
        # (TODO confirm against the Label encoding).
        y_val_proba = model.predict_proba(X_val)[:, 1]
        val_aucpr = average_precision_score(y_val, y_val_proba)

        y_test_proba = model.predict_proba(X_test)[:, 1]
        test_aucpr = average_precision_score(y_test, y_test_proba)

        # Record the estimator's hyper-parameters and both scores on the run.
        mlflow.log_params(model.get_params())
        for metric_name, metric_value in (
            ("val_AUCPR", val_aucpr),
            ("test_AUCPR", test_aucpr),
        ):
            mlflow.log_metric(metric_name, metric_value)

        # Store the fitted model as a run artifact, then register that
        # artifact under the model's name.
        mlflow.sklearn.log_model(model, "model")
        model_uri = f"runs:/{run.info.run_id}/model"
        mlflow.register_model(model_uri, model_name)

        val_scores[model_name] = val_aucpr
        print(f"{model_name} - Validation AUCPR: {val_aucpr}")


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)
  return FileStore(store_uri)
Successfully registered model 'MultinomialNB'.
Created version '1' of model 'MultinomialNB'.


MultinomialNB - Validation AUCPR: 0.8092735861467902


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)
Successfully registered model 'LogisticRegression'.
Created version '1' of model 'LogisticRegression'.


LogisticRegression - Validation AUCPR: 0.9542030846367953


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)
Successfully registered model 'RandomForestClassifier'.
Created version '1' of model 'RandomForestClassifier'.

RandomForestClassifier - Validation AUCPR: 0.9633827776825854





### **Check Out and Print AUCPR for All 3 Models**

In [23]:
# Report the validation metric used for model selection, one model per line.
header = "\nModel Selection Metric (Validation AUCPR):\n"
print(header)

for name in val_scores:
    print(f"{name}: {val_scores[name]}")



Model Selection Metric (Validation AUCPR):

MultinomialNB: 0.8092735861467902
LogisticRegression: 0.9542030846367953
RandomForestClassifier: 0.9633827776825854


### **Select Best Model**

In [24]:
# best_model holds the NAME (dict key) of the top-scoring model, not the
# estimator object; ties resolve to the first key in insertion order.
best_model = max(val_scores, key=val_scores.__getitem__)

print("\nBest Model Based on Validation AUCPR:", best_model)



Best Model Based on Validation AUCPR: RandomForestClassifier
