In [1]:
import os
import pandas as pd
import numpy as np
import joblib

from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Reproducibility
RANDOM_STATE = 42

# Ensure model dir exists
Path("models").mkdir(exist_ok=True)


In [2]:
DATA_DIR = Path("./data_splits")
train_df = pd.read_csv(DATA_DIR / "train.csv")
val_df   = pd.read_csv(DATA_DIR / "validation.csv")
test_df  = pd.read_csv(DATA_DIR / "test.csv")

print(f"Train/Val/Test sizes: {len(train_df)}/{len(val_df)}/{len(test_df)}")
train_df.head()


Train/Val/Test sizes: 4457/557/558


Unnamed: 0,label,message
0,0,guy close
1,0,please come imin towndontmatter urgoin outlrju...
2,0,ok ksry knw sivatats askd
3,0,ill see prolly yeah
4,0,ill see swing bit got thing take care firsg


In [3]:
for df in (train_df, val_df, test_df):
    df['message'] = df['message'].fillna("")


In [4]:
# Candidate pipelines
pipelines = {
    "Naive Bayes": Pipeline([
        ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")),
        ("clf", MultinomialNB())
    ]),
    "Logistic Regression": Pipeline([
        ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")),
        ("clf", LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
    ]),
    "SVM": Pipeline([
        ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")),
        ("clf", SVC(kernel="linear", probability=True, random_state=RANDOM_STATE))
    ]),
}

# Optional hyperparams (only for SVM here)
param_grids = {
    "SVM": {
        "clf__C": [0.1, 1, 10]
    }
}


In [5]:
X_train, y_train = train_df['message'], train_df['label']
X_val,   y_val   = val_df  ['message'], val_df  ['label']

results = {}
best_model = None
best_name  = None
best_auc   = -1

for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    probs = pipeline.predict_proba(X_val)[:,1]
    acc  = accuracy_score(y_val, preds)
    auc  = roc_auc_score(y_val, probs)
    results[name] = (acc, auc)
    print(f"{name:20s} → Acc: {acc:.4f}, ROC-AUC: {auc:.4f}")
    if auc > best_auc:
        best_auc   = auc
        best_model = pipeline
        best_name  = name

print(f"\n🏆 Best on Val: {best_name} (ROC-AUC={best_auc:.4f})")
pd.DataFrame.from_dict(results, orient='index', columns=['accuracy','roc_auc']) \
  .sort_values('roc_auc', ascending=False)


Naive Bayes          → Acc: 0.9641, ROC-AUC: 0.9738
Logistic Regression  → Acc: 0.9623, ROC-AUC: 0.9755
SVM                  → Acc: 0.9838, ROC-AUC: 0.9761

🏆 Best on Val: SVM (ROC-AUC=0.9761)


Unnamed: 0,accuracy,roc_auc
SVM,0.983842,0.976051
Logistic Regression,0.962298,0.975519
Naive Bayes,0.964093,0.973756


In [6]:
if best_name in param_grids:
    print(f"Tuning hyperparams for {best_name}…")
    gs = GridSearchCV(
        best_model,
        param_grids[best_name],
        cv=5,
        scoring="roc_auc",
        n_jobs=-1
    )
    gs.fit(X_train, y_train)
    best_model = gs.best_estimator_
    print(f"→ Best params: {gs.best_params_}")


Tuning hyperparams for SVM…
→ Best params: {'clf__C': 1}


In [7]:
X_test, y_test = test_df['message'], test_df['label']
test_preds = best_model.predict(X_test)
test_probs = best_model.predict_proba(X_test)[:,1]

print("📊 Test Set Performance:")
print(classification_report(y_test, test_preds, digits=4))
print(f"Accuracy: {accuracy_score(y_test, test_preds):.4f}")
print(f"ROC-AUC:   {roc_auc_score(y_test, test_probs):.4f}")


📊 Test Set Performance:
              precision    recall  f1-score   support

           0     0.9856    0.9917    0.9886       483
           1     0.9444    0.9067    0.9252        75

    accuracy                         0.9803       558
   macro avg     0.9650    0.9492    0.9569       558
weighted avg     0.9801    0.9803    0.9801       558

Accuracy: 0.9803
ROC-AUC:   0.9871


In [8]:
# Extract vectorizer and classifier
vectorizer = best_model.named_steps['tfidf']
classifier = best_model.named_steps['clf']

joblib.dump(vectorizer, Path("models")/"tfidf_vectorizer.joblib")
joblib.dump(classifier, Path("models")/f"clf_{best_name.replace(' ','_')}.joblib")
print(f"✅ Saved TF-IDF and {best_name} classifier in models/")


✅ Saved TF-IDF and SVM classifier in models/


In [9]:
# ====== Persist full pipeline for scoring ======
import joblib
from pathlib import Path

pipeline_path = Path("models")/"pipeline_SVM.joblib"
joblib.dump(best_model, pipeline_path)
print(f"✅ Full pipeline saved at: {pipeline_path}")


✅ Full pipeline saved at: models\pipeline_SVM.joblib


In [10]:
from score import score
p, prop = score("Free money", 0.5)
print(p, prop)


True 0.529251082319466


In [12]:
import subprocess
res = subprocess.run(["pytest","test_score.py","-q"], capture_output=True, text=True)
print(res.stdout)
assert res.returncode == 0
print("✅ Unit tests passed!")


[32m.[0m[32m.[0m[32m.[0m[32m                                                                      [100%][0m
[32m[32m[1m3 passed[0m[32m in 1.59s[0m[0m

✅ Unit tests passed!


In [13]:
import subprocess, time, requests

# 1) Launch the Flask app in a subprocess
proc = subprocess.Popen(["python", "app.py"])
time.sleep(2)  # give it time to start

try:
    # 2) Send a test request
    resp = requests.post(
        "http://127.0.0.1:5000/score",
        json={"text": "Free lottery, click here to claim!"}
    )
    print("Status code:", resp.status_code)
    print("JSON response:", resp.json())
    assert resp.status_code == 200
    jr = resp.json()
    assert "prediction" in jr and "propensity" in jr
    assert isinstance(jr["prediction"], int)
    assert 0.0 <= jr["propensity"] <= 1.0
    print("✅ Smoke test passed")
finally:
    # 3) Tear down the Flask app
    proc.terminate()
    proc.wait()


Status code: 200
JSON response: {'prediction': 1, 'propensity': 0.9999960798222837}
✅ Smoke test passed


In [15]:
!pip install coverage --quiet



[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: C:\Users\samar\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [16]:
import coverage
import pytest

# Start coverage
cov = coverage.Coverage(source=["."], omit=["*/venv/*", "*/.venv/*", "*/__pycache__/*"])
cov.start()

# Run all tests
pytest.main(["-q", "--disable-warnings", "--maxfail=1"])

# Stop and save
cov.stop()
cov.save()

# Write report to file
with open("coverage.txt", "w") as f:
    cov.report(file=f)

# Print summary to screen
cov.report()
print("\n✅ coverage.txt created.")


[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                                     [100%][0m
[32m[32m[1m4 passed[0m[32m in 2.13s[0m[0m
Name            Stmts   Miss  Cover
-----------------------------------
app.py             15     15     0%
score.py            8      5    38%
test_flask.py      15      0   100%
test_score.py      19      0   100%
-----------------------------------
TOTAL              57     20    65%

✅ coverage.txt created.
