In [1]:
# Test Simulations

def test_load_bounds(df):
    assert df["load_factor"].between(0,1).all()

In [3]:
# Test Language Features

def test_text_not_empty(df):
    assert df["text"].str.len().min() > 20

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

from src.constants import DATA_PATH_PROCESSED, TEXT_COLUMN
from src.nlp.features import build_tfidf
from src.nlp.models import build_classifier
from src.nlp.evaluation import evaluate

def main():
    print("Loading processed dataset...")
    df = pd.read_csv(DATA_PATH_PROCESSED)

    assert TEXT_COLUMN in df.columns, "Missing text column"

    # Create simple risk label
    df["high_risk"] = (df["load_factor"] > 0.7).astype(int)

    X = df[TEXT_COLUMN]
    y = df["high_risk"]

    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print("Building TF-IDF features...")
    vectorizer = build_tfidf(max_features=2000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    print("Training model...")
    model = build_classifier()
    model.fit(X_train_vec, y_train)

    print("Evaluating model...")
    preds = model.predict(X_test_vec)
    evaluate(y_test, preds)

    print("\n✅ Pipeline sanity check PASSED")

if __name__ == "__main__":
    main()


Loading processed dataset...
Splitting data...
Building TF-IDF features...
Training model...
Evaluating model...
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       595
           1       0.17      0.20      0.18         5

    accuracy                           0.98       600
   macro avg       0.58      0.60      0.59       600
weighted avg       0.99      0.98      0.99       600


✅ Pipeline sanity check PASSED
