In [8]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, classification_report
import mlflow


def validate_model(data_path='data/processed_data.csv', model_path='model/logreg_model.pkl'):
    """Score a saved model against a processed CSV and log the accuracy to MLflow.

    The CSV is read with pandas, the feature columns listed below are
    selected, missing ``Age``/``Fare`` values are replaced by the column
    mean, and the model loaded from ``model_path`` predicts ``Survived``.
    Accuracy and a classification report are printed, and the accuracy is
    logged as the ``validation_accuracy`` metric of an MLflow run named
    ``model_validation``.

    Args:
        data_path: Path to the processed CSV file.
        model_path: Path to the joblib-serialized model.

    Returns:
        The accuracy computed by ``accuracy_score`` on the loaded data.

    Raises:
        KeyError: If the CSV lacks any required feature column or the
            ``Survived`` target column.
    """
    # Use exactly the same columns as in training.
    feature_columns = [
        'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
        'Sex_male', 'Embarked_Q', 'Embarked_S',
    ]
    target_column = 'Survived'

    with mlflow.start_run(run_name="model_validation"):
        df = pd.read_csv(data_path)

        # Fail early with an explicit message instead of pandas' generic
        # "not in index" KeyError when the file has an unexpected schema.
        missing = [
            column
            for column in feature_columns + [target_column]
            if column not in df.columns
        ]
        if missing:
            raise KeyError(
                f"{data_path} is missing required column(s): {missing}"
            )

        X = df[feature_columns].copy()
        y = df[target_column]

        # NOTE(review): the fill value is the mean of *this* dataset, not of
        # the training data; confirm this matches the training-time
        # preprocessing, otherwise reuse the training statistics.
        for column in ('Age', 'Fare'):
            X[column] = X[column].fillna(X[column].mean())

        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only load model files from trusted sources.
        model = joblib.load(model_path)

        preds = model.predict(X)

        accuracy = accuracy_score(y, preds)
        print(f" Validation Accuracy: {accuracy:.4f}")
        print(classification_report(y, preds))

        mlflow.log_metric("validation_accuracy", accuracy)

    # Hand the metric back so callers (tests, pipelines) can act on it
    # without parsing stdout or querying MLflow.
    return accuracy


# Run the validation with the default data and model paths when this file is
# executed directly (or as a notebook cell) rather than imported.
if __name__ == "__main__":
    validate_model()


 Validation Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       266
           1       1.00      1.00      1.00       152

    accuracy                           1.00       418
   macro avg       1.00      1.00      1.00       418
weighted avg       1.00      1.00      1.00       418

