In [1]:
!pip freeze | grep scikit-learn

scikit-learn==1.5.1


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_curve, auc, classification_report, confusion_matrix, precision_score, recall_score, f1_score

import mlflow
import mlflow.sklearn
from google.cloud import storage as gcs_storage
from google.auth.exceptions import DefaultCredentialsError

In [3]:
# Notebook configuration: where the raw data is read from and where predictions go.
# input_file = f'https://github.com/Tiamz01/water__quality_classification_mlops/blob/master/01-data_collection_and_model/data/waterQuality1.csv'
input_file = "01-data_collection_and_model/data/waterQuality.csv"
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
output_file = "output/outcome.csv"

# Run identifier used to locate the logged model; the RUN_ID environment
# variable takes precedence over the default below when it is set.
RUN_ID = os.getenv('RUN_ID', 'b148d239d6b84b08961892fe7b8dfc95')

In [4]:
def get_and_clean_data(input_file):
    """Read a CSV file and return only its fully numeric, complete rows.

    Every column is converted with ``pd.to_numeric(errors='coerce')``, so any
    cell that cannot be parsed as a number (for example the ``'#NUM!'``
    marker) becomes NaN. Rows containing at least one NaN are then dropped.

    Parameters
    ----------
    input_file : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows. The original row labels are
        kept (the index is not reset), so gaps mark the rows that were removed.
    """
    raw = pd.read_csv(input_file)

    # Coercion already maps '#NUM!' and every other non-numeric token to NaN,
    # so no separate in-place replace() pass is required beforehand.
    numeric = raw.apply(pd.to_numeric, errors='coerce')

    # A single dropna() on the coerced frame removes every incomplete row.
    return numeric.dropna()

In [5]:
# Load and clean the raw CSV; the trailing bare `data` expression makes the
# notebook render the resulting frame as this cell's output.
data = get_and_clean_data(input_file)
data

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,1.65,9.08,0.04,2.85,0.007,0.35,0.83,0.17,0.05,0.20,...,0.054,16.08,1.13,0.007,37.75,6.78,0.08,0.34,0.02,1.0
1,2.32,21.16,0.01,3.31,0.002,5.28,0.68,0.66,0.90,0.65,...,0.100,2.01,1.93,0.003,32.26,3.21,0.08,0.27,0.05,1.0
2,1.01,14.02,0.04,0.58,0.008,4.24,0.53,0.02,0.99,0.05,...,0.078,14.16,1.11,0.006,50.28,7.07,0.07,0.44,0.01,0.0
3,1.36,11.33,0.04,2.96,0.001,7.23,0.03,1.66,1.08,0.71,...,0.016,1.41,1.29,0.004,9.12,1.72,0.02,0.45,0.05,1.0
4,0.92,24.33,0.03,0.20,0.006,2.67,0.69,0.57,0.61,0.13,...,0.117,6.74,1.11,0.003,16.90,2.41,0.02,0.06,0.02,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7994,0.05,7.78,0.00,1.95,0.040,0.10,0.03,0.03,1.37,0.00,...,0.197,14.29,1.00,0.005,3.57,2.13,0.09,0.06,0.03,1.0
7995,0.05,24.22,0.02,0.59,0.010,0.45,0.02,0.02,1.48,0.00,...,0.031,10.27,1.00,0.001,1.48,1.11,0.09,0.10,0.08,1.0
7996,0.09,6.85,0.00,0.61,0.030,0.05,0.05,0.02,0.91,0.00,...,0.182,15.92,1.00,0.000,1.35,4.84,0.00,0.04,0.05,1.0
7997,0.01,10.00,0.01,2.00,0.000,2.00,0.00,0.09,0.00,0.00,...,0.000,0.00,0.00,0.000,0.00,0.00,0.00,0.00,0.00,1.0


In [6]:
def feature_engineering(df, test_size=0.3, random_state=42):
    """Standardize the feature columns and split them into train/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data containing an ``'is_safe'`` column (the target); every
        other column is treated as a feature.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The feature parts are the
        scaled arrays produced by ``StandardScaler``; the target parts are
        slices of ``df['is_safe']`` and keep the original row labels.

    Raises
    ------
    KeyError
        If ``df`` has no ``'is_safe'`` column.
    """
    # Features and target variable
    X = df.drop(columns=['is_safe'])
    y = df['is_safe']

    # The scaler is fitted on all rows *before* the split, so test rows
    # contribute to the mean/variance used for scaling.
    # NOTE(review): confirm this matches the preprocessing used when the model
    # was trained before changing the order of these two steps.
    X_scaled = StandardScaler().fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [7]:
def load_model(run_id):
    """Load the logged model for ``run_id`` through ``mlflow.pyfunc``.

    The model URI is built from a fixed ``gs://water_quality_model/1/`` prefix,
    the given run id, and the ``artifacts/model/gb_model`` suffix.

    Parameters
    ----------
    run_id : str
        Run identifier interpolated into the model URI.

    Returns
    -------
    The object returned by ``mlflow.pyfunc.load_model`` for that URI.
    """
    model_uri = f"gs://water_quality_model/1/{run_id}/artifacts/model/gb_model"
    return mlflow.pyfunc.load_model(model_uri)


def apply_model(input_file, run_id, output_file):
    """Score the held-out split of ``input_file`` and save the predictions.

    The data is loaded and cleaned with ``get_and_clean_data``, scaled and
    split with ``feature_engineering``, and the test split is passed to the
    model returned by ``load_model(run_id)``.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read.
    run_id : str
        Run identifier of the model to load; also recorded in the output.
    output_file : str or path-like
        Destination CSV. Its parent directory is created when missing; a bare
        file name (no directory part) is written to the current directory.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Actual'``, ``'Predicted'`` and ``'model_version'``, indexed
        by the row labels of the test split. The CSV is written without the
        index.

    Raises
    ------
    KeyError
        If the cleaned data has no ``'is_safe'`` column.
    """
    df = get_and_clean_data(input_file)

    # Fail early with a clear message instead of deep inside the split step.
    if 'is_safe' not in df.columns:
        raise KeyError("'is_safe' column is missing from the data.")

    # Only the held-out part is scored, so the training split is discarded.
    _, X_test, _, y_test = feature_engineering(df)

    model = load_model(run_id)
    y_pred = model.predict(X_test)

    # Save the results
    df_result = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    df_result['model_version'] = run_id

    # dirname() is '' for a bare file name and makedirs('') raises, so only
    # create the directory when there is one; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_result.to_csv(output_file, index=False)

    return df_result
    

In [8]:
# Run the scoring step with the configured paths and RUN_ID; the returned
# frame is rendered as this cell's output.
apply_model(input_file=input_file, run_id=RUN_ID, output_file=output_file)


 - psutil (current: 5.9.0, required: psutil==6.0.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


Unnamed: 0,Actual,Predicted,model_version
5488,0.0,0.0,b148d239d6b84b08961892fe7b8dfc95
2577,0.0,0.0,b148d239d6b84b08961892fe7b8dfc95
3692,1.0,1.0,b148d239d6b84b08961892fe7b8dfc95
7465,0.0,0.0,b148d239d6b84b08961892fe7b8dfc95
7564,0.0,0.0,b148d239d6b84b08961892fe7b8dfc95
...,...,...,...
1471,0.0,0.0,b148d239d6b84b08961892fe7b8dfc95
6257,0.0,0.0,b148d239d6b84b08961892fe7b8dfc95
4323,0.0,0.0,b148d239d6b84b08961892fe7b8dfc95
4747,0.0,0.0,b148d239d6b84b08961892fe7b8dfc95
