In [10]:
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [7]:
def preprocess_data(data, target_column, impute_method, save_path, test_size=0.25, random_state=42):
    """Split a dataset, impute missing feature values, and persist the fitted imputer.

    The features are separated from the target, split into train and test
    partitions (stratified on the target), and passed through a one-step
    imputation pipeline that is fitted on the training partition only. The
    fitted pipeline is written to ``save_path`` so it can be reused later.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset holding the feature columns and the target column.
    target_column : str
        Name of the column in ``data`` used as the target.
    impute_method : str
        Value forwarded to ``SimpleImputer(strategy=...)``, e.g. ``'median'``.
    save_path : str, path-like, or file object
        Destination handed to ``joblib.dump``. When it is a string or
        path-like, missing parent directories are created first.
    test_size : float or int, default=0.25
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default=42
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the two feature matrices
        are the outputs of the pipeline's ``fit_transform`` / ``transform``
        and the two targets are the split target column.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``data``.
    """
    import os  # function-scope import: the block stays runnable without touching the import cell

    # Fail early with a readable message instead of the generic error from drop().
    if target_column not in data.columns:
        raise KeyError(f"Target column {target_column!r} not found in data")

    # Separate features and target.
    X = data.drop(columns=[target_column])
    y = data[target_column]

    # Train/test split; stratify=y asks train_test_split to stratify on the target.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Imputation pipeline (missing-value handling).
    cleaning = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy=impute_method)),
    ])

    # Fit on the training features only, then reuse the fitted pipeline on the
    # test features so no statistics are learned from the test partition.
    X_train = cleaning.fit_transform(X_train)
    X_test = cleaning.transform(X_test)

    # Make sure the destination directory exists; file objects are passed through as-is.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(save_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Persist the fitted preprocessor for later reuse.
    dump(cleaning, save_path)

    return X_train, X_test, y_train, y_test

In [8]:
# Load the raw water-potability CSV directly from the project's GitHub repository
# (requires network access when the cell is run).
data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/Sulbae/SMSML_Anggun-Sulis-Setyawan/refs/heads/main/Eksperimen_SML_Anggun-Sulis-Setyawan/water_potability_raw.csv',
)

# Preview the first rows as the cell's displayed output.
data.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [9]:
# Split the loaded frame on the 'Potability' target, impute missing feature
# values with the median strategy, and save the fitted pipeline next to the notebook.
X_train, X_test, y_train, y_test = preprocess_data(
    data,
    target_column='Potability',
    impute_method='median',
    save_path='preprocessor_pipeline.joblib',
)

In [11]:
def inference(new_data, load_path):
    """Apply a previously saved preprocessing pipeline to new data.

    Parameters
    ----------
    new_data
        Data passed unchanged to the loaded pipeline's ``transform`` method.
    load_path
        Location handed to ``joblib.load`` to restore the pipeline.

    Returns
    -------
    The value returned by the loaded pipeline's ``transform(new_data)``.

    Notes
    -----
    ``joblib.load`` is pickle-based, so only load files from a trusted source.
    """
    # Restore the fitted preprocessing pipeline from disk.
    pipeline = load(load_path)
    print(f"Pipeline preprocessing dimuat dari: {load_path}")

    # Run the new data through the restored pipeline and return the result.
    return pipeline.transform(new_data)