# **1. Import Library**

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import joblib
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, classification_report

# **2. Memuat Dataset**

## Tampilkan Data

In [5]:
df_cleaned = pd.read_csv('https://raw.githubusercontent.com/Sulbae/SMSML_Anggun-Sulis-Setyawan/refs/heads/main/Eksperimen_SML_Anggun-Sulis-Setyawan/Preprocessing/water_potability_preprocessing')

df_cleaned.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,7.036752,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,333.073546,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,333.073546,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


# **3. Persiapan Data**

## Pembagian Data Pelatihan

In [9]:
def split_data(df, target='Potability', test_size=0.25, random_state=42):
    X = df_cleaned.drop(columns='Potability', axis=1)
    y = df_cleaned['Potability']
    return train_test_split(X, y, test_size=test_size, stratify=y, random_state=random_state)

In [12]:
X_train, X_test, y_train, y_test = split_data(df_cleaned, target='Potability', test_size=0.25, random_state=42)

print(f"X_train Shape: {X_train.shape}")
print(f"y_train Shape: {y_train.shape}")
print(f"X_test Shape: {X_test.shape}")
print(f"y_test Shape: {y_test.shape}")

X_train Shape: (2457, 9)
y_train Shape: (2457,)
X_test Shape: (819, 9)
y_test Shape: (819,)


## Pelatihan Model

In [17]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    results = []

    # RF Model
    rf_pipeline = Pipeline([
        ('model', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    rf_pipeline.fit(X_train, y_train)
    y_pred_rf = rf_pipeline.predict(X_test)
    rf_report = pd.DataFrame(classification_report(y_test, y_pred_rf, output_dict=True)).T
    rf_report['model'] = 'Random Forest'
    results.append(rf_report)

    # SVC Model
    svc_pipeline = Pipeline([
        ('model', SVC(kernel='rbf', probability=True, random_state=42))
    ])
    svc_pipeline.fit(X_train, y_train)
    y_pred_svc = svc_pipeline.predict(X_test)
    svc_report = pd.DataFrame(classification_report(y_test, y_pred_svc, output_dict=True)).T
    svc_report['model'] = 'SVC'
    results.append(svc_report)

    combined = pd.concat(results).reset_index().rename(columns={'index' : 'metrics'})
    comparison = combined[combined['metrics'].isin(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])]
    return comparison[['model', 'metrics', 'precision', 'recall', 'f1-score', 'support']].round(2)

In [22]:
models_training = train_and_evaluate_models(X_train, X_test, y_train, y_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
models_training.sort_values(by='metrics')

Unnamed: 0,model,metrics,precision,recall,f1-score,support
0,Random Forest,0,0.67,0.88,0.76,499.0
5,SVC,0,0.61,1.0,0.76,499.0
1,Random Forest,1,0.64,0.32,0.43,320.0
6,SVC,1,0.0,0.0,0.0,320.0
2,Random Forest,accuracy,0.66,0.66,0.66,0.66
7,SVC,accuracy,0.61,0.61,0.61,0.61
3,Random Forest,macro avg,0.65,0.6,0.6,819.0
8,SVC,macro avg,0.3,0.5,0.38,819.0
4,Random Forest,weighted avg,0.66,0.66,0.63,819.0
9,SVC,weighted avg,0.37,0.61,0.46,819.0
