In [55]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# ML & preprocessing
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterSampler
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


# Imbalanced
from imblearn.over_sampling import SMOTE
# Shared SMOTE oversampler with a fixed seed; the cross-validation loop later in
# the notebook calls fit_resample on it for each training fold.
smote = SMOTE(random_state=42)

# External models
try:
    from xgboost import XGBClassifier
except Exception:
    XGBClassifier = None
try:
    from lightgbm import LGBMClassifier
except Exception:
    LGBMClassifier = None

# XAI (optional)
try:
    import shap
except Exception:
    shap = None
try:
    from lime.lime_tabular import LimeTabularExplainer
except Exception:
    LimeTabularExplainer = None

# I/O
import json
import matplotlib.pyplot as plt

In [56]:
# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = 'Dataset/patient_adherence_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [57]:
# Preview the first ten rows of the raw frame.
df.head(n=10)

Unnamed: 0,Age,Gender,Medication_Type,Dosage_mg,Previous_Adherence,Education_Level,Income,Social_Support_Level,Condition_Severity,Comorbidities_Count,Healthcare_Access,Mental_Health_Status,Insurance_Coverage,Adherence
0,57,Male,TypeA,136,1,High School,634934,Medium,Severe,3,Poor,Good,1,1
1,47,Male,TypeA,134,1,High School,297954,High,Moderate,3,Good,Good,1,0
2,59,Male,TypeC,89,1,High School,789337,High,Moderate,1,Good,Good,1,1
3,72,Male,TypeB,240,0,Postgraduate,267352,Low,Mild,2,Poor,Moderate,1,0
4,46,Male,TypeA,241,1,Postgraduate,718447,Medium,Mild,1,Poor,Good,1,0
5,46,Female,TypeA,242,1,Graduate,389971,Medium,Mild,1,Good,Good,1,1
6,73,Female,TypeA,215,1,High School,532785,Low,Mild,1,Poor,Good,1,1
7,61,Female,TypeC,93,1,High School,545197,Low,Mild,0,Poor,Good,1,1
8,42,Male,TypeA,148,1,Postgraduate,539541,High,Moderate,1,Poor,Good,1,1
9,58,Male,TypeC,208,1,Postgraduate,634899,Low,Mild,2,Average,Good,1,1


In [58]:
# (rows, columns) of the raw frame as loaded.
df.shape

(5000, 14)

# Definisi Fitur

In [59]:
# Name of the label column.
target_features = 'Adherence'

# Columns treated as numeric: imputed with the median later on.
numerical_features = [
    'Age',
    'Dosage_mg',
    'Previous_Adherence',
    'Income',
    'Comorbidities_Count',
    'Insurance_Coverage',
]

# Columns treated as categorical: imputed with 'Unknown' and one-hot encoded later on.
kategorical_features = [
    'Gender',
    'Medication_Type',
    'Education_Level',
    'Social_Support_Level',
    'Condition_Severity',
    'Healthcare_Access',
    'Mental_Health_Status',
]

In [60]:
# Impute missing values in one pass: each numeric column gets its own median,
# each categorical column gets the literal label 'Unknown'.
# NOTE(review): the medians are computed on the whole frame, i.e. before the
# train/test split that happens further down.
fill_values = {col: df[col].median() for col in numerical_features}
fill_values.update({col: 'Unknown' for col in kategorical_features})
df = df.fillna(value=fill_values)

# One-hot encode the categorical columns, dropping the first level of each.
df = pd.get_dummies(df, columns=kategorical_features, drop_first=True)
print("Data setelah penanganan missing values dan encoding:")
df.head(5)

Data setelah penanganan missing values dan encoding:


Unnamed: 0,Age,Dosage_mg,Previous_Adherence,Income,Comorbidities_Count,Insurance_Coverage,Adherence,Gender_Male,Gender_Other,Medication_Type_TypeB,...,Education_Level_High School,Education_Level_Postgraduate,Social_Support_Level_Low,Social_Support_Level_Medium,Condition_Severity_Moderate,Condition_Severity_Severe,Healthcare_Access_Good,Healthcare_Access_Poor,Mental_Health_Status_Moderate,Mental_Health_Status_Poor
0,57,136,1,634934,3,1,1,True,False,False,...,True,False,False,True,False,True,False,True,False,False
1,47,134,1,297954,3,1,0,True,False,False,...,True,False,False,False,True,False,True,False,False,False
2,59,89,1,789337,1,1,1,True,False,False,...,True,False,False,False,True,False,True,False,False,False
3,72,240,0,267352,2,1,0,True,False,True,...,False,True,True,False,False,False,False,True,True,False
4,46,241,1,718447,1,1,0,True,False,False,...,False,True,False,True,False,False,False,True,False,False


In [61]:
# Separate the label column from the predictor columns.
y = df[target_features]
X = df.drop(columns=[target_features])

# Labels outside {0, 1} are re-coded to integers in order of first appearance.
observed_labels = y.unique()
if any(label not in (0, 1) for label in observed_labels):
    mapping = dict(zip(observed_labels, range(len(observed_labels))))
    y = y.map(mapping)
    print("\nMapping target dilakukan:", mapping)

# Train Test Split

In [62]:
# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class ratio, and fix the seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [63]:
# Standardize every column of X (numeric and one-hot alike). The scaler is
# fitted on the training split only and then reused for the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

print("\nTrain/Test Size:", X_train.shape, X_test.shape)


Train/Test Size: (4000, 20) (1000, 20)


In [64]:
# Convert all four splits to plain NumPy arrays so the cross-validation loop can
# index them positionally with the fold indices.
X_train_scaled, X_test_scaled, y_train, y_test = (
    np.array(part) for part in (X_train_scaled, X_test_scaled, y_train, y_test)
)

# Model & Parameter

In [65]:
def _spec(estimator, search_space):
    """Bundle a base estimator with the hyper-parameter lists to sample from."""
    return {'model': estimator, 'params': search_space}


# Registry of candidate models. Insertion order is the order in which the
# models are listed later, so the optional boosters are slotted in between.
models_param = {
    # 1 Logistic Regression
    'LogisticRegression': _spec(
        LogisticRegression(max_iter=2000, random_state=42),
        {'C': [0.01, 0.1, 1, 10], 'class_weight': [None, 'balanced']},
    ),
    # 2 Decision Tree
    'DecisionTree': _spec(
        DecisionTreeClassifier(random_state=42),
        {'max_depth': [3, 5, 10, None], 'min_samples_split': [2, 5, 10]},
    ),
    # 3 Random Forest
    'RandomForest': _spec(
        RandomForestClassifier(random_state=42),
        {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 5, 10],
            'class_weight': [None, 'balanced'],
        },
    ),
    # 4 GradientBoosting (sklearn)
    'GradientBoosting': _spec(
        GradientBoostingClassifier(random_state=42),
        {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]},
    ),
}

# 5 XGBoost (optional dependency; the import cell sets the name to None when missing)
if XGBClassifier is None:
    print("XGBoost not available — skipped.")
else:
    models_param['XGBoost'] = _spec(
        XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5],
            'subsample': [0.7, 1.0],
        },
    )

# 6 LightGBM (optional dependency; the import cell sets the name to None when missing)
if LGBMClassifier is None:
    print("LightGBM not available — skipped.")
else:
    models_param['LightGBM'] = _spec(
        LGBMClassifier(random_state=42),
        {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1], 'num_leaves': [31, 64]},
    )

models_param.update({
    # 7 MLP
    'MLP': _spec(
        MLPClassifier(max_iter=1000, random_state=42),
        {
            'hidden_layer_sizes': [(50,), (100,)],
            'alpha': [1e-4, 1e-3],
            'learning_rate_init': [1e-3, 1e-2],
        },
    ),
    # 8 SVM (probability=True so predict_proba is available for AUC)
    'SVM': _spec(
        SVC(probability=True, random_state=42),
        {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear'], 'gamma': ['scale', 'auto']},
    ),
    # 9 Naive Bayes
    'NB': _spec(
        GaussianNB(),
        {'var_smoothing': [1e-9, 1e-8, 1e-7]},
    ),
    # 10 KNN
    'KNN': _spec(
        KNeighborsClassifier(),
        {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'p': [1, 2],  # Manhattan (1) & Euclidean (2)
        },
    ),
})




In [66]:
# List the registered model names in registration order.
print("Model Machine Learning:", [*models_param])


Model Machine Learning: ['LogisticRegression', 'DecisionTree', 'RandomForest', 'GradientBoosting', 'XGBoost', 'LightGBM', 'MLP', 'SVM', 'NB', 'KNN']


# Manual Randomized Search with SMOTE per fold

In [67]:
# 5-fold stratified splitter, shuffled with a fixed seed, used by the search loop.
skf = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=5,
)


In [68]:
# Model to tune. `name` was previously never assigned in the notebook, so this
# cell only worked with leftover kernel state and raised NameError on a fresh
# kernel. 'KNN' matches the recorded output; change it to tune another entry
# of models_param.
name = 'KNN'

params_grid = models_param[name]['params']
base_model = models_param[name]['model']

# No longer used to build candidates (see clone() below); kept so the name
# stays defined for anything that still refers to it.
model_class = base_model.__class__

# Draw 5 random hyper-parameter combinations from the grid (fixed seed).
sampler = ParameterSampler(params_grid, n_iter=5, random_state=42)

# Start below any reachable F1 so the first candidate is always recorded,
# even when every fold scores 0.
best_score = -np.inf
best_param = None

for params in sampler:
    # clone() keeps the constructor arguments configured in models_param
    # (random_state, max_iter, probability=True, eval_metric, ...); building
    # the estimator from its class with only the sampled values dropped them.
    model = clone(base_model).set_params(**params)
    fold_scores = []

    for train_idx, val_idx in skf.split(X_train_scaled, y_train):
        X_train_fold, X_val_fold = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

        # Oversample the training part of the fold only; the validation part
        # keeps its original class distribution.
        X_res, y_res = smote.fit_resample(X_train_fold, y_train_fold)

        model.fit(X_res, y_res)
        preds = model.predict(X_val_fold)

        f1 = f1_score(y_val_fold, preds)
        fold_scores.append(f1)

    avg_score = sum(fold_scores) / len(fold_scores)

    if avg_score > best_score:
        best_score = avg_score
        best_param = params

if best_param is None:
    raise RuntimeError(f"No hyper-parameter candidate could be scored for model {name!r}.")

print("Best param:", best_param, " Best F1:", best_score)

# Retrain the winning configuration on the full (oversampled) training split.
final_model = clone(base_model).set_params(**best_param)

sm = SMOTE(random_state=42)
X_res_full, y_res_full = sm.fit_resample(X_train_scaled, y_train)
final_model.fit(X_res_full, y_res_full)


Best param: {'weights': 'uniform', 'p': 2, 'n_neighbors': 9}  Best F1: 0.5857043886974074


0,1,2
,n_neighbors,9
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [71]:
# Hard class predictions for the held-out test split.
y_pred = final_model.predict(X_test_scaled)

# ROC AUC needs a continuous score: prefer the class-1 probability, fall back to
# decision_function output, and report None when the model exposes neither.
# Capability checks replace the former bare `except:` clauses, which also hid
# unrelated failures (and KeyboardInterrupt).
if hasattr(final_model, "predict_proba"):
    y_proba = final_model.predict_proba(X_test_scaled)[:, 1]
elif hasattr(final_model, "decision_function"):
    y_proba = final_model.decision_function(X_test_scaled)
else:
    y_proba = None

auc = None
if y_proba is not None:
    try:
        auc = roc_auc_score(y_test, y_proba)
    except ValueError as err:
        # roc_auc_score rejected the inputs; keep the report going but say why.
        print("AUC could not be computed:", err)

print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nAUC:", auc)

              precision    recall  f1-score   support

           0       0.64      0.53      0.58       543
           1       0.54      0.65      0.59       457

    accuracy                           0.59      1000
   macro avg       0.59      0.59      0.59      1000
weighted avg       0.60      0.59      0.59      1000

Confusion Matrix:
 [[290 253]
 [160 297]]

AUC: 0.6214059181707913
