In [1]:

import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from math import radians, sin, cos, sqrt, atan2
from sklearn.metrics import silhouette_score
import joblib
import os
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from datetime import datetime
import onnxruntime as ort
import json
import matplotlib.pyplot as plt

In [2]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Applies the haversine formula on a sphere of radius 6371 km.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.

    Returns:
        The distance along the sphere's surface, in kilometres.
    """
    R = 6371.0  # Earth's radius in km

    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)

    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

In [3]:
def make_dirs(model_dir) -> str:
    """Ensure ``model_dir`` exists on disk and hand the same path back.

    Missing parent directories are created too; a directory that already
    exists is left as it is instead of being treated as an error.
    """
    # exist_ok=True makes repeated calls (e.g. re-running the cell) harmless.
    os.makedirs(model_dir, exist_ok=True)
    return model_dir

In [4]:
def save_skLearn_model(model_dir: str, model, n_clusters: int) -> str:
    """Dump ``model`` with joblib into ``model_dir`` and return the file path.

    The file is named ``kmean_<n_clusters>_model_<YYYYmmdd_HHMMSS>.pkl``; the
    timestamp is taken from ``datetime.now()`` at the moment of the call.
    """
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    file_name = f"kmean_{n_clusters}_model_{stamp}.pkl"
    model_path = os.path.join(model_dir, file_name)

    joblib.dump(model, model_path)
    print(f"✅ Saved sklearn model to: {model_path}")
    return model_path

In [5]:
def convert_to_onnx(model, model_dir: str, n_clusters: int, n_features: int):
    """Convert a fitted scikit-learn model to ONNX and write it into ``model_dir``.

    The file is named ``kmean_<n_clusters>_model_<YYYYmmdd_HHMMSS>.onnx``.
    Conversion is best-effort: any failure is printed instead of raised.

    Args:
        model: Fitted scikit-learn estimator to convert.
        model_dir: Existing directory that receives the ``.onnx`` file.
        n_clusters: Cluster count, used only in the file name.
        n_features: Number of input columns the model expects.

    Returns:
        The path of the written ``.onnx`` file, or ``None`` when conversion or
        writing failed.
    """
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        onnx_path = os.path.join(model_dir, f"kmean_{n_clusters}_model_{timestamp}.onnx")
        # Single float tensor input; None leaves the number of rows unspecified.
        initial_type = [("float_input", FloatTensorType([None, n_features]))]
        onnx_model = convert_sklearn(model, initial_types=initial_type)
        # Serialize before opening the file so that a serialization failure
        # cannot leave an empty .onnx file behind.
        payload = onnx_model.SerializeToString()
        with open(onnx_path, "wb") as f:
            f.write(payload)
        print(f"✅ Saved ONNX model to: {onnx_path}")
        return onnx_path
    except Exception as e:
        # Deliberately broad: ONNX export is optional, so report and carry on.
        print(f"❌ Could not convert to ONNX: {e}")
        return None

In [6]:
# Load the cleaned coordinate dataset; the preview below shows the columns
# `id`, `latitude` and `longitude`.
# NOTE(review): absolute, machine-specific Windows path — consider a configurable data directory.
df = pd.read_parquet(r"E:\Hydroneo\Analytics\disease\data\cleaned_data_removed_ZERO.parquet", engine="pyarrow")

# Quick look at the first rows.
print(df.head())

                         id   latitude   longitude
0  66fd107a636caa2b6a7218b8  16.335354  102.254739
1  6707853b97fb7a0c60569ad7  13.706300  100.459700
2  670e17dd97fb7a0c60b3820b  13.706300  100.459700
3  670e182e97fb7a0c60b38670  13.753165  100.494722
4  670e191297fb7a0c60b392a1  13.706300  100.459700


In [7]:
# Directory for model artifacts (the argument expected by the save/convert helpers above).
# NOTE(review): absolute, machine-specific Windows path.
model_dir = r"E:\Hydroneo\Analytics\disease\models"

In [8]:
# coords = df[['latitude', 'longitude']].to_numpy()


# kmeans = KMeans(n_clusters=3, random_state=42)
# df['cluster'] = kmeans.fit_predict(coords)

# # Inspect results
# print(df.head())
    

In [9]:
def train_kmeans(df: pd.DataFrame, n_clusters: int) -> tuple[pd.DataFrame, KMeans]:
    """Fit K-Means on the ``latitude``/``longitude`` columns and label every row.

    Args:
        df: Frame containing ``latitude`` and ``longitude`` columns.
        n_clusters: Number of clusters to fit.

    Returns:
        ``(labelled, kmeans)`` — a copy of ``df`` with an added (or replaced)
        ``cluster`` column holding each row's label, and the fitted estimator.
    """
    coords = df[['latitude', 'longitude']].to_numpy()
    # Fixed random_state keeps the clustering reproducible between runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(coords)
    # assign() builds a new frame, so the caller's DataFrame is not mutated
    # and re-running the training cell stays idempotent.
    return df.assign(cluster=labels), kmeans

In [10]:
# Preview the frame before training adds the `cluster` column.
df.head()

Unnamed: 0,id,latitude,longitude
0,66fd107a636caa2b6a7218b8,16.335354,102.254739
1,6707853b97fb7a0c60569ad7,13.7063,100.4597
2,670e17dd97fb7a0c60b3820b,13.7063,100.4597
3,670e182e97fb7a0c60b38670,13.753165,100.494722
4,670e191297fb7a0c60b392a1,13.7063,100.4597


In [11]:
# Fit a 3-cluster model; `df` is rebound to the labelled frame and `kmeans` to the fitted estimator.
df, kmeans = train_kmeans(df, n_clusters=3)

In [12]:
# make_dirs(model_dir)

# save_skLearn_model(model_dir, kmeans, n_clusters=2)
# convert_to_onnx(kmeans, model_dir, n_clusters=2, n_features=2)

In [13]:
# Evaluate the clustering with the silhouette score, computed on the raw
# latitude/longitude values and the labels assigned during training.
score = silhouette_score(df[['latitude', 'longitude']], df['cluster'])
print(f"score : {score}")

score : 0.8947461368976874


In [14]:
# Show the fitted centers; each row is one cluster, columns follow the training order (lat, lon).
print("Cluster centers (lat, lon):")
print(kmeans.cluster_centers_)

Cluster centers (lat, lon):
[[ 13.60647156 100.27249098]
 [  6.8893645  100.02277898]
 [ 16.33535391 102.25473855]]


In [15]:
# Latitude (column 0) of the center of cluster 2.
print(kmeans.cluster_centers_[2][0])

16.3353539145895


In [16]:
# Example query coordinate: latitude and longitude in decimal degrees.
new_lat, new_lon = 13.556924, 100.0950911

# Shape it like the training data: a 2-D array holding one [lat, lon] row.
new_point = np.array([[new_lat, new_lon]])

# The prediction itself is made through predict_cluster(), defined in the next cell.

In [17]:
def predict_cluster(new_lat, new_lon, model=None)->int:
    """Return the index of the cluster a coordinate is assigned to.

    Args:
        new_lat: Latitude of the query point.
        new_lon: Longitude of the query point.
        model: Fitted estimator exposing ``predict``. Defaults to the
            notebook-level ``kmeans`` so existing two-argument calls keep working.

    Returns:
        The cluster index as a built-in ``int``.
    """
    if model is None:
        # Fall back to the model trained earlier in the notebook.
        model = kmeans
    # predict() takes a 2-D array: one row per sample, columns [lat, lon].
    new_point = np.array([[new_lat, new_lon]])
    # predict() yields a NumPy integer; convert it so the value matches the
    # annotated return type and is JSON-serializable.
    return int(model.predict(new_point)[0])

In [18]:
# Cluster assigned to the example coordinate.
print(f"{predict_cluster(new_lat, new_lon)}")

0


In [19]:
# Look up the center of the cluster the example coordinate was assigned to.
cluster_id = predict_cluster(new_lat, new_lon)
cluster_id_center = kmeans.cluster_centers_[cluster_id]
print(f"center of cluster {cluster_id_center}")

center of cluster [ 13.60647156 100.27249098]


In [20]:
# Longitude (index 1) of the assigned cluster's center.
print(cluster_id_center[1])

100.27249097561129


In [21]:
# Great-circle distance from the assigned cluster's center to the example coordinate.
distance_input_to_cluster = haversine(cluster_id_center[0], cluster_id_center[1],
                                      new_lat, new_lon)
# int() truncates toward zero, so the printed value is whole kilometres rounded down.
print(f"Distance: {int(distance_input_to_cluster)} km")

Distance: 19 km


In [22]:
# Same check against the center of cluster 1, for comparison with the assigned cluster.
distance_input_to_cluster = haversine(kmeans.cluster_centers_[1][0],kmeans.cluster_centers_[1][1],
                                      new_lat, new_lon)
# int() truncates toward zero, so the printed value is whole kilometres rounded down.
print(f"Distance: {int(distance_input_to_cluster)} km")

Distance: 741 km


In [23]:
# Same check against the center of cluster 2, for comparison with the assigned cluster.
distance_input_to_cluster = haversine(kmeans.cluster_centers_[2][0],kmeans.cluster_centers_[2][1],
                                      new_lat, new_lon)
# int() truncates toward zero, so the printed value is whole kilometres rounded down.
print(f"Distance: {int(distance_input_to_cluster)} km")

Distance: 386 km


In [24]:
# Path of a previously saved joblib model, loaded in the next cell.
# NOTE(review): the name looks like a typo for `joblib_model_path`, and the file name says
# `kmean_2` although this notebook trains 3 clusters — confirm this is the intended artifact.
joplib_model_path = r'E:\Hydroneo\Analytics\disease\models\kmean_2_model_20251029_110536.pkl'

In [25]:
# Reload the saved model and predict the example coordinate with it.
# NOTE: joblib.load unpickles the file — only load artifacts from a trusted source.
kmeans_loaded = joblib.load(joplib_model_path)

joblib_cluster_id = kmeans_loaded.predict(new_point)[0]

print(f"✅ Predicted cluster: {joblib_cluster_id}")

# Index with the loaded model's own prediction, not `cluster_id` from the
# in-memory model: the two models may number their clusters differently.
joblib_cluster_center = kmeans_loaded.cluster_centers_[joblib_cluster_id]

print(f"📍 Cluster center (lat, lon): {joblib_cluster_center}")

✅ Predicted cluster: 0
📍 Cluster center (lat, lon): [ 13.60647156 100.27249098]


In [26]:
# Path of a previously exported ONNX model, loaded in the next cell.
# NOTE(review): absolute, machine-specific Windows path.
onnx_model_path = r'E:\Hydroneo\Analytics\disease\models\kmean_2_model_20251029_110536.onnx'

In [27]:
# Rebuild the query point as float32 for the ONNX session; this rebinds `new_point`,
# which earlier cells created with NumPy's default dtype.
new_point = np.array([[new_lat, new_lon]], dtype=np.float32)  # <-- float32!

# Load ONNX model
session = ort.InferenceSession(onnx_model_path)
# Use the first declared input and the first declared output of the graph.
input_name = session.get_inputs()[0].name
output_name = session.get_outputs()[0].name
print("✅ ONNX model loaded")

# Predict cluster
# run() returns one array per requested output; [0][0] picks the first output's first row.
predicted_cluster = session.run([output_name], {input_name: new_point})[0][0]
print(f"✅ Predicted cluster: {predicted_cluster}")

✅ ONNX model loaded
✅ Predicted cluster: 0


In [28]:
# Export the fitted centers as a nested list of [lat, lon] pairs.
centers = kmeans.cluster_centers_.tolist()  # convert numpy array to list
# Relative path: the file lands in the kernel's current working directory.
with open("kmeans_centers.json", "w") as f:
    json.dump(centers, f)

In [31]:
# Collect each cluster's member coordinates as a list of [lat, lon] pairs.
clusters = df.groupby('cluster')[['latitude', 'longitude']].apply(lambda x: x.values.tolist())

# The loop variable is deliberately not called `cluster_id`: that notebook-level
# name holds the predicted cluster of the example coordinate, and overwriting it
# here would change the result of re-running the cells that read it.
for member_cluster, points in clusters.items():
    print(f"Cluster {member_cluster}:")
    for lat, lon in points:
        print(f"  ({lat}, {lon})")

Cluster 0:
  (13.7063, 100.4597)
  (13.7063, 100.4597)
  (13.753164564515151, 100.49472150566612)
  (13.7063, 100.4597)
  (13.59056, 100.10778)
  (13.59056, 100.10778)
  (13.59056, 100.10778)
  (13.80194, 100.32167)
  (13.80194, 100.32167)
  (13.728934351207734, 100.52873790264128)
  (13.556924, 100.0950911)
  (13.556924, 100.0950911)
  (13.556924, 100.0950911)
  (13.556924, 100.0950911)
  (13.556924, 100.0950911)
  (13.556924, 100.0950911)
  (13.556924, 100.0950911)
  (13.2017461, 101.2523792)
  (13.8018485, 100.3215101)
  (13.556924, 100.0950911)
  (13.556924, 100.0950911)
  (13.556924, 100.0950911)
  (13.728934351207734, 100.52873790264128)
  (13.8140293, 100.0372929)
  (13.527666663474438, 100.15839859843254)
  (13.527666663474438, 100.15839859843254)
  (13.527666663474438, 100.15839859843254)
  (13.527666663474438, 100.15839859843254)
  (13.527666663474438, 100.15839859843254)
  (13.527666663474438, 100.15839859843254)
  (13.527666663474438, 100.15839859843254)
  (13.5276666634744