In [1]:

import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from math import radians, sin, cos, sqrt, atan2
from sklearn.metrics import silhouette_score
import joblib
import os

In [2]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points, in kilometres.

    Applies the haversine formula on a sphere of radius 6371 km.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        Distance along the sphere's surface in kilometres (float).
    """
    R = 6371.0  # Earth's radius in km

    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)

    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

In [3]:
def make_dirs(model_dir) -> str:
    """Ensure ``model_dir`` exists (parents included) and hand the same path back.

    An already-existing directory is not an error.
    """
    target_dir = model_dir
    os.makedirs(target_dir, exist_ok=True)
    return target_dir

In [4]:
def save_model(model_dir, model_path, model, n_clusters):
    """Serialize a model into ``model_dir`` with joblib.

    The file name is derived from the cluster count:
    ``kmean_<n_clusters>_model.pkl``.

    Args:
        model_dir: Directory that receives the file; created if missing.
        model_path: Ignored. Kept only so existing callers keep working; the
            destination is always built from ``model_dir`` and ``n_clusters``.
        model: Object to persist (passed straight to ``joblib.dump``).
        n_clusters: Cluster count embedded in the file name.

    Returns:
        The path of the file that was written.
    """
    # joblib.dump does not create missing parent directories.
    os.makedirs(model_dir, exist_ok=True)
    target_path = os.path.join(model_dir, f"kmean_{n_clusters}_model.pkl")
    joblib.dump(model, target_path)
    return target_path

In [5]:
# Load the cleaned coordinate table from parquet and preview the first rows.
# NOTE(review): absolute Windows path; this cell only runs on a machine with this layout.
data_path = r"E:\Hydroneo\Analytics\disease\data\cleaned_data_removed_ZERO.parquet"
df = pd.read_parquet(data_path, engine="pyarrow")

print(df.head())

                         id   latitude   longitude
0  66fd107a636caa2b6a7218b8  16.335354  102.254739
1  6707853b97fb7a0c60569ad7  13.706300  100.459700
2  670e17dd97fb7a0c60b3820b  13.706300  100.459700
3  670e182e97fb7a0c60b38670  13.753165  100.494722
4  670e191297fb7a0c60b392a1  13.706300  100.459700


In [6]:
# Directory intended for saved models.
# NOTE(review): absolute, machine-specific path; no cell visible here passes it
# to make_dirs/save_model yet.
model_dir = r"E:\Hydroneo\Analytics\disease\models"

In [7]:
# coords = df[['latitude', 'longitude']].to_numpy()


# kmeans = KMeans(n_clusters=3, random_state=42)
# df['cluster'] = kmeans.fit_predict(coords)

# # Inspect results
# print(df.head())
    

In [8]:
def train_kmeans(df: pd.DataFrame, n_clusters: int, random_state: int = 42) -> tuple[pd.DataFrame, KMeans]:
    """Fit K-Means on the ``latitude``/``longitude`` columns and label every row.

    Args:
        df: Frame containing ``latitude`` and ``longitude`` columns.
        n_clusters: Number of clusters to fit.
        random_state: Seed forwarded to ``KMeans`` (defaults to 42).

    Returns:
        A tuple ``(labelled, kmeans)`` where ``labelled`` is a new frame equal
        to ``df`` plus a ``cluster`` column holding each row's label, and
        ``kmeans`` is the fitted estimator. The input frame is left unmodified.

    Note:
        Clustering runs on the raw coordinate values, so distances between
        points are Euclidean in degree space rather than great-circle.
    """
    coords = df[['latitude', 'longitude']].to_numpy()
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = kmeans.fit_predict(coords)
    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return df.assign(cluster=labels), kmeans

In [9]:
# Fit a 3-cluster model; `df` is rebound to the frame returned by train_kmeans
# and `kmeans` is the fitted estimator used by the cells below.
df, kmeans = train_kmeans(df, n_clusters=3)

In [10]:
# Silhouette of the assigned labels, computed on the same two coordinate columns used for training.
cluster_labels = df['cluster']
score = silhouette_score(df[['latitude', 'longitude']], cluster_labels)
print(f"score : {score}")

score : 0.8947461368976874


In [11]:
# One row per cluster; columns follow the training column order (latitude, longitude).
print("Cluster centers (lat, lon):")
print(kmeans.cluster_centers_)

Cluster centers (lat, lon):
[[ 13.60647156 100.27249098]
 [  6.8893645  100.02277898]
 [ 16.33535391 102.25473855]]


In [12]:
# Latitude (column 0) of the center at index 2.
print(kmeans.cluster_centers_[2][0])

16.3353539145895


In [13]:
# Example query coordinate (latitude, longitude in degrees) used by the cells below.
new_lat, new_lon = 13.556924, 100.0950911

# # Make sure it’s in the same format as training data [[lat, lon]]
# new_point = np.array([[new_lat, new_lon]])

# # Predict cluster
# cluster_id = kmeans.predict(new_point)[0]
# print(f"New point ({new_lat}, {new_lon}) belongs to cluster {cluster_id}")

In [14]:
def predict_cluster(new_lat, new_lon, model=None)->int:
    """Return the cluster id a single coordinate is assigned to.

    Args:
        new_lat: Latitude of the query point.
        new_lon: Longitude of the query point.
        model: Fitted estimator exposing ``predict``. When omitted, the
            notebook-level ``kmeans`` variable is used, so that variable must
            already exist.

    Returns:
        The predicted cluster id as a built-in ``int``.
    """
    if model is None:
        # Fall back to the notebook-level model produced by the training cell.
        model = kmeans

    # Same [[lat, lon]] layout as the training data.
    new_point = np.array([[new_lat, new_lon]])

    # predict() returns an array of labels; convert the single entry so the
    # declared `-> int` return type actually holds.
    return int(model.predict(new_point)[0])

In [15]:
# Cluster id assigned to the example coordinate.
print(f"{predict_cluster(new_lat, new_lon)}")

0


In [16]:
# Look up the center of the cluster the example coordinate falls into.
cluster_id = predict_cluster(new_lat, new_lon)
# cluster_centers_ is indexed by cluster id; each row is (latitude, longitude).
cluster_id_center = kmeans.cluster_centers_[cluster_id]
print(f"center of cluster {cluster_id_center}")

center of cluster [ 13.60647156 100.27249098]


In [17]:
# Longitude (column 1) of the selected cluster center.
print(cluster_id_center[1])

100.27249097561129


In [18]:
# Great-circle distance from the example point to the center of its own cluster.
center_lat, center_lon = cluster_id_center
distance_input_to_cluster = haversine(center_lat, center_lon, new_lat, new_lon)
# int() truncates toward zero, so the printed value is whole kilometres, not rounded.
print(f"Distance: {int(distance_input_to_cluster)} km")

Distance: 19 km


In [19]:
# Great-circle distance from the example point to the center at index 1.
center_lat, center_lon = kmeans.cluster_centers_[1]
distance_input_to_cluster = haversine(center_lat, center_lon, new_lat, new_lon)
# int() truncates toward zero, so the printed value is whole kilometres, not rounded.
print(f"Distance: {int(distance_input_to_cluster)} km")

Distance: 741 km


In [20]:
# Great-circle distance from the example point to the center at index 2.
center_lat, center_lon = kmeans.cluster_centers_[2]
distance_input_to_cluster = haversine(center_lat, center_lon, new_lat, new_lon)
# int() truncates toward zero, so the printed value is whole kilometres, not rounded.
print(f"Distance: {int(distance_input_to_cluster)} km")

Distance: 386 km
