In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
# Step 1: Load Dataset
file_path = 'dataset_corona_cleaned.csv'

# Read the CSV. Any failure is reported and then re-raised unchanged so the
# notebook stops here instead of continuing without `data`.
try:
    data = pd.read_csv(file_path, delimiter=',', engine='python')
except Exception as load_error:
    print("Error loading the file:", load_error)
    raise

# Fallback: if the parser produced a single column, split that column's
# string values on commas so each field gets its own column.
parsed_as_single_column = len(data.columns) == 1
if parsed_as_single_column:
    first_column = data.iloc[:, 0]
    data = first_column.str.split(',', expand=True)

# Column names applied positionally to the loaded frame.
corrected_columns = [
    'Object_ID',
    'Provinsi',
    'Tanggal',
    'Kasus_Terkonfirmasi_Akumulatif',
    'Penambahan_Harian_Kasus_Terkonfirmasi',
    'Kasus_Sembuh_Akumulatif',
    'Penambahan_Harian_Kasus_Sembuh',
    'Kasus_Meninggal_Akumulatif',
    'Penambahan_Harian_Kasus_Meninggal',
    'Kasus_Aktif_Akumulatif',
    'CFR_Harian',
    'RI_Harian',
    'FID',
    'ObjectId',
]

# Guard: the rename is only valid when the column counts agree; otherwise show
# what was detected and abort.
if len(data.columns) != len(corrected_columns):
    print(f"Column mismatch. Found {len(data.columns)} columns, expected {len(corrected_columns)}.")
    print("Detected columns:", data.columns)
    raise ValueError("The dataset structure does not match expected column names.")

data.columns = corrected_columns

In [4]:
import numpy as np

# Step 2: Preprocessing Data
# Feature columns fed to the scaler and, afterwards, to the clustering step.
columns_to_use = [
    'Kasus_Terkonfirmasi_Akumulatif', 'Penambahan_Harian_Kasus_Terkonfirmasi',
    'Kasus_Sembuh_Akumulatif', 'Penambahan_Harian_Kasus_Sembuh',
    'Kasus_Meninggal_Akumulatif', 'Penambahan_Harian_Kasus_Meninggal',
    'Kasus_Aktif_Akumulatif', 'CFR_Harian'
]

# Coerce every feature column to a numeric dtype. Anything that cannot be
# parsed as a number (e.g. '#DIV/0!', 'N/A', '') becomes NaN, so the columns do
# not stay as strings and unlisted junk tokens cannot crash the scaler.
# +/-inf is also mapped to NaN because StandardScaler rejects infinite values.
features_numeric = (
    data[columns_to_use]
    .apply(pd.to_numeric, errors='coerce')
    .replace([np.inf, -np.inf], np.nan)
)

# Write the cleaned values back so `data` keeps NaN in place of invalid
# entries, as it did before.
data[columns_to_use] = features_numeric

# Drop rows with NaN values in the relevant columns. The explicit .copy()
# makes data_cleaned an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning.
data_cleaned = features_numeric.dropna().copy()

# Coercion can discard rows silently, so report how many were lost.
rows_dropped = len(features_numeric) - len(data_cleaned)
print(f"Dropped {rows_dropped} of {len(features_numeric)} rows with missing or invalid feature values.")

# Standardize the data
scaler = StandardScaler()
try:
    data_scaled = scaler.fit_transform(data_cleaned)
except ValueError as e:
    # Raised by scikit-learn's input validation, e.g. when no rows remain.
    print(f"Error during scaling: {e}")
    print("Please verify the input data for remaining invalid values.")
    raise


In [5]:
# Step 3: Clustering
# Partition the standardized rows into four clusters.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and emits a FutureWarning in 1.2/1.3 when it is left
# unset), so relying on the default makes the result depend on the installed
# version. random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(data_scaled)

# .assign returns a new frame rather than writing into data_cleaned in place,
# which avoids SettingWithCopyWarning when data_cleaned came from a slice and
# keeps this cell safe to re-run.
data_cleaned = data_cleaned.assign(Cluster=cluster_labels)

In [6]:
# Map clusters to zones
# KMeans label integers are arbitrary: label 0 is not "the worst" cluster, and
# the numbering can change with the data or the seed. A fixed {0: 'Hitam', ...}
# mapping therefore attaches severity names at random. Instead, rank the
# clusters by a severity measure and hand out the zone names in that order.
#
# NOTE(review): zone names are assumed to run from most to least severe
# (Hitam > Merah > Kuning > Hijau), and mean active cases is used as the
# severity proxy. Confirm both against the study's definition.
zone_labels_by_severity = ['Hitam', 'Merah', 'Kuning', 'Hijau']
severity_column = 'Kasus_Aktif_Akumulatif'

# Mean of the severity column per cluster, worst first. to_numeric guards
# against the column still being stored as strings.
cluster_severity = (
    pd.to_numeric(data_cleaned[severity_column], errors='coerce')
    .groupby(data_cleaned['Cluster'])
    .mean()
    .sort_values(ascending=False)
)

# Cluster id -> zone name; zip stops at the shorter sequence, so this still
# works if fewer than four clusters are present.
zone_mapping = {
    int(cluster_id): zone
    for cluster_id, zone in zip(cluster_severity.index, zone_labels_by_severity)
}

# .assign returns a new frame, which avoids SettingWithCopyWarning.
data_cleaned = data_cleaned.assign(Zone=data_cleaned['Cluster'].map(zone_mapping))

In [7]:
# Save clustered data
# Write data_cleaned (the feature columns plus the Cluster and Zone columns
# added above) to CSV; index=False leaves the DataFrame index out of the file.
data_cleaned.to_csv(
    path_or_buf='final_clustered_data.csv',
    index=False,
)
print("Final clustered data saved as 'final_clustered_data.csv'.")

Final clustered data saved as 'final_clustered_data.csv'.
