In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

# 1. LOAD DATA
# Read the 'Students' worksheet of the workbook into a DataFrame.
df = pd.read_excel(io="STUDENTDATASET.xlsx", sheet_name='Students')
# Preview of the first rows; the result is not assigned or printed here.
df.head()

# Names of the coordinate columns, spelled as they appear in the dataset.
# Every later lookup goes through these two constants.
LAT, LON = 'Latitude', 'Longitude'

def run_capacitated_clustering(student_df, school_id, bus_capacity=30):
    """Group one school's students into buses of at most ``bus_capacity``.

    Students whose ``SchoolID`` equals ``school_id`` are clustered on their
    (LAT, LON) coordinates with K-Means, using ``ceil(n / bus_capacity)``
    clusters. Any cluster that ends up over capacity is then trimmed: its
    student furthest from the cluster centroid is moved to the nearest
    centroid among the buses that still have a free seat, and this repeats
    until no bus is over capacity. Centroids are the original K-Means
    centres; they are not recomputed after students are moved.

    Args:
        student_df: DataFrame with a ``SchoolID`` column and the coordinate
            columns named by the module-level ``LAT`` and ``LON`` constants.
        school_id: Value of ``SchoolID`` selecting the students to cluster.
        bus_capacity: Maximum number of students per bus (must be >= 1).

    Returns:
        A copy of the selected rows with an added ``Assigned_Bus_ID`` column
        holding the 0-based bus index within this school. If no row matches
        ``school_id`` the empty selection is returned without that column.

    Raises:
        ValueError: If ``bus_capacity`` is smaller than 1.
    """
    if bus_capacity < 1:
        raise ValueError("bus_capacity must be at least 1")

    school_students = student_df[student_df['SchoolID'] == school_id].copy()

    if school_students.empty:
        return school_students

    coords = school_students[[LAT, LON]].to_numpy(dtype=float)

    # Number of buses needed so that total seats >= number of students.
    n_clusters = int(np.ceil(len(school_students) / bus_capacity))

    # Initial, capacity-unaware grouping.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(coords)
    centroids = kmeans.cluster_centers_

    # Work on positional arrays rather than index labels so the refinement
    # is unaffected by duplicate or non-default DataFrame indexes.
    # minlength keeps buses that K-Means left empty in the tally (count 0),
    # so they remain available as destinations.
    counts = np.bincount(labels, minlength=n_clusters)

    # REFINEMENT: move students out of over-capacity buses.
    while True:
        overloaded = np.flatnonzero(counts > bus_capacity)
        if overloaded.size == 0:
            break  # every bus holds <= bus_capacity students

        # Handle the most crowded buses first; one student per bus per pass.
        overloaded = overloaded[np.argsort(-counts[overloaded], kind='stable')]

        for bus_id in overloaded:
            # Recomputed for every move so a bus that has just been filled
            # is never chosen again. It cannot be empty here: the seats
            # (n_clusters * bus_capacity) cover all students, so while one
            # bus is over capacity another must be under it.
            underloaded = np.flatnonzero(counts < bus_capacity)

            # Student of this bus lying furthest from the bus centroid.
            members = np.flatnonzero(labels == bus_id)
            distances = cdist(coords[members], centroids[[bus_id]]).ravel()
            furthest = members[np.argmax(distances)]

            # Closest centroid among the buses with a free seat.
            dist_to_others = cdist(coords[[furthest]], centroids[underloaded]).ravel()
            new_bus_id = underloaded[np.argmin(dist_to_others)]

            labels[furthest] = new_bus_id
            counts[bus_id] -= 1
            counts[new_bus_id] += 1

    school_students['Assigned_Bus_ID'] = labels
    return school_students

# 2. EXECUTE FOR ALL SCHOOLS
# Each school is clustered on its own; the per-school frames are collected
# and stitched together afterwards.
all_results = []
for sid in df['SchoolID'].unique():
    print(f"Clustering students for School ID: {sid}...")
    all_results.append(run_capacitated_clustering(df, sid))

# 3. SAVE AND VISUALIZE
final_df = pd.concat(all_results)
final_df.to_csv('Hafilaty_Clustered_Students.csv', index=False)
print("\nSuccess! Results saved to 'Hafilaty_Clustered_Students.csv'")

# Plotting the results
# Assigned_Bus_ID restarts at 0 for every school, so a bus is identified only
# by the (SchoolID, Assigned_Bus_ID) pair. Grouping on both keeps buses of
# different schools as separate series instead of merging them under one
# shared bus number.
plt.figure(figsize=(10, 6))
for (school, bus), bus_data in final_df.groupby(['SchoolID', 'Assigned_Bus_ID'], sort=False):
    plt.scatter(bus_data[LON], bus_data[LAT], label=f'School {school} - Bus {bus}')

plt.title('Hafilaty AI: Student Grouping (30 Max per Bus)')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()
plt.show()