In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

def cluster_locations(df, max_cluster_size=250):
    """Greedily assign every row (plant) of ``df`` to an integer cluster.

    Counties are visited from the least to the most frequent ``CNTYNAME``
    value. Each plant of the current county goes into the smallest existing
    cluster that (a) holds fewer than ``max_cluster_size`` plants and (b) does
    not already contain a plant from the same county. Size ties go to the
    cluster whose earliest member row comes first in ``df``. When no cluster
    qualifies, a new cluster with the next unused id is opened. As a result,
    no cluster ever receives two plants with the same ``CNTYNAME`` value.

    Rows with a missing ``CNTYNAME`` are never visited by the county loop
    (``value_counts`` and ``groupby`` both drop them); each such row is given
    its own new cluster afterwards, in row order.

    All bookkeeping is done by row *position*, so the result does not depend
    on the index labels being unique, and cluster sizes / county membership
    are tracked incrementally instead of rescanning the frame for every plant.

    Args:
        df: DataFrame with a ``CNTYNAME`` column. It is modified in place: a
            ``cluster`` column is added (or overwritten).
        max_cluster_size: Maximum number of plants a cluster may hold before
            it stops accepting new members.

    Returns:
        The same ``df`` object, now carrying the integer ``cluster`` column.

    Raises:
        ValueError: If ``df`` has no ``CNTYNAME`` column.
    """
    print("Starting clustering process...")

    # Ensure CNTYNAME column exists
    if 'CNTYNAME' not in df.columns:
        raise ValueError("DataFrame must contain a 'CNTYNAME' column")

    # One label per row position; -1 marks "not assigned yet".
    labels = np.full(len(df), -1, dtype=np.int64)
    current_cluster = 0

    # Incremental per-cluster state, keyed by cluster id.
    cluster_sizes = {}       # number of plants in the cluster
    cluster_counties = {}    # set of counties already present in the cluster
    cluster_first_row = {}   # smallest row position among the cluster's members

    # Sort counties by frequency (ascending) to handle rarer counties first
    county_counts = df['CNTYNAME'].value_counts().sort_values()

    # county -> array of row positions (ascending); missing counties are dropped.
    rows_by_county = df.groupby('CNTYNAME', sort=False).indices

    for county in county_counts.index:
        for row in rows_by_county.get(county, ()):
            row = int(row)

            # Clusters with spare capacity that do not contain this county yet.
            valid_clusters = [
                cluster
                for cluster, size in cluster_sizes.items()
                if size < max_cluster_size and county not in cluster_counties[cluster]
            ]

            if valid_clusters:
                # Smallest cluster wins; among equally small ones, the cluster
                # that appears first when scanning the frame top to bottom.
                chosen = min(
                    valid_clusters,
                    key=lambda c: (cluster_sizes[c], cluster_first_row[c]),
                )
                cluster_sizes[chosen] += 1
                cluster_counties[chosen].add(county)
                cluster_first_row[chosen] = min(cluster_first_row[chosen], row)
            else:
                # Create a new cluster
                chosen = current_cluster
                current_cluster += 1
                cluster_sizes[chosen] = 1
                cluster_counties[chosen] = {county}
                cluster_first_row[chosen] = row

            labels[row] = chosen

    print("Clustering process completed.")

    # Anything still at -1 was skipped above (e.g. missing CNTYNAME).
    unclustered = np.flatnonzero(labels == -1)
    if len(unclustered) > 0:
        print(f"Warning: {len(unclustered)} plants were not assigned to any cluster.")
        print("Assigning these plants to new clusters...")
        # Each leftover plant gets its own fresh cluster id, in row order.
        labels[unclustered] = current_cluster + np.arange(len(unclustered))
        print("All plants have been assigned to clusters.")
    else:
        print("Verified: All plants have been assigned to clusters.")

    df['cluster'] = labels
    return df


# Usage
# Read the plant table, cluster it, then print a short summary of the result.
print("Loading data...")
df = pd.read_csv('/content/gen_loc.csv')
print("Data loaded. Starting clustering...")
result_df = cluster_locations(df, max_cluster_size=250)

# Preview of the clustered frame followed by its dimensions.
print("\nFirst few rows of the result:", result_df.head(), sep="\n")
print(f"\nShape of the result: {result_df.shape}")

# Plants per cluster, ordered by cluster id, then the number of distinct clusters.
print(
    "\nCluster distribution:",
    result_df['cluster'].value_counts().sort_index(),
    sep="\n",
)
print("\nNumber of clusters:", result_df['cluster'].nunique())
print("\nProcess completed.")

Loading data...
Data loaded. Starting clustering...
Starting clustering process...
Clustering process completed.
Verified: All plants have been assigned to clusters.

First few rows of the result:
   ORISPL FUELG1                           PNAME   CNTYNAME        LAT  \
0   60680    SUN                ANAD Solar Array    Calhoun  33.626728   
1   61924    SUN  Cumblerland Land Holdings, LLC  Limestone  34.949740   
2   60679    SUN         Fort Rucker Solar Array       Dale  31.331148   
3   60583    SUN            LaFayette Solar Farm   Chambers  32.876313   
4   62462    SUN                   Muscle Shoals    Colbert  34.769730   

         LON  cluster  
0 -85.969481       12  
1 -86.868053        6  
2 -85.730190        0  
3 -85.388210        0  
4 -87.904540        1  

Shape of the result: (6344, 7)

Cluster distribution:
cluster
0      250
1      250
2      250
3      250
4      250
      ... 
158      2
159      2
160      1
161      1
162      1
Name: count, Length: 163, dtype: int64

In [None]:
# Write the clustered table to 'result.csv' (relative path). The row index is
# written too, as the first column of the file.
result_df.to_csv('result.csv', index=True)

In [None]:
import os

# Print the kernel's current working directory, which is where relative
# paths are resolved.
working_dir = os.getcwd()
print(working_dir)

/content
