# 1. Representative Locations

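For each country's raw bloom dataset, this notebook clusters the unique observation sites by latitude/longitude with k-means (at most five clusters per country), picks one representative location per cluster, and writes the cluster assignments, the representative locations, and each representative location's observations to `../data/interim/representative_locations/`.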

In [1]:
import os

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
def get_representative_locations(file_path, output_name):
    """Cluster the sites in a CSV by coordinates and pick one representative per cluster.

    The CSV must contain the columns ``location``, ``lat`` and ``long``.
    Unique (lat, long) points are standardized and grouped with k-means into
    at most five clusters; within each cluster the representative is the
    point whose ``location`` has the most rows in the file.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read.
    output_name : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    clusters : pandas.DataFrame
        One row per unique (lat, long) point with the columns
        ``cluster``, ``location``, ``lat``, ``long``.
    representative_locations : pandas.DataFrame
        One row per cluster, ordered by cluster label, with the columns
        ``cluster``, ``location``, ``lat``, ``long``, ``num_obs``.
    location_data : list of pandas.DataFrame
        For each representative (same order), every row of the file whose
        ``location`` matches it.
    """
    # Read the CSV file
    data = pd.read_csv(file_path)

    # Calculate the count of observations for each location
    num_obs_per_location = data.groupby('location').size().reset_index(name='num_obs')

    # Remove duplicate points (copy so the 'cluster' column can be added safely)
    data_no_duplicates = data.drop_duplicates(subset=['lat', 'long']).copy()

    # Feature Scaling
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data_no_duplicates[['lat', 'long']])

    # Clustering: never ask for more clusters than there are distinct points
    num_clusters = min(len(data_no_duplicates), 5)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    data_no_duplicates['cluster'] = kmeans.fit_predict(scaled_data)

    clusters = data_no_duplicates[['cluster', 'location', 'lat', 'long']]

    # Attach the observation count to every candidate point. The merge is
    # many-to-one (location names are unique on the right), so no rows are added.
    candidates = clusters.merge(num_obs_per_location, on='location', how='left')
    # Rows without a location name have no count; treat them as zero observations
    # so they are never preferred over a named location.
    candidates['num_obs'] = candidates['num_obs'].fillna(0).astype(int)

    # Select the representative of each cluster: the actual row with the most
    # observations. (A column-wise max over the cluster would mix the largest
    # name, latitude and longitude into a row that may match no real site.)
    best_rows = candidates.groupby('cluster')['num_obs'].idxmax()
    representative_locations = candidates.loc[best_rows].reset_index(drop=True)

    # Get bloom data for each representative location
    location_data = [
        data[data['location'] == name]
        for name in representative_locations['location']
    ]

    return clusters, representative_locations, location_data

In [3]:
# Output folders, relative to the notebook's working directory.
CLUSTERS_FOLDER = "../data/interim/representative_locations/clusters/"
LOCATIONS_FOLDER = "../data/interim/representative_locations/location/"
VALIDATION_FOLDER = "../data/interim/representative_locations/validation/"

# Raw inputs live under the same data root as the outputs. Keeping them
# relative (instead of an absolute path into one user's home directory)
# lets the notebook run from any checkout of the repository.
RAW_FOLDER = "../data/raw/"

# Country key -> raw CSV to cluster. The key is also used as the prefix of
# every output file written for that country.
location_dict = {
    "korea": f"{RAW_FOLDER}south_korea.csv",
    "japan": f"{RAW_FOLDER}japan.csv",
    "switzerland": f"{RAW_FOLDER}meteoswiss.csv",
    "usa": f"{RAW_FOLDER}usa_formatted.csv",
}

In [4]:
# DataFrame.to_csv does not create missing directories, so make sure every
# output folder exists before anything is written.
for folder in (CLUSTERS_FOLDER, LOCATIONS_FOLDER, VALIDATION_FOLDER):
    os.makedirs(folder, exist_ok=True)

# For each file, find its representative locations
for location, file_path in location_dict.items():
    clusters, rep_locations, loc_data = get_representative_locations(file_path, location)

    # Output the cluster data
    clusters.to_csv(f"{CLUSTERS_FOLDER}{location}_clusters.csv", index=False)

    # Output the representative location lists
    rep_locations.to_csv(f"{LOCATIONS_FOLDER}{location}_representative_locations.csv", index=False)

    # Output location-specific validation data, numbered from 1 in the same
    # order as the rows of the representative-locations file
    for file_number, location_df in enumerate(loc_data, start=1):
        location_df.to_csv(f"{VALIDATION_FOLDER}{location}_{file_number}.csv", index=False)