In [None]:
from sklearn_extra.cluster import KMedoids
from tslearn.metrics import dtw
from scipy.optimize import curve_fit
from scipy.signal import savgol_filter

In [None]:
def calculate_cosine_sim(vector_months_regions):
    """
    Calculate cosine similarity between consecutive months for each city.
    Works across multiple years without resetting at year boundaries.

    Parameters:
        vector_months_regions: array-like with shape
            (number_region, 2, total_timesteps). Along axis 1, index 0 holds
            the x component and index 1 the y component of each vector.

    Returns:
        vectors_cosine_sim: np.array of floats with shape
            (number_region, total_timesteps - 1). Entry [i, w] is the cosine
            of the angle between the vectors of region i at time steps w and
            w + 1. Pairs in which the product of the two vector lengths is
            zero are reported as 0. With fewer than two time steps the result
            has shape (number_region, 0).

    Raises:
        ValueError: if the input is not 3-D or its second axis is not of
            length 2.
    """
    vectors = np.asarray(vector_months_regions, dtype=float)
    if vectors.ndim != 3 or vectors.shape[1] != 2:
        raise ValueError(
            "Input vector_months_regions must have shape "
            f"(number_region, 2, total_timesteps), but got {vectors.shape}"
        )

    number_region, _, total_timesteps = vectors.shape
    if total_timesteps < 2:
        # No consecutive pair exists, so there is nothing to compare.
        return np.empty((number_region, 0), dtype=float)

    x, y = vectors[:, 0, :], vectors[:, 1, :]
    norms = np.hypot(x, y)

    # Pair every time step w (slice [:-1]) with its successor w + 1 (slice [1:]).
    dot_products = x[:, :-1] * x[:, 1:] + y[:, :-1] * y[:, 1:]
    norm_products = norms[:, :-1] * norms[:, 1:]

    # Start from zeros and divide only where the denominator is non-zero, so
    # pairs involving a zero-length vector keep the value 0.
    vectors_cosine_sim = np.zeros((number_region, total_timesteps - 1), dtype=float)
    np.divide(dot_products, norm_products, out=vectors_cosine_sim, where=norm_products != 0)
    return vectors_cosine_sim


def calculate_dtw_distance_matrix(time_series_data):
    """Build the matrix of pairwise Dynamic Time Warping (DTW) distances.

    Parameters:
        time_series_data: 2-D np.array. Each row is the time series of one
            unit (e.g. city, region); columns are consecutive time points.

    Returns:
        np.array of shape (n_series, n_series). Entry [i, j] and entry [j, i]
        both hold dtw(row i, row j); the diagonal is left at 0.

    Raises:
        ValueError: if time_series_data is not 2-D.
    """
    if time_series_data.ndim != 2:
        raise ValueError(f"Input time_series_data must have shape (n_series, n_timesteps), but got {time_series_data.shape}")

    series_count = time_series_data.shape[0]
    pairwise = np.zeros((series_count, series_count))

    # Indices strictly above the diagonal enumerate every unordered pair once;
    # the computed distance is then mirrored into the lower triangle.
    upper_rows, upper_cols = np.triu_indices(series_count, k=1)
    for a, b in zip(upper_rows, upper_cols):
        first = np.ascontiguousarray(time_series_data[a], dtype=np.float64)
        second = np.ascontiguousarray(time_series_data[b], dtype=np.float64)
        pairwise[a, b] = pairwise[b, a] = dtw(first, second)

    return pairwise



In [None]:
def perform_kmedoids_clustering(distance_matrix, n_clusters):
    """Group regions with K-Medoids using a precomputed distance matrix.

    Parameters:
        distance_matrix: square array of pairwise distances between regions.
        n_clusters: int, number of clusters to form; must not exceed the
            number of rows of distance_matrix.

    Returns:
        (clusters, kmedoids): the cluster label assigned to each region and
        the fitted KMedoids estimator.

    Raises:
        ValueError: if distance_matrix is not square, or if n_clusters is
            larger than the number of data points.
    """
    n_rows = distance_matrix.shape[0]
    n_cols = distance_matrix.shape[1]

    if n_rows != n_cols:
        raise ValueError(f"Distance matrix must be square, but got shape {distance_matrix.shape}")
    if n_clusters > n_rows:
        raise ValueError(f"Number of clusters ({n_clusters}) cannot be greater than the number of data points ({n_rows})")

    # random_state is fixed so repeated runs use the same seed.
    model = KMedoids(
        n_clusters=n_clusters,
        metric="precomputed",
        random_state=42,
        init='k-medoids++',
        max_iter=300,
    )
    labels = model.fit_predict(distance_matrix)  # one cluster label per region
    return labels, model

def calculate_cluster_means(time_series_data, clusters):
    """Average the time series of the regions assigned to each cluster.

    Parameters:
        time_series_data: np.array with shape (number_regions, n_timesteps).
        clusters: 1-D np.array of non-negative integer cluster labels, one
            per row of time_series_data.

    Returns:
        np.array with shape (n_clusters, n_timesteps), where n_clusters is
        the largest label + 1 (0 when clusters is empty). Row c is the mean
        of the rows labelled c. A label with no members yields an all-zero
        row, so row indices always coincide with label values even when the
        labels are not contiguous.

    Raises:
        ValueError: if the input shapes are inconsistent, or if any label is
            negative.
    """
    if time_series_data.ndim != 2 or clusters.ndim != 1 or time_series_data.shape[0] != len(clusters):
        raise ValueError("Invalid input shapes for calculate_cluster_means.")

    n_timesteps = time_series_data.shape[1]
    if len(clusters) == 0:
        return np.zeros((0, n_timesteps))

    labels = np.asarray(clusters)
    if labels.min() < 0:
        # A negative label would silently wrap around to the last rows.
        raise ValueError("Cluster labels must be non-negative integers.")

    # Size by the largest label rather than the number of distinct labels:
    # a missing label (empty cluster) must not push later labels out of range.
    n_clusters = int(labels.max()) + 1

    cluster_means = np.zeros((n_clusters, n_timesteps))
    # Unbuffered add, so rows sharing a label are all accumulated.
    np.add.at(cluster_means, labels, time_series_data)
    cluster_counts = np.bincount(labels, minlength=n_clusters)

    # Normalize to get average trends; empty clusters stay at zero.
    populated = cluster_counts > 0
    cluster_means[populated] /= cluster_counts[populated, np.newaxis]

    return cluster_means

def sinusoidal(t, A, B, C, D):
    """Evaluate the sine model A * sin(B * t + C) + D used for curve fitting.

    A scales the sine term, B multiplies t inside the sine, C is added to the
    sine argument and D is added to the result.
    """
    phase = B * t + C
    return D + A * np.sin(phase)

def fit_sinusoidal(x, y):
    """Fit the `sinusoidal` model to the points (x, y) by non-linear least squares.

    The starting guess is: half the peak-to-peak range of y for the amplitude
    (0.1 when y is essentially flat), 2*pi/12 for the factor multiplying t
    (a period of 12 steps), 0 for the phase and mean(y) for the offset.

    Returns:
        The optimized parameters (A, B, C, D) as returned by curve_fit.
    """
    spread = np.ptp(y)  # peak-to-peak range of the data
    if spread > 1e-9:
        amplitude_start = spread / 2
    else:
        amplitude_start = 0.1

    initial_params = [amplitude_start, 2 * np.pi / 12, 0, np.mean(y)]

    # The covariance estimate returned alongside the parameters is not used.
    fitted_params, _ = curve_fit(sinusoidal, x, y, p0=initial_params, maxfev=10000)
    return fitted_params


def plot_combined_cluster_trends(cluster_means, cluster_colors, save_path_svg=None, savgol_window=7, savgol_order=3):
    """Draw every cluster's mean trend and its sinusoidal fit on a single axes.

    Parameters:
        cluster_means: 2-D array of shape (n_clusters, n_timesteps); row c is
            the trend plotted for cluster c.
        cluster_colors: mapping from row index c to a line color; rows
            without an entry are drawn in gray.
        save_path_svg: optional path; when truthy the figure is saved as SVG.
        savgol_window: Savitzky-Golay window length. Smoothing is applied only
            when it is odd and strictly smaller than the series length.
        savgol_order: polynomial order passed to the Savitzky-Golay filter.

    The sinusoid is fitted to the (possibly smoothed) trend, while the colored
    line shows the unsmoothed trend. The fit is drawn in black. The figure is
    displayed with plt.show().
    """
    n_clusters, n_timesteps = cluster_means.shape
    fig, ax = plt.subplots(figsize=(12, 7))
    months_axis = np.arange(1, n_timesteps + 1)

    for idx in range(n_clusters):
        raw_trend = cluster_means[idx]

        # Errors raised by savgol_filter or by the fit are left to propagate.
        window_is_usable = savgol_window < len(raw_trend) and savgol_window % 2 != 0
        if window_is_usable:
            trend_for_fit = savgol_filter(raw_trend, window_length=savgol_window, polyorder=savgol_order)
        else:
            trend_for_fit = raw_trend

        fit_params = fit_sinusoidal(months_axis, trend_for_fit)

        # Legend labels are 1-based although the color lookup uses the 0-based row.
        ax.plot(
            months_axis,
            raw_trend,
            linestyle="-",
            color=cluster_colors.get(idx, 'gray'),
            linewidth=2.5,
            alpha=0.6,
            label=f"cluster {idx+1}",
        )
        ax.plot(
            months_axis,
            sinusoidal(months_axis, *fit_params),
            linestyle="-",
            color="black",
            linewidth=1.2,
            alpha=0.9,
        )

    ax.set_xlabel("Time Step (e.g., Month)", fontsize=15)
    ax.set_ylabel("Cosine Similarity", fontsize=15)
    ax.tick_params(axis='both', labelsize=12)
    ax.legend(fontsize=12)
    ax.grid(True, axis='y', linestyle='--', alpha=0.6)
    plt.tight_layout()

    if save_path_svg:
        # Errors raised by savefig are left to propagate.
        plt.savefig(save_path_svg, format="svg", dpi=300, bbox_inches='tight')

    plt.show()
    
    
def plot_cluster_map(geopandas_map, clusters, cluster_colors, cluster_id_col='cluster_id', city_num_col='city_number', save_path_jpg=None):
    """Draw the regions of a map, each filled with the color of its cluster.

    Parameters:
        geopandas_map: the shapefile of the regions read using gpd. It is
            modified in place: the columns cluster_id_col and "color" are
            written, and city_num_col is created from the index when missing
            (see Raises).
        clusters: sequence of cluster labels; position i is the label of the
            city whose city_num_col value is i.
        cluster_colors: mapping from cluster label to fill color. Labels
            without an entry are drawn in gray.
        cluster_id_col: name of the column that receives the cluster labels.
        city_num_col: name of the column identifying each city.
        save_path_jpg: optional path; when truthy the figure is saved there.

    Cities whose identifier has no entry in clusters get the label -1.

    Raises:
        KeyError: if city_num_col is not a column, unless city_num_col is
            'city_number' and the index is not named 'city_number' (in which
            case the index is copied into that column).
    """
    if city_num_col not in geopandas_map.columns:
        index_fallback_allowed = city_num_col == 'city_number' and geopandas_map.index.name != city_num_col
        if not index_fallback_allowed:
            raise KeyError(f"City identifier column '{city_num_col}' not found in shapefile.")
        geopandas_map[city_num_col] = geopandas_map.index

    # Position in `clusters` -> cluster label.
    city_to_cluster = {}
    for position in range(len(clusters)):
        city_to_cluster[position] = clusters[position]

    geopandas_map[cluster_id_col] = geopandas_map[city_num_col].map(city_to_cluster)

    # Unmapped cities come back as NaN; mark them with -1 before the int cast.
    if geopandas_map[cluster_id_col].isna().any():
        geopandas_map[cluster_id_col] = geopandas_map[cluster_id_col].fillna(-1)

    # astype(int) may raise ValueError; it is left to propagate.
    geopandas_map[cluster_id_col] = geopandas_map[cluster_id_col].astype(int)

    # Translate cluster labels into fill colors.
    geopandas_map["color"] = geopandas_map[cluster_id_col].map(cluster_colors).fillna('gray')

    fig, ax = plt.subplots(figsize=(10, 8))
    geopandas_map.plot(
        ax=ax,
        color=geopandas_map["color"],
        edgecolor="black",
        linewidth=0.5,
        alpha=0.9,
    )

    ax.set_axis_off()
    plt.title("City Clusters", fontsize=16)
    plt.tight_layout()

    if save_path_jpg:
        # Errors raised by savefig are left to propagate.
        plt.savefig(save_path_jpg, dpi=300, bbox_inches='tight')

    plt.show()