# Functional Clustering for Drivers

## Importing

In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler

from skfda.ml.clustering import KMeans
from skfda.representation.grid import FDataGrid

import salishsea_tools.viz_tools as sa_vi

# Silence warnings for the whole session (mainly the "Mean of empty slice"
# RuntimeWarning raised by np.nanmean on all-NaN cells).
# The standard-library module is used directly: `np.warnings` was only an
# accidental alias of it and is not available in recent NumPy releases.
import warnings

warnings.filterwarnings('ignore') # For the nan mean warning


## Drivers Preparation

In [2]:
def drivers_preparation(dataset,dataset2):
    """Turn the three driver fields into scaled functional data.

    The drivers ('Summation_of_solar_radiation', 'Mean_wind_speed' and
    'Mean_air_temperature') are read from ``dataset2``; the 'Diatom' field of
    ``dataset`` is only used to find the grid cells that contain no NaN.

    Steps: flatten everything after the leading (time) axis, drop every
    29 February, split into one chunk per year, NaN-average the chunks,
    keep the valid cells, min-max scale each driver and wrap the result in
    an ``FDataGrid``.

    Returns
    -------
    (inputs2, indx)
        ``inputs2`` is the ``FDataGrid`` built from the scaled drivers and
        ``indx`` is the ``np.where`` tuple with the flat positions of the
        cells that were kept.
    """
    driver_names = ('Summation_of_solar_radiation', 'Mean_wind_speed', 'Mean_air_temperature')

    def _flatten(field):
        # Keep the leading axis, collapse all remaining axes into one
        values = field.to_numpy()
        return values.reshape(*values.shape[:1], -1)

    timestamps = dataset2.time_counter
    leap_days = np.where((timestamps.dt.month==2) & (timestamps.dt.day==29))

    inputs = np.stack([_flatten(dataset2[key]) for key in driver_names])

    # The targets are reshaped with the leading size of the first driver
    leading = dataset2['Summation_of_solar_radiation'].to_numpy().shape[:1]
    targets = dataset['Diatom'].to_numpy().reshape(*leading, -1)

    # Dropping every 29th of February
    inputs = np.delete(inputs, leap_days, axis=1)
    targets = np.delete(targets, leap_days, axis=0)

    # One chunk per calendar year
    n_years_inputs = len(np.unique(dataset2.time_counter.dt.year))
    n_years_targets = len(np.unique(dataset.time_counter.dt.year))
    inputs = np.split(inputs, n_years_inputs, axis=1)
    targets = np.split(targets, n_years_targets, axis=0)

    # NaN-aware average over the yearly chunks
    inputs = np.nanmean(inputs, axis=0)
    targets = np.nanmean(targets, axis=0)

    # Flat x / y coordinate of every grid cell (y-major ordering)
    x = np.tile(dataset2.x, len(dataset2.y))
    y = np.tile(np.repeat(dataset2.y, len(dataset2.x)), 1)

    # Cells without NaN in the targets that also pass the x / y thresholds
    # (original note: target goes down to 100m)
    keep = (~np.isnan(targets).any(axis=0)) & (x>10) & ((x>100) | (y<880))
    indx = np.where(keep)
    inputs = inputs[:, :, indx[0]]

    # Min-max scaling, one column per driver
    n_vars, n_steps, n_cells = inputs.shape
    table = inputs.reshape(n_vars, n_steps*n_cells).T
    scaler_inputs = make_column_transformer((MinMaxScaler(), [0,1,2]))
    table = scaler_inputs.fit_transform(table).T
    inputs = table.reshape(n_vars, n_steps, n_cells)

    # Layout used for the functional clustering: (cell, step, driver)
    inputs = np.transpose(inputs, axes=(2,1,0))
    grid_points = np.arange(inputs.shape[1])
    inputs2 = FDataGrid(inputs, grid_points)

    return (inputs2, indx)
    

## Targets Preparation

In [3]:
def targets_preparation(dataset, name):
    """Turn the field ``dataset[name]`` into functional data.

    The field is flattened after its leading (time) axis, every 29 February
    is dropped, the series is split into one chunk per year and the chunks
    are NaN-averaged. Only cells without NaN that pass the x / y thresholds
    are kept.

    Returns
    -------
    (targets2, indx)
        ``targets2`` is the ``FDataGrid`` with one curve per kept cell and
        ``indx`` is the ``np.where`` tuple with the flat positions of those
        cells.
    """
    timestamps = dataset.time_counter
    leap_days = np.where((timestamps.dt.month==2) & (timestamps.dt.day==29))

    # Keep the leading axis, collapse all remaining axes into one
    values = dataset[name].to_numpy()
    targets = values.reshape(*values.shape[:1], -1)

    # Dropping every 29th of February
    targets = np.delete(targets, leap_days, axis=0)

    # One chunk per calendar year, then the NaN-aware average over the chunks
    n_years = len(np.unique(dataset.time_counter.dt.year))
    targets = np.nanmean(np.split(targets, n_years, axis=0), axis=0)

    # Flat x / y coordinate of every grid cell (y-major ordering)
    x = np.tile(dataset.x, len(dataset.y))
    y = np.tile(np.repeat(dataset.y, len(dataset.x)), 1)

    keep = (~np.isnan(targets).any(axis=0)) & (x>10) & ((x>100) | (y<880))
    indx = np.where(keep)
    targets = targets[:, indx[0]]

    # Layout used for the functional clustering: one row per cell
    targets = targets.T
    grid_points = np.arange(targets.shape[1])
    targets2 = FDataGrid(targets, grid_points)

    return (targets2, indx)


## Plotting (All Years)

In [4]:
def plotting_all(name,clusters,unique,cluster_mean,counts,ind_cluster):
    """Show the all-years clustering: summary table, map and cluster curves.

    name         : 'drivers' selects the three-variable table and the legend;
                   any other value is treated as a single-variable target.
    clusters     : 2-D map of cluster labels, drawn with its ``plot`` method.
    unique       : cluster labels (0-based); displayed as ``unique + 1``.
    cluster_mean : mean value(s) per cluster, in display order.
    counts       : number of cells per cluster.
    ind_cluster  : curves of the clusters; entries 0..5 are plotted.
    """
    driver_labels = ['Summation of solar radiation', 'Mean wind speed', 'Mean Temperature']
    is_drivers = name == 'drivers'

    # Summary table: one row per cluster before the final transpose
    if is_drivers:
        table = np.vstack((counts, cluster_mean.transpose())).reshape(4, len(unique))
        columns = ['counts'] + driver_labels
    else:
        table = np.concatenate((counts, cluster_mean)).reshape(2, len(unique))
        columns = ['counts', 'mean']
    summary = pd.DataFrame(table.transpose(), columns=columns, index=unique+1)
    summary.index.name = 'Cluster'
    summary['counts'] = summary['counts'].astype('Int64')

    display(summary.transpose())

    # Map of the clusters
    fig, ax = plt.subplots(figsize =(5,9))
    cmap = plt.get_cmap('tab20', unique.max()+1)
    cmap.set_bad('gray')
    mesh = clusters.plot(ax=ax, cmap=cmap, vmin = unique.min(), vmax = unique.max()+1, add_colorbar=False)
    cbar = fig.colorbar(mesh, ticks = unique+0.5)
    cbar.set_ticklabels(unique+1)
    cbar.set_label('Clusters [count]')
    ax.set_title('Functional Clustering for '+ name + ' (2007-2024)')
    sa_vi.set_aspect(ax)
    plt.show()

    # One panel per cluster, filled row by row on the 3 x 2 grid
    fig, axs = plt.subplots(3, 2, figsize=(10, 15), layout='constrained')
    for k, panel in enumerate(axs.flat):
        panel.plot(ind_cluster[k])
        panel.set_title('Cluster ' + str(k+1))

    if is_drivers:
        fig.legend(driver_labels, bbox_to_anchor=(1, 1), loc='center left')

    plt.show()
    

## Clustering

In [5]:
def clustering(dataset,quant,indx,name):
    """Cluster functional data into six groups, ordered by their mean value.

    Parameters
    ----------
    dataset : object with ``x`` and ``y``
        Only ``len(dataset.y)`` and ``len(dataset.x)`` are used, to size the
        output map.
    quant : functional data
        Passed unchanged to ``KMeans.fit_predict``.
    indx : tuple
        ``indx[0]`` holds the flat positions (in the ``y`` by ``x`` grid) of
        the clustered samples.
    name : str
        With ``'drivers'`` the clusters are ordered by the mean of the first
        variable of each centre; otherwise by the mean of the (single) curve.

    Returns
    -------
    (clusters2, unique, cluster_mean, counts, ind_cluster)
        ``clusters2``   : ``xr.DataArray`` with dims ``('y', 'x')`` holding
                          the sorted labels (0 = smallest mean), NaN elsewhere.
        ``unique``      : sorted labels that actually occur.
        ``cluster_mean``: centre means, in sorted order.
        ``counts``      : number of samples per occurring label.
        ``ind_cluster`` : centre curves (``data_matrix``), in sorted order.
    """
    # Training
    kmeans = KMeans(n_clusters=6)
    kmeans.fit_predict(quant)
    labels = np.asarray(kmeans.labels_, dtype=int)

    centers = kmeans.cluster_centers_.data_matrix

    # Mean of each cluster centre along the curve axis
    if name == 'drivers':
        cluster_mean_all = np.mean(centers, axis=1)
        sort_key = cluster_mean_all[:, 0]  # Sorted based on the first input
    else:
        cluster_mean_all = np.squeeze(np.mean(centers, axis=1))
        sort_key = cluster_mean_all

    # `order` lists the original labels from smallest to largest mean;
    # `ranks[j]` is the new (sorted) label of original cluster j.
    order = np.argsort(sort_key)
    ranks = np.argsort(order)

    # Relabelling with one lookup remaps every label, even when a cluster
    # ends up empty (a loop over the number of observed labels would skip
    # the highest label numbers in that case).
    clusters = ranks[labels]

    unique, counts = np.unique(clusters, return_counts=True)

    # Creating the map: NaN everywhere except at the clustered cells
    n_y, n_x = len(dataset.y), len(dataset.x)
    flat_map = np.full(n_y * n_x, np.nan)
    flat_map[indx[0]] = clusters
    clusters2 = xr.DataArray(flat_map.reshape(n_y, n_x), dims = ['y','x'])

    # Centres and their means, reordered from smallest to largest mean
    ind_cluster = centers[order]
    cluster_mean = cluster_mean_all[order]

    return (clusters2, unique, cluster_mean, counts, ind_cluster)


## Plotting (Clusters)

In [6]:
def plotting_clusters(name,years,unique,clusters_mean,clusters_indiv,counts):
    """For every cluster, show a per-year table and a 5 x 4 grid of curves.

    name           : 'drivers' selects the three-variable layout and legend;
                     any other value is treated as a single-variable target.
    years          : year labels; one panel each, filled row by row.
    unique         : 0-based cluster labels to iterate over.
    clusters_mean  : means indexed [year, cluster(, variable)]; the table
                     index is ``years`` plus a final '2007-2024' entry.
    clusters_indiv : curves, indexed [year, step, cluster, variable] for
                     drivers and [step, year, cluster] otherwise; the last
                     year entry is drawn in the bottom-right panel.
    counts         : counts indexed [year, cluster].
    """
    driver_labels = ['Summation of solar radiation', 'Mean wind speed', 'Mean Temperature']
    is_drivers = name == 'drivers'
    years2 = np.append(years,'2007-2024')

    for i in unique:

        # Summary table of this cluster, one row per year before the transpose
        if is_drivers:
            table = np.vstack((counts[:,i], clusters_mean[:,i,:].transpose())).reshape(4, len(years2))
            columns = ['counts'] + driver_labels
        else:
            table = np.concatenate((counts[:,i], clusters_mean[:,i])).reshape(2, len(years2))
            columns = ['counts', 'mean']
        summary = pd.DataFrame(table.transpose(), columns=columns, index=years2)

        summary.index.name = 'Year'
        print ('Cluster '+ str(i+1))
        summary['counts'] = summary['counts'].astype('Int64')
        display(summary.transpose())

        def _curve(j):
            # Curve(s) of cluster i for entry j of the year axis
            return clusters_indiv[j,:,i,:] if is_drivers else clusters_indiv[:,j,i]

        # Common y-limits for all panels of this cluster
        everything = clusters_indiv[:,:,i,:] if is_drivers else clusters_indiv[:,:,i]
        lowest, highest = np.min(everything), np.max(everything)
        limits = [lowest - 0.05*lowest, highest + 0.05*highest]

        fig, ax = plt.subplots(5, 4, figsize=(10, 15), layout='constrained')

        for j, year in enumerate(years):
            panel = ax[divmod(j, 4)]  # (row, column) when filling row by row
            panel.plot(_curve(j))
            panel.set_ylim(limits)
            panel.set_title(str(year))

        ax[4,2].axis('off')

        # Last entry of the year axis goes to the bottom-right panel
        ax[4,3].plot(_curve(-1))
        ax[4,3].set_ylim(limits)
        if is_drivers:
            fig.legend(driver_labels, bbox_to_anchor=(1, 1), loc='center left')
        ax[4,3].set_title('2007-2024')
        fig.suptitle('Cluster '+ str(i+1))

        plt.show()


## Plotting (Maps)

In [7]:
def plotting_maps(name,years,unique,clusters):
    """Draw one cluster map per year on a 5 x 4 grid.

    name     : text appended to the figure title.
    years    : year labels; ``clusters[j]`` is drawn for ``years[j]``,
               filling the grid row by row.
    unique   : 0-based cluster labels; the colour bar shows ``unique + 1``.
    clusters : maps indexable by position; the last one is drawn in the
               bottom-right panel under the title '2007-2024'.
    """
    fig, ax = plt.subplots(5, 4, figsize=(10, 15))

    cmap = plt.get_cmap('tab20', unique.max()+1)
    cmap.set_bad('gray')

    def _draw_map(field, panel, label):
        # One map with its own colour bar, title and aspect ratio
        mesh = field.plot(ax=panel, cmap=cmap, vmin = unique.min(), vmax = unique.max()+1, add_colorbar=False)
        cbar = fig.colorbar(mesh, ticks=unique+0.5, fraction=0.08, pad=0.08)
        cbar.set_ticklabels(unique+1)
        panel.set_title(label)
        sa_vi.set_aspect(panel)

    for j, year in enumerate(years):
        _draw_map(clusters[j], ax[divmod(j, 4)], str(year))

    ax[4,2].axis('off')

    _draw_map(clusters[-1], ax[4,3], '2007-2024')

    fig.tight_layout(rect=[0, 0, 1, 0.97])
    fig.suptitle('Functional Clustering for ' + str(name))

    plt.show()


## Files Reading

In [8]:
# First file: subsampled with a step of 5 along y and x
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')
ds = ds.isel(y=np.arange(ds.y[0], ds.y[-1], 5), x=np.arange(ds.x[0], ds.x[-1], 5))

# Second file: same subsampling
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')
ds2 = ds2.isel(y=np.arange(ds2.y[0], ds2.y[-1], 5), x=np.arange(ds2.x[0], ds2.x[-1], 5))

# Period used for the all-years clustering
dataset = ds.sel(time_counter=slice('2007', '2024'))
dataset2 = ds2.sel(time_counter=slice('2007', '2024'))

# Years looped over by the yearly cells (taken from the full file `ds`)
years = np.unique(ds.time_counter.dt.year)

# 'drivers' runs the driver cells; any other value runs the target cells
name = 'drivers'

# pd.set_option('display.float_format', '{:.2E}'.format) # Only for the production rates

## Suffix of the variable names used in the cells below:
## 0 variables: Clustering with all years
## 1 variables: Yearly clusters, based on all years clustering
## 2 variables: Yearly clusters, based on yearly clustering


## Drivers (All years & Calculations)

In [None]:
# Drivers: all-years clustering, then per-year statistics in two flavours
# (suffix 1: yearly curves averaged inside the all-years clusters;
#  suffix 2: an independent clustering for every year).
if name == 'drivers':

    # All-years clustering and its plots
    drivers,indx = drivers_preparation(dataset,dataset2)
    clusters0, unique, clusters_mean0, counts0, clusters_indiv0 = clustering(dataset2,drivers,indx,name)
    plotting_all(name,clusters0,unique,clusters_mean0,counts0,clusters_indiv0)

    # Axes: (clusters_indiv0.shape[2], clusters_indiv0.shape[1], year, cluster)
    clusters_indiv1 = np.zeros((clusters_indiv0.shape[2],clusters_indiv0.shape[1],len(years),len(unique))) 

    # Axes: (year,) followed by the axes of clusters_indiv0
    clusters_indiv2 = np.zeros((len(years),clusters_indiv0.shape[0],clusters_indiv0.shape[1],clusters_indiv0.shape[2])) 
    clusters2 =  np.zeros((len(years),clusters0.shape[0],clusters0.shape[1])) 
    counts2 = np.zeros((len(years),len(unique)))
    clusters_mean2 = np.zeros((len(years),len(unique), clusters_mean0.shape[1]))

    for i in range(0, len(years)):

        # NOTE(review): this rebinds the notebook-level `dataset` / `dataset2`
        # (and `drivers`), so after the loop they hold the last year only;
        # re-running this cell without re-running the file-reading cell would
        # feed a single year into the all-years step above - confirm intended.
        dataset = ds.sel(time_counter = slice(str(years[i]), str(years[i])))
        dataset2 = ds2.sel(time_counter = slice(str(years[i]), str(years[i])))
        # The per-year cell selection is discarded; the all-years `indx` is
        # reused below (assumes the same cells are valid every year - TODO confirm)
        drivers, _ = drivers_preparation(dataset,dataset2)

        drivers1 = np.squeeze(drivers.data_matrix).transpose()
        # All-years label of every clustered cell
        clusters1 = np.ravel(clusters0)[indx]

        for j in unique:
            # Yearly curves of the cells of all-years cluster j, averaged over the cells
            temp = xr.where(clusters1==j, drivers1, np.nan)
            clusters_indiv1[:,:,i,j] = np.nanmean(temp,axis=2)

        # Independent clustering of this year
        clusters, _, clusters_mean, counts, clusters_indiv = clustering(dataset,drivers,indx,name) 
        clusters2[i,:,:] = clusters
        clusters_mean2[i,:,:] = clusters_mean 
        counts2[i,:] = counts 
        clusters_indiv2[i,:,:,:] = clusters_indiv 

    # Means of the suffix-1 curves, with the all-years means appended as the last entry
    clusters_mean1 = np.mean(clusters_indiv1,axis=1).transpose(1,2,0)
    clusters_mean1 = np.round(np.append(clusters_mean1,np.expand_dims(clusters_mean0,0),axis=0),3)

    # Year axis first, all-years curves appended last, then axes 1 and 2 swapped
    clusters_indiv1 = clusters_indiv1.transpose(2,3,1,0)
    clusters_indiv1 = np.append(clusters_indiv1,np.expand_dims(clusters_indiv0,0),axis=0).transpose(0,2,1,3)

    # Yearly maps followed by the all-years map along 'years'
    clusters2 = xr.DataArray(clusters2,dims = ['years','y','x'])
    clusters0 = xr.DataArray(clusters0,dims = ['y','x'])
    clusters2 = xr.concat((clusters2,clusters0),'years')

    # All-years values appended as the last entry of the suffix-2 arrays
    clusters_mean2 = np.round(np.append(clusters_mean2,np.expand_dims(clusters_mean0,0),axis=0),3)
    counts2 = np.append(counts2,np.expand_dims(counts0,0),axis=0)

    clusters_indiv2 = np.append(clusters_indiv2,np.expand_dims(clusters_indiv0,0),axis=0).transpose(0,2,1,3)


## Drivers (Individual clusters based on all years clustering)

In [None]:
# Drivers: per-cluster tables and curves based on the all-years clustering
if name == 'drivers':

    print('Individual clusters based on all-years clustering', '\n', sep='\n')

    # The all-years counts are repeated on one row per year plus the final row
    plotting_clusters(
        name,
        years,
        unique,
        clusters_mean1,
        clusters_indiv1,
        np.tile(counts0, (len(years)+1, 1)),
    )
        

## Drivers (Individual clusters based on yearly clustering)

In [None]:
# Drivers: maps, tables and curves based on the yearly clustering
if name == 'drivers':

    print('Individual clusters based on yearly clustering', '\n', sep='\n')

    plotting_maps(name, years, unique, clusters2)
    plotting_clusters(
        name,
        years,
        unique,
        clusters_mean2,
        clusters_indiv2,
        counts2,
    )
    

## Targets (All years & Calculations)

In [12]:
# Targets: all-years clustering, then per-year statistics in two flavours
# (suffix 1: yearly curves averaged inside the all-years clusters;
#  suffix 2: an independent clustering for every year).
if name != 'drivers':

    # All-years clustering and its plots
    targets,indx = targets_preparation(dataset,name)
    clusters0, unique, clusters_mean0, counts0, clusters_indiv0 = clustering(dataset,targets,indx,name)
    plotting_all(name,clusters0,unique,clusters_mean0,counts0,clusters_indiv0)

    # Axes: (clusters_indiv0.shape[1], year, cluster)
    clusters_indiv1 = np.zeros((clusters_indiv0.shape[1],len(years),len(unique)))

    # Allocated like clusters_indiv1, then reordered to (cluster, clusters_indiv0.shape[1], year)
    clusters_indiv2 = np.zeros((clusters_indiv0.shape[1],len(years),len(unique))).transpose(2,0,1) 
    clusters2 =  np.zeros((len(years),clusters0.shape[0],clusters0.shape[1])) 
    counts2 = np.zeros((len(years),len(unique)))
    clusters_mean2 = np.zeros((len(years),len(unique)))

    for i in range(0, len(years)):

        # NOTE(review): this rebinds the notebook-level `dataset` (and `targets`),
        # so after the loop they hold the last year only; re-running this cell
        # without re-running the file-reading cell would feed a single year
        # into the all-years step above - confirm intended.
        dataset = ds.sel(time_counter = slice(str(years[i]), str(years[i])))
        # The per-year cell selection is discarded; the all-years `indx` is
        # reused below (assumes the same cells are valid every year - TODO confirm)
        targets, _ = targets_preparation(dataset,name)

        targets1 = np.squeeze(targets.data_matrix).transpose()
        # All-years label of every clustered cell
        clusters1 = np.ravel(clusters0)[indx]

        for j in unique:
            # Yearly curves of the cells of all-years cluster j, averaged over the cells
            temp = xr.where(clusters1==j, targets1, np.nan)
            clusters_indiv1[:,i,j] = np.nanmean(temp,axis=1)

        # Independent clustering of this year
        clusters, _, clusters_mean, counts, clusters_indiv = clustering(dataset,targets,indx,name) 
        # Drop the length-1 axis 2 of the centre curves
        clusters_indiv = np.squeeze(clusters_indiv,2)
        clusters2[i,:,:] = clusters
        clusters_mean2[i,:] = clusters_mean 
        counts2[i,:] = counts 
        clusters_indiv2[:,:,i] = clusters_indiv 

    # Means of the suffix-1 curves, with the all-years values appended as the last entry
    clusters_mean1 = np.mean(clusters_indiv1,axis=0)
    clusters_mean1 = np.append(clusters_mean1,np.expand_dims(clusters_mean0,0),axis=0)
    clusters_indiv1 = np.append(clusters_indiv1,clusters_indiv0.transpose(1,2,0),axis=1)

    # Yearly maps followed by the all-years map along 'years'
    clusters2 = xr.DataArray(clusters2,dims = ['years','y','x'])
    clusters0 = xr.DataArray(clusters0,dims = ['y','x'])

    clusters2 = xr.concat((clusters2,clusters0),'years')

    # All-years values appended as the last entry of the suffix-2 arrays
    clusters_mean2 = np.append(clusters_mean2,np.expand_dims(clusters_mean0,0),axis=0)
    counts2 = np.append(counts2,np.expand_dims(counts0,0),axis=0)
    # Back to the axis order of clusters_indiv1 before appending the all-years curves
    clusters_indiv2 = clusters_indiv2.transpose(1,2,0)
    clusters_indiv2 = np.append(clusters_indiv2,clusters_indiv0.transpose(1,2,0),axis=1)


## Targets (Individual clusters based on all-years clustering)

In [13]:
# Targets: per-cluster tables and curves based on the all-years clustering
if name != 'drivers':

    print('Individual clusters based on all-years clustering', '\n', sep='\n')

    # The all-years counts are repeated on one row per year plus the final row
    plotting_clusters(
        name,
        years,
        unique,
        clusters_mean1,
        clusters_indiv1,
        np.tile(counts0, (len(years)+1, 1)),
    )


## Targets (Individual clusters based on yearly clustering)

In [14]:
# Targets: maps, tables and curves based on the yearly clustering
if name != 'drivers':

    print('Individual clusters based on yearly clustering', '\n', sep='\n')

    plotting_maps(name, years, unique, clusters2)
    plotting_clusters(
        name,
        years,
        unique,
        clusters_mean2,
        clusters_indiv2,
        counts2,
    )
