# Daily Clustering

## Importing

In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.cluster import BisectingKMeans
from sklearn_som.som import SOM

from tqdm import tqdm


## Datasets Preparation

In [None]:
def datasets_preparation(coords):
    """Build the labelled, data-less container for the cluster maps.

    Parameters
    ----------
    coords : mapping
        Coordinates for the ``time_counter``, ``y`` and ``x`` dimensions;
        handed to ``xr.DataArray`` unchanged.

    Returns
    -------
    xr.DataArray
        Array with dims ``('time_counter', 'y', 'x')`` carrying the
        description / long_name / units attributes below. No data is passed
        in, so the initial values are whatever xarray uses as its default
        fill for a coords-only construction.
    """
    metadata = {
        "description": "Clusters produced by the k-means algorithm",
        "long_name": "Cluster",
        "units": "count",
    }

    return xr.DataArray(
        coords=coords,
        dims=['time_counter', 'y', 'x'],
        attrs=metadata,
    )


## Clustering

In [None]:
def clustering(dataset, name, n_clusters=6, random_state=None):
    """Cluster one field with k-means and label cells by centroid rank.

    The field ``dataset[name]`` is flattened, NaN cells are set aside, and the
    remaining values are clustered as one-dimensional samples. Cluster labels
    are then renumbered so that 0 is the cluster with the smallest centroid
    and ``n_clusters - 1`` the one with the largest, which makes labels
    comparable between independent runs (e.g. different days).

    Parameters
    ----------
    dataset : mapping of str to array-like
        Anything indexable by ``name`` that yields an array-like field
        (the caller in this file passes a single time step of a dataset
        whose variable has dims ``(y, x)``).
    name : str
        Key of the field to cluster.
    n_clusters : int, optional
        Number of k-means clusters. Defaults to 6, the value that was
        previously hard-coded.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``KMeans``. ``None`` (default) keeps the original
        unseeded behaviour; pass an int for reproducible results.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``dataset[name]``. Valid cells
        hold the centroid rank of their cluster; cells that were NaN in the
        input stay NaN. If the field contains no valid value at all, an
        all-NaN array is returned without running k-means.

    Notes
    -----
    NOTE(review): with some, but fewer than ``n_clusters``, valid samples
    ``KMeans.fit`` is expected to raise -- confirm against the scikit-learn
    version in use if such days can occur.
    """
    field = np.asarray(dataset[name])
    flat = field.ravel()

    # Pre processing: k-means cannot digest NaN, so only valid cells are used.
    valid = ~np.isnan(flat)

    # Post-processing buffer; cells that never get a label remain NaN.
    ranked = np.full(flat.size, np.nan)

    # Nothing to cluster (e.g. a fully masked field): hand back the NaN map
    # instead of letting KMeans fail on an empty sample array.
    if not valid.any():
        return ranked.reshape(field.shape)

    samples = flat[valid].reshape(-1, 1)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(samples)

    # The first argsort orders the labels by centroid value; the second turns
    # that ordering into "rank of each label", so rank_of_label[j] is the new
    # label of original cluster j.
    rank_of_label = np.argsort(np.argsort(kmeans.cluster_centers_[:, 0]))

    # One vectorised lookup relabels every sample, and it stays correct even
    # when some cluster ends up with no members.
    ranked[valid] = rank_of_label[kmeans.labels_]

    return ranked.reshape(field.shape)
    

## File Creation

In [None]:
def file_creation(variable, name,
                  path='/data/ibougoudis/MOAD/files/feb_apr_daily_clustering.nc'):
    """Write ``variable`` to a netCDF file as a compressed variable ``name``.

    Parameters
    ----------
    variable : xr.DataArray
        Array to store; it is converted with ``to_dataset(name=name)``.
    name : str
        Variable name used in the file and as the key of the encoding dict.
    path : str, optional
        Destination file. Defaults to the location that was previously
        hard-coded, so existing calls behave as before.

    Notes
    -----
    The file is opened with ``mode='a'`` so the variable is added to the
    target file rather than replacing the file, and it is stored with zlib
    compression at level 9.
    """
    temp = variable.to_dataset(name=name)
    try:
        temp.to_netcdf(
            path=path,
            mode='a',
            encoding={name: {"zlib": True, "complevel": 9}},
        )
    finally:
        # Release the temporary dataset even when the write fails.
        temp.close()


## Main Body

In [None]:
# Load the fields that are clustered one time step at a time.
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/feb_apr_c.nc')

# Optional spatial subsampling (every 5th grid point), currently disabled:
# ds = ds.isel(
#     y=(np.arange(ds.y[0], ds.y[-1], 5)), 
#     x=(np.arange(ds.x[0], ds.x[-1], 5)))

# NOTE(review): the 2007-2024 subset and `years` are not used again in this
# cell -- the coordinates and the loop below are taken from the full `ds`,
# and `dataset` is rebound inside the loop. Confirm whether the clustering
# was meant to be restricted to these years.
dataset = ds.sel(time_counter=slice('2007', '2024'))
years = np.unique(dataset.time_counter.dt.year)

# Coordinates of the output array: every time step of `ds` on its (y, x) grid.
coords = {'time_counter': ds.time_counter, 'y': ds.y, 'x': ds.x}

# Variable names with the continuous ones (from 'Summation_of_solar_radiation'
# through 'Day_of_year', inclusive) removed.
# NOTE(review): `names` is not consulted below; only `name` is clustered.
names = list(ds.keys())
first_continuous = names.index('Summation_of_solar_radiation')
last_continuous = names.index('Day_of_year')
del names[first_continuous:last_continuous + 1]

# The single variable clustered by this run.
name = 'Z1_Z1'

clusters = datasets_preparation(coords)

# Cluster each time step independently and store its map.
n_steps = len(ds.time_counter)
for i in tqdm(range(n_steps)):
    dataset = ds.isel(time_counter=i)
    clusters[i] = clustering(dataset, name)

# Persist the full (time_counter, y, x) stack of cluster maps.
file_creation(clusters, name)
