In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AffinityPropagation
from dotenv import load_dotenv
import os
import warnings
warnings.filterwarnings("ignore")

# Pull settings from a local .env file into the process environment, then
# read the two directory settings (each is None when the variable is unset).
load_dotenv()
directory = os.environ.get("output_dir")
main_dir = os.environ.get("main_dir")

In [61]:
# Affinity Propagation estimator used for the clustering runs below.
# random_state is pinned so that re-running the notebook reproduces the same
# clusters; without it scikit-learn (>= 0.23) draws a new random starting
# state on every fit.
model = AffinityPropagation(random_state=0)

# Clustering with AP.
def clustering(dir, output_dir, model, dataset):
    """Cluster every CSV file in ``dir`` and save one label column per file.

    Each ``.csv`` file in ``dir`` is read in sorted filename order. Its first
    column (after discarding an ``"Unnamed: 0"`` column, if present) becomes
    the index, missing values are filled with the column medians, and
    ``model`` is fitted on the result. Every row is then labelled with the
    positional row index of its cluster exemplar, i.e.
    ``model.cluster_centers_indices_[model.labels_]``.

    Parameters
    ----------
    dir : str
        Directory containing the input ``.csv`` files.
    output_dir : str
        Directory that receives ``<dataset>.csv``. Created if it is missing.
    model : estimator
        Object providing ``fit`` and, after fitting, the
        ``cluster_centers_indices_`` and ``labels_`` attributes
        (e.g. ``sklearn.cluster.AffinityPropagation``). It is re-fitted for
        every file.
    dataset : str
        Base name (without extension) of the output CSV.

    Returns
    -------
    pandas.DataFrame
        One column per input file, named by the filename up to its first
        dot. The index of the last file read is applied to the whole table,
        so all files are assumed to list the same rows in the same order.

    Raises
    ------
    FileNotFoundError
        If ``dir`` contains no ``.csv`` files.
    RuntimeError
        If the fitted model reports no exemplars or unassigned (``-1``)
        labels for a file, which is how a failed fit shows up.
    """
    files = sorted(name for name in os.listdir(dir) if name.endswith(".csv"))
    if not files:
        raise FileNotFoundError(f"no .csv files found in {dir!r}")

    ap_res = {}
    for file in files:
        df = pd.read_csv(os.path.join(dir, file))

        # pandas names a header-less leading column "Unnamed: 0"; it carries
        # no feature information, so discard it before choosing the index.
        if "Unnamed: 0" in df.columns:
            df = df.drop(columns="Unnamed: 0")
        df = df.set_index(df.columns[0])
        df = df.fillna(df.median())

        # AP clustering.
        model.fit(df)
        exemplars = np.asarray(model.cluster_centers_indices_, dtype=int)
        labels = np.asarray(model.labels_, dtype=int)

        # A failed fit leaves no exemplars and/or -1 labels; fail with a
        # clear message instead of an opaque lookup error further down.
        if exemplars.size == 0 or (labels < 0).any():
            raise RuntimeError(
                f"clustering produced no valid clusters for {file!r} "
                "(the model probably did not converge)"
            )

        # Assigning cluster labels to the stations: replace each cluster
        # number with the row position of that cluster's exemplar.
        ap_res[file.split(".")[0]] = exemplars[labels].tolist()

    # Build the table once, after all files are processed.
    result = pd.DataFrame(ap_res).set_index(df.index)

    os.makedirs(output_dir, exist_ok=True)
    result.to_csv(os.path.join(output_dir, f'{dataset}.csv'), index=True)
    return result

# riverflow metrics clustering
# Run the clustering on the PCA output of each attribute set; every result
# table is written to the same "ap_results" folder.
datasets = ["func", "func_climate", "all_attributes"]
outputdir = os.path.join(directory, "ap_results")

for dataset in datasets:
    dir = os.path.join(directory, "pca_results", dataset)
    res = clustering(dir, outputdir, model, dataset)

# Display the table of the last dataset processed.
res

Unnamed: 0_level_0,2011_func_struct_climate_attrs,2012_func_struct_climate_attrs,2013_func_struct_climate_attrs,2014_func_struct_climate_attrs,2015_func_struct_climate_attrs,2016_func_struct_climate_attrs,2017_func_struct_climate_attrs,2018_func_struct_climate_attrs,2019_func_struct_climate_attrs,2020_func_struct_climate_attrs
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
02GA003,15,4,4,4,15,4,4,15,15,15
02GA005,1,1,13,1,1,13,1,13,1,1
02GA010,15,47,4,34,15,42,4,15,14,42
02GA014,1,1,13,3,1,13,1,13,1,1
02GA015,15,4,4,4,15,4,4,15,15,15
...,...,...,...,...,...,...,...,...,...,...
4215500,128,130,130,128,130,130,130,130,130,130
4216418,128,130,130,130,130,130,130,130,130,130
4217000,128,130,130,130,130,130,130,130,130,130
4218000,128,130,130,130,130,130,130,130,130,130


In [63]:
# check the number of clusters
# For every saved result table, count the distinct labels in each column
# (the 'station_id' column holds identifiers, not labels, so it is skipped).
dir = os.path.join(directory, 'ap_results')
paths = os.listdir(dir)
files = sorted(name for name in paths if name.endswith(".csv"))

ap_results = {}
for file in files:
    df = pd.read_csv(os.path.join(dir, file))
    ap_results.update(
        {col: df[col].nunique() for col in df.columns if col != 'station_id'}
    )

ap_results

{'2011_func_struct_climate_attrs': 13,
 '2012_func_struct_climate_attrs': 15,
 '2013_func_struct_climate_attrs': 14,
 '2014_func_struct_climate_attrs': 15,
 '2015_func_struct_climate_attrs': 15,
 '2016_func_struct_climate_attrs': 15,
 '2017_func_struct_climate_attrs': 15,
 '2018_func_struct_climate_attrs': 17,
 '2019_func_struct_climate_attrs': 15,
 '2020_func_struct_climate_attrs': 15,
 '2011_func_metrics': 10,
 '2012_func_metrics': 11,
 '2013_func_metrics': 12,
 '2014_func_metrics': 11,
 '2015_func_metrics': 9,
 '2016_func_metrics': 10,
 '2017_func_metrics': 10,
 '2018_func_metrics': 11,
 '2019_func_metrics': 12,
 '2020_func_metrics': 11,
 '2011_func_climate_attrs': 13,
 '2012_func_climate_attrs': 11,
 '2013_func_climate_attrs': 10,
 '2014_func_climate_attrs': 12,
 '2015_func_climate_attrs': 10,
 '2016_func_climate_attrs': 10,
 '2017_func_climate_attrs': 11,
 '2018_func_climate_attrs': 12,
 '2019_func_climate_attrs': 13,
 '2020_func_climate_attrs': 12}