In [2]:
from pca import PCA_Analysis
from dotenv import load_dotenv
import os
import pandas as pd
import glob
from tqdm import tqdm

# load .env file and pca class
load_dotenv()
directory = os.getenv("path")
output_dir = os.getenv("output_dir")
pca = PCA_Analysis()

Helper functions

In [26]:
# selecting top attributes from computed pca
def select_top_attr(all_top_attr, attribute) -> list:
    selected_attr = []
    top_attr = all_top_attr[attribute]
    attrs_list = list(top_attr.values())

    for i in range(4):
        selected_attr.append(attrs_list[0][i])
    for i in range(2):
        selected_attr.append(attrs_list[1][i])
    return list(set(selected_attr))

structural attributes (Time-invariant)

In [27]:
def structural_pca() -> dict:
    struct_path = os.path.join(directory, "data","raw_datasets", "structural_attributes", "140_TI_variables")
    stations_list = pd.read_csv(os.path.join(directory,  "data","raw_datasets", "structural_attributes", "stations_list.csv")).stations.tolist()

    struct_files = os.listdir(struct_path)
    struct_files = [file for file in struct_files if file.endswith(".csv")]

    all_top_load_dfs, all_top_attr = [], {}

    for file in struct_files:
        attr_type = file.split("_")[1].split(".")[0]
        input_struct_data = pd.read_csv(os.path.join(struct_path, file))

        input_struct_data = input_struct_data[input_struct_data["station_id"].isin(stations_list)]

        struct_data = pd.concat([input_struct_data["station_id"], input_struct_data.iloc[:, 4:]], axis = 1)
        struct_data = struct_data.set_index("station_id")

        loadings = pca.loadings(struct_data)

        explained_var = pca.explained_variance(struct_data)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_attr = pca.top_attributes(loadings, 5)
        all_top_attr[attr_type] = top_attr
        new_keys = {'PC1': f'{attr_type}_PC1_{pc1_val}', 'PC2': f'{attr_type}_PC2_{pc2_val}'}
        renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
        top_load_df = pd.DataFrame(renamed_top_attr)
        all_top_load_dfs.append(top_load_df)
    # return  pd.concat(all_top_load_dfs, axis = 1)
    # out_put_df.to_csv(os.path.join(directory, "pca_results", "TI_top_attributes.csv"), index=False)
    return all_top_attr

structural_pca()["soil"]


{'PC1': ['std_topsoil_sand',
  'median_topsoil_sand',
  'std_topsoil_silt',
  'median_topsoil_silt',
  'std_topsoil_clay'],
 'PC2': ['std_topsoil_clay',
  'median_topsoil_sand',
  'median_soil_depth',
  'std_topsoil_sand',
  'std_topsoil_silt']}

In [28]:
# Generate new datasets with selected attributes
def generate_struct_df() -> pd.DataFrame:
    attr_collection = []
    struct_pca = structural_pca()
    for key in struct_pca.keys():
        attributes = select_top_attr(struct_pca, key)
        path = os.path.join(
            directory,
            "data",
            "raw_datasets",
            "structural_attributes",
            "140_TI_variables",
            f"leb_{key}.csv",
        )
        stations_list = pd.read_csv(
            os.path.join(
                directory,
                "data",
                "raw_datasets",
                "structural_attributes",
                "stations_list.csv",
            )
        ).stations.tolist()
        # input_struct_data = input_struct_data[input_struct_data["station_id"].isin(stations_list)]

        att_file = pd.read_csv(path)
        _133_stations_df = att_file[att_file["station_id"].isin(stations_list)]
        _133_stations_df = _133_stations_df[attributes]
        attr_collection.append(_133_stations_df)

    return pd.concat(attr_collection, axis=1).reset_index(drop=True)

generate_struct_df()

Unnamed: 0,std_topsoil_silt,std_topsoil_clay,std_topsoil_sand,median_topsoil_silt,median_topsoil_sand,%cropland,%forest,%wetland,%water,%grassland,std_elevation,max_elevation,range_elevation,median_elevation,mean_elevation
0,8.170080,8.705036,15.565180,50,26,68.998906,15.832885,0.664702,1.392658,0.437804,62.273206,539,280,411,408.804259
1,5.104138,6.927044,12.031182,50,17,84.426076,10.223111,0.260911,0.057743,0.280158,18.732971,508,105,462,456.647797
2,5.926256,9.597431,13.566371,50,17,80.612989,11.655630,0.249661,0.549254,0.220120,36.743025,441,194,363,354.024238
3,7.847713,8.844053,13.724722,50,26,74.802225,15.943270,1.717514,2.085602,0.799634,14.320739,539,102,487,490.430777
4,2.991387,4.134774,7.125058,36,50,61.204512,26.616951,0.477796,0.867342,0.340075,36.667031,506,211,384,385.497491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,2.715522,5.111560,3.409538,56,31,20.926085,62.425168,4.021247,0.341540,0.744922,93.604877,598,415,404,391.824850
129,3.155841,6.100140,4.228345,51,31,38.923298,49.756332,4.273898,0.862061,0.727606,70.777109,630,338,473,465.915367
130,3.521046,4.893305,4.476393,51,31,43.565460,39.755651,8.006014,0.989909,0.727843,95.489068,630,365,407,404.317641
131,7.280444,3.999675,7.571920,51,23,45.486890,28.492574,17.236124,0.699812,0.528546,106.358210,630,455,278,314.382768


crop inventories (Time-variable)

In [29]:
def crop_pca() -> dict:
    crops_path = os.path.join(directory, "data", "raw_datasets", "structural_attributes", "crop_inventories")

    crop_inventories_files = glob.glob(f"{crops_path}/*.csv")

    all_crop_top_load_dfs, all_top_attr  = [], {}

    for path in crop_inventories_files:
        crop_yr_df = pd.read_csv(path)
        crop_yr_df = crop_yr_df.set_index(crop_yr_df.columns[0])

        # crop pca analysis
        # pca_df = pca.pca_analysis(crop_yr_df)
        loadings = pca.loadings(crop_yr_df)

        explained_var = pca.explained_variance(crop_yr_df)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_attr = pca.top_attributes(loadings, 5)
        all_top_attr[path.split("/")[-1].split(".")[0]] = top_attr
        new_keys = {'PC1': f'{path.split("/")[-1].split(".")[0]}_PC1_{pc1_val}', 'PC2': f'{path.split("/")[-1].split(".")[0]}_PC2_{pc2_val}'}
        renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
        top_load_df = pd.DataFrame(renamed_top_attr)
        all_crop_top_load_dfs.append(top_load_df)

    # crop_out_put_df = pd.concat(all_crop_top_load_dfs, axis = 1)
    # crop_out_put_df = crop_out_put_df.reindex(sorted(crop_out_put_df.columns), axis=1)
    # crop_out_put_df.to_csv(os.path.join(directory, "pca_results", "crop_top_attributes.csv"), index=False)

    return all_top_attr

print(crop_pca())

{'crop_inventory_2012': {'PC1': ['grapes2012', 'springwheat2012', 'oats2012', 'tomatoes2012', 'sod2012'], 'PC2': ['soyabeans2012', 'winterwheat2012', 'tomatoes2012', 'corn2012', 'springwheat2012']}, 'crop_inventory_2013': {'PC1': ['grapes2013', 'springwheat2013', 'tomatoes2013', 'fallow2013', 'winterwheat2013'], 'PC2': ['soyabeans2013', 'corn2013', 'winterwheat2013', 'fallow2013', 'tomatoes2013']}, 'crop_inventory_2011': {'PC1': ['oats2011', 'springwheat2011', 'grapes2011', 'tomatoes2011', 'peas2011'], 'PC2': ['soyabeans2011', 'winterwheat2011', 'corn2011', 'tomatoes2011', 'fallow2011']}, 'crop_inventory_2014': {'PC1': ['springwheat2014', 'grapes2014', 'winterwheat2014', 'tomatoes2014', 'potatoes2014'], 'PC2': ['soyabeans2014', 'corn2014', 'fallow2014', 'winterwheat2014', 'peas2014']}, 'crop_inventory_2015': {'PC1': ['tomatoes2015', 'grapes2015', 'springwheat2015', 'potatoes2015', 'winterwheat2015'], 'PC2': ['potatoes2015', 'winterwheat2015', 'grapes2015', 'soyabeans2015', 'canola2015'

Combining TI (soil, lucl, terrain) and TV (crop inventory) datasets 

In [30]:
# helper function to select top attributes for year variant attributes.
def selectTopAtttributesForYear(datatype: dict) -> list:
    pca_selected_attr = []
    for _,val in datatype.items():
        yearTopAttributes, topAttributes = [], []
        for _,v in val.items():
            yearTopAttributes.append(v)

        for i in range(4):
            topAttributes.append(yearTopAttributes[0][i])
        for i in range(2):
            topAttributes.append(yearTopAttributes[1][i])

        pca_selected_attr.append(list(set(topAttributes)))

    # selected_attr = []
    counter = {}
    for arr_attrs in pca_selected_attr:
        for attr in arr_attrs:
            attr = attr.split("2")[0]
            if attr in counter:
                counter[attr] += 1
            else:
                counter[attr] = 1

    count, selected_attr = 0, set()
    count_values = sorted(list(counter.values()), reverse=True)
    # print("counter_values", count_values)
    lookup_counter = min(count_values[:5])


    for key, value in counter.items():
        if value >= lookup_counter:
            count += 1
            selected_attr.add(key)

    return list(selected_attr)

# testing:
# crops_data = crop_pca()

In [31]:
# Generate crop + struct attributes combined dataset.
def generate_struct_crop_df(year:int) -> pd.DataFrame:
    crops_pca = crop_pca()
    selectedAttributesCrops = selectTopAtttributesForYear(crops_pca)
    selectedAttributesCropsWithYear = [0]*len(selectedAttributesCrops)

    for i in range(len(selectedAttributesCrops)):
        selectedAttributesCropsWithYear[i] = f"{selectedAttributesCrops[i]}{year}"

    for key in crops_pca.keys():
        if year == int(key.split("_")[-1]):
            # attributes = select_top_attr(crops_pca, key)
            path = os.path.join(directory, "data", "raw_datasets", "structural_attributes", "crop_inventories", f"{key}.csv")
            attr_file = pd.read_csv(path)
            attr_file = attr_file[selectedAttributesCropsWithYear]
            attr_file.columns = selectedAttributesCrops

            output_df = pd.concat([generate_struct_df(), attr_file], axis=1)
        else:
            pass

    return output_df

generate_struct_crop_df(2012)

Unnamed: 0,std_topsoil_silt,std_topsoil_clay,std_topsoil_sand,median_topsoil_silt,median_topsoil_sand,%cropland,%forest,%wetland,%water,%grassland,...,max_elevation,range_elevation,median_elevation,mean_elevation,corn,tomatoes,oats,soyabeans,springwheat,winterwheat
0,8.170080,8.705036,15.565180,50,26,68.998906,15.832885,0.664702,1.392658,0.437804,...,539,280,411,408.804259,18.061561,0.000000,0.000000,10.655077,0.0,0.000000
1,5.104138,6.927044,12.031182,50,17,84.426076,10.223111,0.260911,0.057743,0.280158,...,508,105,462,456.647797,23.698112,0.000000,0.000000,16.446032,0.0,0.000000
2,5.926256,9.597431,13.566371,50,17,80.612989,11.655630,0.249661,0.549254,0.220120,...,441,194,363,354.024238,25.089803,0.000000,0.000000,16.327990,0.0,0.000000
3,7.847713,8.844053,13.724722,50,26,74.802225,15.943270,1.717514,2.085602,0.799634,...,539,102,487,490.430777,10.929124,0.000000,0.000000,7.864170,0.0,0.000000
4,2.991387,4.134774,7.125058,36,50,61.204512,26.616951,0.477796,0.867342,0.340075,...,506,211,384,385.497491,12.710850,0.000000,0.000000,10.783461,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,2.715522,5.111560,3.409538,56,31,20.926085,62.425168,4.021247,0.341540,0.744922,...,598,415,404,391.824850,1.674956,0.000607,0.150118,0.327531,0.0,0.121308
129,3.155841,6.100140,4.228345,51,31,38.923298,49.756332,4.273898,0.862061,0.727606,...,630,338,473,465.915367,15.219172,0.000000,0.421673,0.407922,0.0,0.187346
130,3.521046,4.893305,4.476393,51,31,43.565460,39.755651,8.006014,0.989909,0.727843,...,630,365,407,404.317641,15.998103,0.000000,0.427356,0.697591,0.0,0.484688
131,7.280444,3.999675,7.571920,51,23,45.486890,28.492574,17.236124,0.699812,0.528546,...,630,455,278,314.382768,14.749155,0.001077,0.414476,2.293613,0.0,0.791003


feature engineer (functional attributes - riverflow metrics)

In [32]:
# metrics to drop.
drop_metrics = [
    "Station Name",
    "Country",
    "Watershed-Area",
    "Latitude",
    "Longitude",
    "YR-MaxFlow",
    "GM-MaxFlow",
    "NGM-MaxFlow",
    "YR-MinFlow",
    "GM-MinFlow",
    "NGM-MinFlow",
    "YR-MedianFlow",
    "GM-MedianFlow",
    "NGM-MedianFlow",
    "YR-Q95Flow",
    "GM-Q95Flow",
    "NGM-Q95Flow",
    "YR-Q5Flow",
    "GM-Q5Flow",
    "NGM-Q5Flow",
]

def functional_pca(drop_metrics: list = drop_metrics):
    functional_path = os.path.join(directory, "data", "raw_datasets", "functional_attributes", "133_riverflow")
    functional_files = glob.glob(f"{functional_path}/*.csv")

    all_top_load_dfs, all_top_metrics = [], {}

    for file in functional_files:
        df = pd.read_csv(file)
        df = df.set_index(df.columns[0])

        # drop metrics and checking for missing values.
        functional_df = df.loc[:, ~df.columns.str.startswith(tuple(drop_metrics))]
        functional_df = functional_df.fillna(functional_df.median())

        pca_loadings = pca.loadings(functional_df)

        explained_var = pca.explained_variance(functional_df)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_metrics = pca.top_attributes(pca_loadings, 5)
        all_top_metrics[file.split("/")[-1].split(".")[0]] = top_metrics
        new_keys = {'PC1': f'{file.split("/")[-1].split(".")[0]}_PC1_{pc1_val}', 'PC2': f'{file.split("/")[-1].split(".")[0]}_PC2_{pc2_val}'}
        renamed_top_metrics = {new_keys[key]: value for key, value in top_metrics.items()}
        yr_top_load_df = pd.DataFrame(renamed_top_metrics)
        all_top_load_dfs.append(yr_top_load_df)

    # out_put_df = pd.concat(all_top_load_dfs, axis = 1)
    # out_put_df =out_put_df.reindex(sorted(out_put_df.columns), axis=1)

    return all_top_metrics

functionalAttributes = functional_pca()
print(selectTopAtttributesForYear(functionalAttributes))


['YR-RBI-', 'NGM-RBI-', 'Specific-GM-MedianFlow-', 'Specific-NGM-MedianFlow-', 'YR-CVQ-', 'Specific-YR-MedianFlow-', 'Specific-GM-Q95-']


In [95]:
# pca for climate indices
def climate_pca():
    climate_path = os.path.join(directory, "data", "raw_datasets", "climate_indices")
    climate_files = glob.glob(f"{climate_path}/*.csv")

    all_top_load_dfs, all_top_metrics = [], {}

    for file in climate_files:
        df = pd.read_csv(file)

        stations_list = pd.read_csv(
        os.path.join(
                directory, "data", "raw_datasets", "structural_attributes", "stations_list.csv"
            )
        ).stations.tolist()

        stations_list = sorted(stations_list)

        df.columns.values[0] = "station_id"
        df['station_id'] = df['station_id'].apply(lambda val: val.lstrip("0") if val.startswith("04") else val)

        climate_df = df[df["station_id"].isin(stations_list)]

        # sort the climate_df
        climate_df = climate_df.sort_values("station_id")
        climate_df.index = range(len(climate_df))

        climate_df = climate_df.drop(columns=["station_id"])

        # drop metrics and checking for missing values.
        climatedfNomissingValues = climate_df.fillna(climate_df.median())

        pca_loadings = pca.loadings(climatedfNomissingValues)

        explained_var = pca.explained_variance(climatedfNomissingValues)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_metrics = pca.top_attributes(pca_loadings, 5)
        all_top_metrics[file.split("/")[-1].split(".")[0]] = top_metrics
        new_keys = {'PC1': f'{file.split("/")[-1].split(".")[0]}_PC1_{pc1_val}', 'PC2': f'{file.split("/")[-1].split(".")[0]}_PC2_{pc2_val}'}
        renamed_top_metrics = {new_keys[key]: value for key, value in top_metrics.items()}
        yr_top_load_df = pd.DataFrame(renamed_top_metrics)
        all_top_load_dfs.append(yr_top_load_df)

        out_put_df = pd.concat(all_top_load_dfs, axis = 1)
        out_put_df =out_put_df.reindex(sorted(out_put_df.columns), axis=1)

    return all_top_metrics

climateAttributes = climate_pca()
print(selectTopAtttributesForYear(climateAttributes))

['AnnualTotalPrcp_', 'AnnualSumOfRainOnSWE_', 'AnnualMaxSWE_', 'AnnualTotalSnowFall_', 'AnnualNumberRainOnSWEdays_']


In [None]:
# generate new dataset for AP classification (riverflows metrics - year and seasonal).
def selected_func_metrics() -> dict:
    stations_list = pd.read_csv(
        os.path.join(
            directory, "data", "raw_datasets", "structural_attributes", "stations_list.csv"
        )
    ).stations.tolist()

    func_metrics = {}

    # riveflow metrics:
    functionalAttributes =selectTopAtttributesForYear(functional_pca())
    for year in range(2011, 2021):
        functionalAttributesWithYear = [functionalAttributes[i] + str(year) for i in range(len(functionalAttributes))]

        functional_path = os.path.join(
            directory, "data", "raw_datasets", "functional_attributes", "133_riverflow", f'{year}.csv'
        )
        functional_df = pd.read_csv(functional_path)
        functional_df = functional_df[functionalAttributesWithYear]
        functional_df['station_id'] = stations_list
        functional_df = functional_df[["station_id"] + [col for col in functional_df.columns if col != "station_id"]]
        func_metrics[str(year)] = functional_df
        dir = os.path.join(output_dir, "func")
        functional_df.to_csv(dir +  f"/{year}_func_metrics.csv", index=False)

    return functional_df

selected_func_metrics()

In [None]:
# generate new dataset for AP classification (riverflows metrics - climate indices).
def selected_func_climate_attrs() -> pd.DataFrame:
    climateAttributes = selectTopAtttributesForYear(climate_pca())

    for year in tqdm(range(2011, 2021)):
        df = pd.read_csv(
            os.path.join(
                directory,
                "data",
                "raw_datasets",
                "climate_indices",
                f"climate_indices_{str(year)}.csv",
            )
        )

        stations_list = pd.read_csv(
        os.path.join(
                directory, "data", "raw_datasets", "structural_attributes", "stations_list.csv"
            )
        ).stations.tolist()

        stations_list = sorted(stations_list)

        df.columns.values[0] = "station_id"
        df['station_id'] = df['station_id'].apply(lambda val: val.lstrip("0") if val.startswith("04") else val)

        climate_df = df[df["station_id"].isin(stations_list)]

        # sort the climate_df
        climate_df = climate_df.sort_values("station_id")
        climate_df.index = range(len(climate_df))

        climateAttributesWithYear = [climateAttributes[i] + str(year) for i in range(len(climateAttributes))]
        climate_df = climate_df[climateAttributesWithYear]

        dir = os.path.join(output_dir, "func", f"{year}_func_metrics.csv")
        functional_df = pd.read_csv(dir)

        output_df = pd.concat([functional_df, climate_df], axis=1)

        output_df = output_df[["station_id"] + [col for col in output_df.columns if col != "station_id"]]

        output_df.to_csv(os.path.join(output_dir, "func_climate", f"{year}_func_climate_attrs.csv"), index=False)
        output_df.to_csv(
            os.path.join(output_dir, "func_climate", f"{year}_func_climate_attrs.csv")
        , index=False)

    return output_df

selected_func_climate_attrs()

In [121]:
# generate new dataset for AP classification (riverflow metrics + structural attributes + climate indices)
stations_list = pd.read_csv(os.path.join( directory, "data", "raw_datasets", "structural_attributes", "stations_list.csv")).stations.tolist()

stations_list = sorted(stations_list)


for year in tqdm(range(2011, 2021)):
    df = pd.read_csv(os.path.join(output_dir, "func_climate", f"{year}_func_climate_attrs.csv"))
    output_df = pd.concat([generate_struct_crop_df(year), df], axis=1)
    output_df['station_id'] = stations_list
    output_df = output_df[["station_id"] + [col for col in output_df.columns if col != "station_id"]]
    output_df.to_csv(os.path.join(output_dir, "all_attributes", f"{year}_func_struct_climate_attrs.csv"), index=False)

100%|██████████| 10/10 [00:00<00:00, 18.76it/s]


PCA TESTING and AP clustering for Flow metrics.

In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# metrics to drop.
drop_metrics = [
    "Station Name",
    "Country",
    "Watershed-Area",
    "Latitude",
    "Longitude",
    "YR-MaxFlow",
    "GM-MaxFlow",
    "NGM-MaxFlow",
    "YR-MinFlow",
    "GM-MinFlow",
    "NGM-MinFlow",
    "YR-MedianFlow",
    "GM-MedianFlow",
    "NGM-MedianFlow",
    "YR-Q95Flow",
    "GM-Q95Flow",
    "NGM-Q95Flow",
    "YR-Q5Flow",
    "GM-Q5Flow",
    "NGM-Q5Flow",
]


functional_path = os.path.join(directory, "data", "raw_datasets", "functional_attributes", "133_riverflow", "2011.csv")
functional_df = pd.read_csv(functional_path)
functional_df = functional_df.set_index(functional_df.columns[0])
functional_df = functional_df.loc[:, ~functional_df.columns.str.startswith(tuple(drop_metrics))]
functional_df = functional_df.fillna(functional_df.median())

scaler = StandardScaler()
scaled_df = scaler.fit_transform(functional_df)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(scaled_df)

principalDf = pd.DataFrame(data = principalComponents, columns = ['PC1', 'PC2'])

# explained_variance = pca.explained_variance_ratio_
explained_variance_ratio = pca.explained_variance_ratio_
# print("Explained Variance Ratio:", explained_variance_ratio)


cumulative_variance_ratio = explained_variance_ratio.cumsum()
n_components = len(cumulative_variance_ratio[cumulative_variance_ratio <= 0.95])
# print("Number of Principal Components to retain 95% variance:", n_components)


# # loadings
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

loadings_df = pd.DataFrame(loadings, columns = ['PC1', 'PC2'], index = functional_df.columns)

pc1 = loadings_df['PC1']


# use th value of pc1 to get the top 5 attributes
topLoadingAttributes_pc1 = pc1.abs().sort_values(ascending=False).head(5).index
topLoadingAttributes_pc2 = loadings_df['PC2'].abs().sort_values(ascending=False).head(2).index

# topLoadingAttributes = list(set(topLoadingAttributes_pc1).union(set(topLoadingAttributes_pc2)))

topLoadingAttributes = [
        "YR-RBI-2011",
        "NGM-RBI-2011",
        "Specific-GM-MedianFlow-2011",
        "Specific-NGM-MedianFlow-2011",
        "YR-CVQ-2011",
        "Specific-YR-MedianFlow-2011",
        "Specific-GM-Q95-2011",
        ]


resultedTopAttributes_df = functional_df[topLoadingAttributes]

# AP
from sklearn.cluster import AffinityPropagation
import warnings
warnings.filterwarnings("ignore")
model = AffinityPropagation(damping=0.9, verbose=2)

# fit the model
resultedTopAttributes_df = resultedTopAttributes_df.fillna(resultedTopAttributes_df.median())
model.fit(resultedTopAttributes_df)
labels = model.labels_

ap_res = {}
ap_res[functional_path.split("/")[-1].split(".")[0]] = list(labels)
result = pd.DataFrame(ap_res).set_index(resultedTopAttributes_df.index)

# min and max values of the labels
print("Min Label:", min(labels), "Max Label:", max(labels))

Converged after 15 iterations.
Min Label: 0 Max Label: 16
