In [1]:
from pca import PCA_Analysis
from dotenv import load_dotenv
import os
import pandas as pd
import glob
from tqdm import tqdm

# load .env file and pca class
load_dotenv()
directory = os.getenv("path")
output_dir = os.getenv("output_dir")
pca = PCA_Analysis()

Helper functions

In [2]:
# selecting top attributes from computed pca
def select_top_attr(all_top_attr, attribute) -> list:
    selected_attr = []
    top_attr = all_top_attr[attribute]
    attrs_list = list(top_attr.values())

    for i in range(4):
        selected_attr.append(attrs_list[0][i])
    for i in range(2):
        selected_attr.append(attrs_list[1][i])
    return list(set(selected_attr))

structural attributes (Time-invariant)

In [3]:
def structural_pca() -> dict:
    # struct_path = os.path.join(directory, "data","raw_datasets", "structural_attributes", "140_TI_variables")
    struct_path = os.path.join(directory, "computed_data", "TI_variables")

    stations_list = pd.read_csv(os.path.join(directory, "raw_data", "stations_list.csv")).stations.tolist()

    struct_files = os.listdir(struct_path)
    struct_files = [file for file in struct_files if file.endswith(".csv")]

    all_top_load_dfs, all_top_attr = [], {}

    for file in struct_files:
        attr_type = file.split("_")[1].split(".")[0]
        input_struct_data = pd.read_csv(os.path.join(struct_path, file))

        input_struct_data = input_struct_data[input_struct_data["station_id"].isin(stations_list)]

        struct_data = pd.concat([input_struct_data["station_id"], input_struct_data.iloc[:, 4:]], axis = 1)
        struct_data = struct_data.set_index("station_id")

        loadings = pca.loadings(struct_data)

        explained_var = pca.explained_variance(struct_data)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_attr = pca.top_attributes(loadings, 5)
        all_top_attr[attr_type] = top_attr
        new_keys = {'PC1': f'{attr_type}_PC1_{pc1_val}', 'PC2': f'{attr_type}_PC2_{pc2_val}'}
        renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
        top_load_df = pd.DataFrame(renamed_top_attr)
        all_top_load_dfs.append(top_load_df)
    # return  pd.concat(all_top_load_dfs, axis = 1)
    # out_put_df.to_csv(os.path.join(directory, "pca_results", "TI_top_attributes.csv"), index=False)
    return all_top_attr

structural_pca()

{'soil': {'PC1': ['mean_topsoil_sand',
   'max_topsoil_sand',
   'median_topsoil_sand',
   'mean_topsoil_silt',
   'min_topsoil_silt'],
  'PC2': ['std_topsoil_clay',
   'min_topsoil_sand',
   'std_topsoil_sand',
   'std_topsoil_silt',
   'median_soil_depth']},
 'surficial': {'PC1': ['Alluvium',
   'Kames and esker',
   'Undefined',
   'Lacustrine',
   'End moraine'],
  'PC2': ['Ground moraine',
   'End moraine',
   'Lacustrine',
   'Undifferentiated',
   'Outwash and dunes']},
 'lulcstats': {'PC1': ['%cropland',
   '%grassland',
   '%wetland',
   '%water',
   '%urban'],
  'PC2': ['%forest', '%grassland', '%water', '%urban', '%wetland']},
 'terrain': {'PC1': ['max_elevation',
   'mean_elevation',
   'median_elevation',
   'range_elevation',
   'std_elevation'],
  'PC2': ['min_elevation',
   'max_slope',
   'range_elevation',
   'median_elevation',
   'mean_elevation']}}

In [10]:
# Generate new datasets with selected attributes
def generate_struct_df() -> pd.DataFrame:
    attr_collection = []
    struct_pca = structural_pca()
    for key in struct_pca.keys():
        attributes = select_top_attr(struct_pca, key)
        path = os.path.join(
            directory,
            "computed_data",
            "TI_variables",
            f"leb_{key}.csv",
        )
        stations_list = pd.read_csv(os.path.join(directory, "raw_data", "stations_list.csv")).stations.tolist()
        # input_struct_data = input_struct_data[input_struct_data["station_id"].isin(stations_list)]

        att_file = pd.read_csv(path)
        _133_stations_df = att_file[att_file["station_id"].isin(stations_list)]
        _133_stations_df = _133_stations_df[attributes]
        attr_collection.append(_133_stations_df)

    return pd.concat(attr_collection, axis=1).reset_index(drop=True)

generate_struct_df()

Unnamed: 0,mean_topsoil_silt,median_topsoil_sand,min_topsoil_sand,std_topsoil_clay,mean_topsoil_sand,max_topsoil_sand,Undefined,End moraine,Ground moraine,Alluvium,...,%water,%grassland,%wetland,%cropland,median_elevation,mean_elevation,min_elevation,range_elevation,max_elevation,max_slope
0,44.782889,26,17,8.705036,33.512144,60,0.916557,0.000000,70.841742,0.000000,...,1.392658,0.437804,0.664702,68.998906,411,408.804259,259,280,539,31.821241
1,47.790384,17,17,6.927044,22.208381,50,0.009040,0.000000,94.796098,0.000000,...,0.057743,0.280158,0.260911,84.426076,462,456.647797,403,105,508,13.121887
2,46.808674,17,17,9.597431,28.581898,64,0.000000,0.000000,62.611159,0.240400,...,0.549254,0.220120,0.249661,80.612989,363,354.024238,247,194,441,27.372448
3,50.330775,26,17,8.844053,27.425381,60,2.016131,0.000000,74.117491,0.000000,...,2.085602,0.799634,1.717514,74.802225,487,490.430777,437,102,539,16.726595
4,36.174353,50,17,4.134774,49.640914,60,0.003601,0.000000,62.288441,0.000000,...,0.867342,0.340075,0.477796,61.204512,384,385.497491,295,211,506,25.869267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,54.815661,31,23,5.111560,29.625678,41,0.088710,33.681957,36.586062,3.663547,...,0.341540,0.744922,4.021247,20.926085,404,391.824850,183,415,598,27.470636
129,52.721364,31,23,6.100140,27.720180,33,0.110957,25.047208,48.042152,3.170166,...,0.862061,0.727606,4.273898,38.923298,473,465.915367,292,338,630,35.190132
130,50.526021,31,23,4.893305,29.071465,33,0.264663,31.018234,49.985005,3.230700,...,0.989909,0.727843,8.006014,43.565460,407,404.317641,265,365,630,35.190132
131,52.532807,23,11,3.999675,26.028401,33,0.066045,32.012867,57.727028,5.689437,...,0.699812,0.528546,17.236124,45.486890,278,314.382768,175,455,630,35.190132


crop inventories (Time-variable)

In [11]:
def crop_pca() -> dict:
    crops_path = os.path.join(directory, "computed_data", "crop_data")

    crop_inventories_files = glob.glob(f"{crops_path}/*.csv")

    all_crop_top_load_dfs, all_top_attr  = [], {}

    for path in crop_inventories_files:
        crop_yr_df = pd.read_csv(path)
        crop_yr_df = crop_yr_df.set_index(crop_yr_df.columns[0])

        # crop pca analysis
        # pca_df = pca.pca_analysis(crop_yr_df)
        loadings = pca.loadings(crop_yr_df)

        explained_var = pca.explained_variance(crop_yr_df)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_attr = pca.top_attributes(loadings, 5)
        all_top_attr[path.split("/")[-1].split(".")[0]] = top_attr
        new_keys = {'PC1': f'{path.split("/")[-1].split(".")[0]}_PC1_{pc1_val}', 'PC2': f'{path.split("/")[-1].split(".")[0]}_PC2_{pc2_val}'}
        renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
        top_load_df = pd.DataFrame(renamed_top_attr)
        all_crop_top_load_dfs.append(top_load_df)

    # crop_out_put_df = pd.concat(all_crop_top_load_dfs, axis = 1)
    # crop_out_put_df = crop_out_put_df.reindex(sorted(crop_out_put_df.columns), axis=1)
    # crop_out_put_df.to_csv(os.path.join(directory, "pca_results", "crop_top_attributes.csv"), index=False)

    return all_top_attr

print(crop_pca())

{'crop_inventory_2012': {'PC1': ['grapes2012', 'springwheat2012', 'oats2012', 'tomatoes2012', 'sod2012'], 'PC2': ['soyabeans2012', 'winterwheat2012', 'tomatoes2012', 'corn2012', 'springwheat2012']}, 'crop_inventory_2013': {'PC1': ['grapes2013', 'springwheat2013', 'tomatoes2013', 'fallow2013', 'winterwheat2013'], 'PC2': ['soyabeans2013', 'corn2013', 'winterwheat2013', 'fallow2013', 'tomatoes2013']}, 'crop_inventory_2011': {'PC1': ['oats2011', 'springwheat2011', 'grapes2011', 'tomatoes2011', 'peas2011'], 'PC2': ['soyabeans2011', 'winterwheat2011', 'corn2011', 'tomatoes2011', 'fallow2011']}, 'crop_inventory_2014': {'PC1': ['springwheat2014', 'grapes2014', 'winterwheat2014', 'tomatoes2014', 'potatoes2014'], 'PC2': ['soyabeans2014', 'corn2014', 'fallow2014', 'winterwheat2014', 'peas2014']}, 'crop_inventory_2015': {'PC1': ['tomatoes2015', 'grapes2015', 'springwheat2015', 'potatoes2015', 'winterwheat2015'], 'PC2': ['potatoes2015', 'winterwheat2015', 'grapes2015', 'soyabeans2015', 'canola2015'

Combining TI (soil, lucl, terrain) and TV (crop inventory) datasets 

In [13]:
# helper function to select top attributes for year variant attributes.
def selectTopAtttributesForYear(datatype: dict) -> list:
    pca_selected_attr = []
    for _,val in datatype.items():
        yearTopAttributes, topAttributes = [], []
        for _,v in val.items():
            yearTopAttributes.append(v)

        for i in range(4):
            topAttributes.append(yearTopAttributes[0][i])
        for i in range(2):
            topAttributes.append(yearTopAttributes[1][i])

        pca_selected_attr.append(list(set(topAttributes)))

    # selected_attr = []
    counter = {}
    for arr_attrs in pca_selected_attr:
        for attr in arr_attrs:
            attr = attr.split("2")[0]
            if attr in counter:
                counter[attr] += 1
            else:
                counter[attr] = 1

    count, selected_attr = 0, set()
    count_values = sorted(list(counter.values()), reverse=True)
    # print("counter_values", count_values)
    lookup_counter = min(count_values[:5])

    for key, value in counter.items():
        if value >= lookup_counter:
            count += 1
            selected_attr.add(key)

    return list(selected_attr)

# testing:
# crops_data = crop_pca()

In [18]:
# Generate crop + struct attributes combined dataset.
def generate_struct_crop_df(year:int) -> pd.DataFrame:
    crops_pca = crop_pca()
    selectedAttributesCrops = selectTopAtttributesForYear(crops_pca)
    selectedAttributesCropsWithYear = [0]*len(selectedAttributesCrops)

    for i in range(len(selectedAttributesCrops)):
        selectedAttributesCropsWithYear[i] = f"{selectedAttributesCrops[i]}{year}"

    for key in crops_pca.keys():
        if year == int(key.split("_")[-1]):
            # attributes = select_top_attr(crops_pca, key)
            path = os.path.join(directory, "computed_data","crop_data", f"{key}.csv")
            attr_file = pd.read_csv(path)
            attr_file = attr_file[selectedAttributesCropsWithYear]
            attr_file.columns = selectedAttributesCrops

            output_df = pd.concat([generate_struct_df(), attr_file], axis=1)
        else:
            pass

    return output_df

generate_struct_crop_df(2015).columns

Index(['mean_topsoil_silt', 'median_topsoil_sand', 'min_topsoil_sand',
       'std_topsoil_clay', 'mean_topsoil_sand', 'max_topsoil_sand',
       'Undefined', 'End moraine', 'Ground moraine', 'Alluvium', 'Lacustrine',
       'Kames and esker', '%forest', '%water', '%grassland', '%wetland',
       '%cropland', 'median_elevation', 'mean_elevation', 'min_elevation',
       'range_elevation', 'max_elevation', 'max_slope', 'tomatoes', 'oats',
       'soyabeans', 'winterwheat', 'springwheat', 'corn'],
      dtype='object')

feature engineer (functional attributes - riverflow metrics)

In [19]:
# metrics to drop.
drop_metrics = [
    "Station Name",
    "Country",
    "Watershed-Area",
    "Latitude",
    "Longitude",
    "YR-MaxFlow",
    "GM-MaxFlow",
    "NGM-MaxFlow",
    "YR-MinFlow",
    "GM-MinFlow",
    "NGM-MinFlow",
    "YR-MedianFlow",
    "GM-MedianFlow",
    "NGM-MedianFlow",
    "YR-Q95Flow",
    "GM-Q95Flow",
    "NGM-Q95Flow",
    "YR-Q5Flow",
    "GM-Q5Flow",
    "NGM-Q5Flow",
]

def functional_pca(drop_metrics: list = drop_metrics):
    functional_path = os.path.join(directory, "computed_data", "flow_data")
    functional_files = glob.glob(f"{functional_path}/*.csv")

    all_top_load_dfs, all_top_metrics = [], {}

    for file in functional_files:
        df = pd.read_csv(file)
        df = df.set_index(df.columns[0])

        # drop metrics and checking for missing values.
        functional_df = df.loc[:, ~df.columns.str.startswith(tuple(drop_metrics))]
        functional_df = functional_df.fillna(functional_df.median())

        pca_loadings = pca.loadings(functional_df)

        explained_var = pca.explained_variance(functional_df)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_metrics = pca.top_attributes(pca_loadings, 5)
        all_top_metrics[file.split("/")[-1].split(".")[0]] = top_metrics
        new_keys = {'PC1': f'{file.split("/")[-1].split(".")[0]}_PC1_{pc1_val}', 'PC2': f'{file.split("/")[-1].split(".")[0]}_PC2_{pc2_val}'}
        renamed_top_metrics = {new_keys[key]: value for key, value in top_metrics.items()}
        yr_top_load_df = pd.DataFrame(renamed_top_metrics)
        all_top_load_dfs.append(yr_top_load_df)

    # out_put_df = pd.concat(all_top_load_dfs, axis = 1)
    # out_put_df =out_put_df.reindex(sorted(out_put_df.columns), axis=1)

    return all_top_metrics

functionalAttributes = functional_pca()
# print(selectTopAtttributesForYear(functionalAttributes))
print(functionalAttributes)


{'metrics2013': {'PC1': ['YR-CVQ-2013', 'NGM-CVQ-2013', 'NGM-RBI-2013', 'GM-CVQ-2013', 'YR-RBI-2013'], 'PC2': ['Specific-NGM-MedianFlow-2013', 'Specific-YR-MedianFlow-2013', 'Specific-GM-Q95-2013', 'Specific-YR-Q95-2013', 'Specific-YR-MaxFlow-2013']}, 'metrics2012': {'PC1': ['YR-CVQ-2012', 'NGM-CVQ-2012', 'YR-RBI-2012', 'NGM-RBI-2012', 'GM-RBI-2012'], 'PC2': ['Specific-NGM-MedianFlow-2012', 'Specific-YR-MedianFlow-2012', 'Specific-YR-Q95-2012', 'Specific-GM-Q95-2012', 'NGM-DOY-MaxFlow-2012']}, 'metrics2011': {'PC1': ['YR-CVQ-2011', 'NGM-CVQ-2011', 'GM-CVQ-2011', 'GM-RBI-2011', 'YR-RBI-2011'], 'PC2': ['Specific-YR-MedianFlow-2011', 'Specific-GM-MedianFlow-2011', 'Specific-NGM-MedianFlow-2011', 'YR-DOY-MaxFlow-2011', 'NGM-DOY-MaxFlow-2011']}, 'metrics2015': {'PC1': ['YR-CVQ-2015', 'NGM-CVQ-2015', 'GM-CVQ-2015', 'Specific-GM-Q5-2015', 'Specific-YR-Q5-2015'], 'PC2': ['Specific-NGM-Q5-2015', 'Specific-NGM-Q95-2015', 'Specific-NGM-MedianFlow-2015', 'Specific-YR-MedianFlow-2015', 'Specific-YR

In [21]:
# pca for climate indices
def climate_pca():
    climate_path = os.path.join(directory, "computed_data", "climate_data")
    climate_files = glob.glob(f"{climate_path}/*.csv")

    all_top_load_dfs, all_top_metrics = [], {}

    for file in climate_files:
        df = pd.read_csv(file)

        stations_list = pd.read_csv(os.path.join(directory, "raw_data", "stations_list.csv")).stations.tolist()
        stations_list = sorted(stations_list)

        df.columns.values[0] = "station_id"
        df['station_id'] = df['station_id'].apply(lambda val: val.lstrip("0") if val.startswith("04") else val)

        climate_df = df[df["station_id"].isin(stations_list)]

        # sort the climate_df
        climate_df = climate_df.sort_values("station_id")
        climate_df.index = range(len(climate_df))

        climate_df = climate_df.drop(columns=["station_id"])

        # drop metrics and checking for missing values.
        climatedfNomissingValues = climate_df.fillna(climate_df.median())

        pca_loadings = pca.loadings(climatedfNomissingValues)

        explained_var = pca.explained_variance(climatedfNomissingValues)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_metrics = pca.top_attributes(pca_loadings, 5)
        all_top_metrics[file.split("/")[-1].split(".")[0]] = top_metrics
        new_keys = {'PC1': f'{file.split("/")[-1].split(".")[0]}_PC1_{pc1_val}', 'PC2': f'{file.split("/")[-1].split(".")[0]}_PC2_{pc2_val}'}
        renamed_top_metrics = {new_keys[key]: value for key, value in top_metrics.items()}
        yr_top_load_df = pd.DataFrame(renamed_top_metrics)
        all_top_load_dfs.append(yr_top_load_df)

        out_put_df = pd.concat(all_top_load_dfs, axis = 1)
        out_put_df =out_put_df.reindex(sorted(out_put_df.columns), axis=1)

    return all_top_metrics

climateAttributes = climate_pca()
print(selectTopAtttributesForYear(climateAttributes))

['AnnualTotalPrcp_', 'AnnualTotalSnowFall_', 'AnnualMaxSWE_', 'AnnualSumOfRainOnSWE_', 'AnnualNumberRainOnSWEdays_']


Generating Datasets

In [22]:
# generate new dataset for AP classification (riverflows metrics - year and seasonal).
def selected_func_metrics() -> dict:

    stations_list = pd.read_csv(os.path.join(directory, "raw_data", "stations_list.csv")).stations.tolist()

    func_metrics = {}

    # riveflow metrics:
    functionalAttributes = selectTopAtttributesForYear(functional_pca())
    for year in range(2011, 2021):
        functionalAttributesWithYear = [functionalAttributes[i] + str(year) for i in range(len(functionalAttributes))]

        functional_path = os.path.join(directory, "computed_data", "flow_data", f'metrics{year}.csv')
        functional_df = pd.read_csv(functional_path)
        functional_df = functional_df[functionalAttributesWithYear]
        functional_df['station_id'] = stations_list
        functional_df = functional_df[["station_id"] + [col for col in functional_df.columns if col != "station_id"]]
        func_metrics[str(year)] = functional_df
        dir = os.path.join(output_dir, "pca_results","func")
        # functional_df.to_csv(dir +  f"/{year}_func_metrics.csv", index=False)

    return functional_df

selected_func_metrics()

Unnamed: 0,station_id,NGM-RBI-2020,YR-CVQ-2020,GM-CVQ-2020,NGM-CVQ-2020,Specific-YR-MedianFlow-2020
0,02GA003,0.24,1.3909,0.3529,1.2703,0.607088
1,02GA005,0.54,2.9480,1.1331,2.4142,0.564604
2,02GA010,0.39,2.1678,0.8050,1.9489,0.497875
3,02GA014,0.41,1.9411,1.3275,1.6394,0.598141
4,02GA015,0.15,1.2562,0.4745,1.1294,0.514268
...,...,...,...,...,...,...
128,4215500,0.48,1.2210,1.5750,0.9835,0.931192
129,4216418,0.45,1.3795,1.5005,1.1497,0.778192
130,4217000,0.37,1.3004,1.4373,1.0703,0.583968
131,4218000,0.22,1.0934,1.2758,0.8586,0.668873


In [23]:
# generate new dataset for AP classification (riverflows metrics - climate indices).
def selected_func_climate_attrs() -> pd.DataFrame:
    climateAttributes = selectTopAtttributesForYear(climate_pca())

    for year in tqdm(range(2011, 2021)):
        df = pd.read_csv(
            os.path.join(
                directory,
                "computed_data",
                "climate_data",
                f"climate_indices_{str(year)}.csv",
            )
        )

        stations_list = pd.read_csv(os.path.join(directory, "raw_data", "stations_list.csv")).stations.tolist()

        stations_list = sorted(stations_list)

        df.columns.values[0] = "station_id"
        df['station_id'] = df['station_id'].apply(lambda val: val.lstrip("0") if val.startswith("04") else val)

        climate_df = df[df["station_id"].isin(stations_list)]

        # sort the climate_df
        climate_df = climate_df.sort_values("station_id")
        climate_df.index = range(len(climate_df))

        climateAttributesWithYear = [climateAttributes[i] + str(year) for i in range(len(climateAttributes))]
        climate_df = climate_df[climateAttributesWithYear]

        dir = os.path.join(output_dir, "pca_results", "func", f"{year}_func_metrics.csv")
        functional_df = pd.read_csv(dir)

        output_df = pd.concat([functional_df, climate_df], axis=1)

        output_df = output_df[["station_id"] + [col for col in output_df.columns if col != "station_id"]]

        # output_df.to_csv(os.path.join(output_dir, "pca_results", "func_climate", f"{year}_func_climate_attrs.csv"), index=False)

    return output_df

selected_func_climate_attrs()

100%|██████████| 10/10 [00:00<00:00, 414.41it/s]


Unnamed: 0,station_id,NGM-CVQ-2020,YR-CVQ-2020,GM-CVQ-2020,Specific-YR-MedianFlow-2020,NGM-RBI-2020,AnnualTotalPrcp_2020,AnnualTotalSnowFall_2020,AnnualMaxSWE_2020,AnnualSumOfRainOnSWE_2020,AnnualNumberRainOnSWEdays_2020
0,02GA003,1.2703,1.3909,0.3529,0.607088,0.24,1010.713959,195.079240,81.669326,203.024210,27.314734
1,02GA005,2.4142,2.9480,1.1331,0.564604,0.54,1095.710944,220.902053,106.514529,245.755275,30.729560
2,02GA010,1.9489,2.1678,0.8050,0.497875,0.39,882.545058,151.788813,46.130942,158.554805,20.182011
3,02GA014,1.6394,1.9411,1.3275,0.598141,0.41,1198.672760,262.683061,131.524397,256.889547,36.047934
4,02GA015,1.1294,1.2562,0.4745,0.514268,0.15,966.132549,189.796726,81.581732,198.676942,28.311284
...,...,...,...,...,...,...,...,...,...,...,...
128,4215500,0.9835,1.2210,1.5750,0.931192,0.48,1246.779627,199.612023,68.894627,196.550519,33.369565
129,4216418,1.1497,1.3795,1.5005,0.778192,0.45,1207.922529,221.861610,80.690176,216.901936,41.870588
130,4217000,1.0703,1.3004,1.4373,0.583968,0.37,1151.104473,197.834512,64.415240,178.932813,36.728435
131,4218000,0.8586,1.0934,1.2758,0.668873,0.22,1117.032122,168.475242,43.875742,134.829480,28.986130


In [27]:
# generate new dataset for AP classification (riverflow metrics + structural attributes + climate indices)
stations_list = pd.read_csv(os.path.join(directory, "raw_data", "stations_list.csv")).stations.tolist()

stations_list = sorted(stations_list)


for year in tqdm(range(2011, 2021)):
    df = pd.read_csv(os.path.join(output_dir, "pca_results","func_climate", f"{year}_func_climate_attrs.csv"))
    output_df = pd.concat([generate_struct_crop_df(year), df], axis=1)
    output_df['station_id'] = stations_list
    output_df = output_df[["station_id"] + [col for col in output_df.columns if col != "station_id"]]
    # output_df.to_csv(os.path.join(output_dir, "pca_results","all_attributes", f"{year}_func_struct_climate_attrs.csv"), index=False)
output_df.columns

100%|██████████| 10/10 [00:00<00:00, 21.98it/s]


Index(['station_id', 'mean_topsoil_silt', 'median_topsoil_sand',
       'min_topsoil_sand', 'std_topsoil_clay', 'mean_topsoil_sand',
       'max_topsoil_sand', 'Undefined', 'End moraine', 'Ground moraine',
       'Alluvium', 'Lacustrine', 'Kames and esker', '%forest', '%water',
       '%grassland', '%wetland', '%cropland', 'median_elevation',
       'mean_elevation', 'min_elevation', 'range_elevation', 'max_elevation',
       'max_slope', 'tomatoes', 'oats', 'soyabeans', 'winterwheat',
       'springwheat', 'corn', 'NGM-CVQ-2020', 'YR-CVQ-2020', 'GM-CVQ-2020',
       'Specific-YR-MedianFlow-2020', 'NGM-RBI-2020', 'AnnualTotalPrcp_2020',
       'AnnualMaxSWE_2020', 'AnnualSumOfRainOnSWE_2020',
       'AnnualTotalSnowFall_2020', 'AnnualNumberRainOnSWEdays_2020'],
      dtype='object')

PCA TESTING and AP clustering for Flow metrics.

In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# metrics to drop.
drop_metrics = [
    "Station Name",
    "Country",
    "Watershed-Area",
    "Latitude",
    "Longitude",
    "YR-MaxFlow",
    "GM-MaxFlow",
    "NGM-MaxFlow",
    "YR-MinFlow",
    "GM-MinFlow",
    "NGM-MinFlow",
    "YR-MedianFlow",
    "GM-MedianFlow",
    "NGM-MedianFlow",
    "YR-Q95Flow",
    "GM-Q95Flow",
    "NGM-Q95Flow",
    "YR-Q5Flow",
    "GM-Q5Flow",
    "NGM-Q5Flow",
]


functional_path = os.path.join(directory, "computed_data", "flow_data", "metrics2011.csv")
functional_df = pd.read_csv(functional_path)
functional_df = functional_df.set_index(functional_df.columns[0])
functional_df = functional_df.loc[:, ~functional_df.columns.str.startswith(tuple(drop_metrics))]
functional_df = functional_df.fillna(functional_df.median())

scaler = StandardScaler()
scaled_df = scaler.fit_transform(functional_df)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(scaled_df)

principalDf = pd.DataFrame(data = principalComponents, columns = ['PC1', 'PC2'])

# explained_variance = pca.explained_variance_ratio_
explained_variance_ratio = pca.explained_variance_ratio_
# print("Explained Variance Ratio:", explained_variance_ratio)


cumulative_variance_ratio = explained_variance_ratio.cumsum()
n_components = len(cumulative_variance_ratio[cumulative_variance_ratio <= 0.95])
# print("Number of Principal Components to retain 95% variance:", n_components)


# loadings
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

loadings_df = pd.DataFrame(loadings, columns = ['PC1', 'PC2'], index = functional_df.columns)

pc1 = loadings_df['PC1']


# use the value of pc1 to get the top 5 attributes
topLoadingAttributes_pc1 = pc1.abs().sort_values(ascending=False).head(5).index
topLoadingAttributes_pc2 = loadings_df['PC2'].abs().sort_values(ascending=False).head(2).index

# topLoadingAttributes = list(set(topLoadingAttributes_pc1).union(set(topLoadingAttributes_pc2)))
topLoadingAttributes = [
        "YR-RBI-2011",
        "NGM-RBI-2011",
        "Specific-GM-MedianFlow-2011",
        "Specific-NGM-MedianFlow-2011",
        "YR-CVQ-2011",
        "Specific-YR-MedianFlow-2011",
        "Specific-GM-Q95-2011",
        ]


resultedTopAttributes_df = functional_df[topLoadingAttributes]

# AP
from sklearn.cluster import AffinityPropagation
import warnings
warnings.filterwarnings("ignore")
model = AffinityPropagation(damping=0.9, verbose=2)

# fit the model
resultedTopAttributes_df = resultedTopAttributes_df.fillna(resultedTopAttributes_df.median())
model.fit(resultedTopAttributes_df)
labels = model.labels_

ap_res = {}
ap_res[functional_path.split("/")[-1].split(".")[0]] = list(labels)
result = pd.DataFrame(ap_res).set_index(resultedTopAttributes_df.index)

# min and max values of the labels
print("Min Label:", min(labels), "Max Label:", max(labels))

Converged after 52 iterations.
Min Label: 0 Max Label: 10
