In [6]:
from pca import PCA_Analysis 
from dotenv import load_dotenv
import os
import pandas as pd
import glob
from collections import Counter

# Load the .env file into the process environment and read the data root that
# every path in this notebook is built from.
load_dotenv()
directory = os.getenv("path")
# NOTE(review): the variable is literally named "path". On Windows, os.environ
# upper-cases keys, so this lookup resolves to the system PATH; python-dotenv
# presumably does not override an already-set variable by default -- confirm,
# or consider a more specific variable name.
if directory is None:
    # Without this check the failure only surfaces later as an opaque
    # TypeError raised from inside os.path.join(None, ...).
    raise RuntimeError(
        'Environment variable "path" is not set; define it in the .env file '
        "so it points at the data directory."
    )

# Single PCA_Analysis instance shared by all analysis cells.
pca = PCA_Analysis()

In [21]:
# helper: tally how often each attribute prefix occurs
def count_most_freq(df: pd.DataFrame) -> dict:
    """Count the entries of ``df['attrs']`` after cutting each at its first '2'.

    Every value in the ``attrs`` column is split on the character ``'2'`` and
    only the part before the first occurrence is kept (the whole value when it
    contains no ``'2'``).
    NOTE(review): presumably this strips a trailing year such as ``2019`` from
    an attribute name -- confirm against the input data.

    Args:
        df: DataFrame with an ``attrs`` column of strings.

    Returns:
        A ``collections.Counter`` (a ``dict`` subclass) mapping each truncated
        value to the number of times it occurs.
    """
    prefixes = (entry.split('2')[0] for entry in df['attrs'])
    return Counter(prefixes)

structural attributes (Time-invariant)

In [4]:
# Directory holding the CSV files of time-invariant structural attributes.
struct_path = os.path.join(directory, "structural_attributes", "140_TI_variables")
# os.listdir returns entries in arbitrary order; sorting keeps the column order
# of the output file stable from run to run.
struct_files = sorted(file for file in os.listdir(struct_path) if file.endswith(".csv"))

# One DataFrame of top-loading attributes per input file.
all_top_load_dfs = []

for file in struct_files:
    # Label taken from the file name: the second "_"-separated token, cut at
    # its first ".". Raises IndexError if the name contains no underscore.
    attr_type = file.split("_")[1].split(".")[0]
    input_struct_data = pd.read_csv(os.path.join(struct_path, file))
    # Keep station_id plus every column from position 4 onwards, then index by
    # station. NOTE(review): presumably the first four columns are identifiers
    # or metadata rather than attributes -- confirm.
    struct_data = pd.concat([input_struct_data["station_id"], input_struct_data.iloc[:, 4:]], axis = 1)
    struct_data = struct_data.set_index("station_id")

    # `data` is not used below; the call is kept in case pca_analysis has side
    # effects on the shared `pca` object.
    data = pca.pca_analysis(struct_data)
    loadings = pca.loadings(struct_data)

    explained_var = pca.explained_variance(struct_data)
    # NOTE(review): PC1/PC2 are read from rows 1 and 2, not 0 and 1. This
    # assumes row 0 of the explained-variance table is not PC1 -- confirm
    # against PCA_Analysis.explained_variance.
    pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
    pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

    top_attr = pca.top_attributes(loadings, 5)
    # Rename the 'PC1'/'PC2' keys so every output column carries the attribute
    # group and the rounded explained-variance value.
    new_keys = {'PC1': f'{attr_type}_PC1_{pc1_val}', 'PC2': f'{attr_type}_PC2_{pc2_val}'}
    renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
    top_load_df = pd.DataFrame(renamed_top_attr)
    all_top_load_dfs.append(top_load_df)

# Place the per-file results side by side and write them out; the results
# folder is created first so to_csv does not fail on a fresh data directory.
out_put_df = pd.concat(all_top_load_dfs, axis = 1)
ti_results_dir = os.path.join(directory, "pca_results")
os.makedirs(ti_results_dir, exist_ok=True)
out_put_df.to_csv(os.path.join(ti_results_dir, "TI_top_attributes.csv"), index=False)

# plot the explained variance (example)
# pca.pca_plot(explained_var)


crop inventories (Time-variable)

In [4]:
# Directory holding the crop-inventory CSV files.
crops_path = os.path.join(directory, "structural_attributes/crop_inventories")
crop_inventories_files = glob.glob(f"{crops_path}/*.csv")

# One DataFrame of top-loading attributes per input file.
all_crop_top_load_dfs = []

for path in crop_inventories_files:
    crop_yr_df = pd.read_csv(path)
    # The first column of each file becomes the row index.
    crop_yr_df = crop_yr_df.set_index(crop_yr_df.columns[0])

    # File name cut at its first ".", used to label the output columns.
    # os.path.basename is used instead of splitting on "/" because glob returns
    # paths with the OS-native separator.
    crop_label = os.path.basename(path).split(".")[0]

    # crop pca analysis
    # `pca_df` is not used below; the call is kept in case pca_analysis has
    # side effects on the shared `pca` object.
    pca_df = pca.pca_analysis(crop_yr_df)
    loadings = pca.loadings(crop_yr_df)

    explained_var = pca.explained_variance(crop_yr_df)
    # NOTE(review): PC1/PC2 are read from rows 1 and 2, not 0 and 1. This
    # assumes row 0 of the explained-variance table is not PC1 -- confirm
    # against PCA_Analysis.explained_variance.
    pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
    pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

    top_attr = pca.top_attributes(loadings, 5)
    # Rename the 'PC1'/'PC2' keys so every output column carries the source
    # file label and the rounded explained-variance value.
    new_keys = {'PC1': f'{crop_label}_PC1_{pc1_val}', 'PC2': f'{crop_label}_PC2_{pc2_val}'}
    renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
    top_load_df = pd.DataFrame(renamed_top_attr)
    all_crop_top_load_dfs.append(top_load_df)

# Place the per-file results side by side and order the columns by name, so the
# output does not depend on the order in which glob returned the files.
crop_out_put_df = pd.concat(all_crop_top_load_dfs, axis = 1)
crop_out_put_df = crop_out_put_df.reindex(sorted(crop_out_put_df.columns), axis=1)
# Create the results folder first so to_csv does not fail on a fresh data directory.
crop_results_dir = os.path.join(directory, "pca_results")
os.makedirs(crop_results_dir, exist_ok=True)
crop_out_put_df.to_csv(os.path.join(crop_results_dir, "crop_top_attributes.csv"), index=False)


# count the most frequent attributes
# most_freq_crop_attr = pd.read_csv(f"{directory}/count.csv")
# count_most_freq(most_freq_crop_attr)


functional attributes (river flow: yearly and seasonal)

In [6]:
# Directory holding the river-flow (functional attribute) CSV files.
functional_path = os.path.join(directory, "functional_attributes/133_riverflow")
functional_files = glob.glob(f"{functional_path}/*.csv")

# NOTE(review): these two dicts are never filled in this cell; kept in case
# another cell reads them.
yr_func_dict, szn_func_dict = {}, {}
yr_all_top_load_dfs, szn_all_top_load_dfs = [], []

# Column-name prefixes dropped before the PCA. They do not depend on the file,
# so they are defined once here rather than on every loop iteration.
yr_remove_list = ["YR-MaxFlow", "YR-MinFlow","YR-MedianFlow","YR-Q95Flow","YR-Q5Flow"]
szn_remove_list = ["GM-MaxFlow", "GM-MinFlow","GM-MedianFlow","GM-Q95Flow","GM-Q5Flow",
                    "NGM-MaxFlow", "NGM-MinFlow","NGM-MedianFlow","NGM-Q95Flow","NGM-Q5Flow"]

for file in functional_files:
    functional_df = pd.read_csv(file)
    # The first column of each file becomes the row index.
    functional_df = functional_df.set_index(functional_df.columns[0])

    # File name cut at its first ".", used to label the output columns.
    # os.path.basename is used instead of splitting on "/" because glob returns
    # paths with the OS-native separator.
    file_label = os.path.basename(file).split(".")[0]

    # func year: columns whose name contains 'YR', minus the excluded prefixes;
    # missing values are replaced by the column median.
    yr_functional_df = functional_df.loc[:, functional_df.columns.str.contains('YR')]
    yr_functional_df = yr_functional_df.loc[:, ~yr_functional_df.columns.str.startswith(tuple(yr_remove_list))]
    yr_functional_df = yr_functional_df.fillna(yr_functional_df.median())

    # func seasonal: columns whose name matches 'GM|NGM', minus the excluded
    # prefixes; missing values are replaced by the column median.
    szn_functional_df = functional_df.loc[:, functional_df.columns.str.contains('GM|NGM')]
    szn_functional_df = szn_functional_df.loc[:, ~szn_functional_df.columns.str.startswith(tuple(szn_remove_list))]
    szn_functional_df = szn_functional_df.fillna(szn_functional_df.median())

    # func year pca analysis
    # `yr_pca_df` is not used below; the call is kept in case pca_analysis has
    # side effects on the shared `pca` object.
    yr_pca_df = pca.pca_analysis(yr_functional_df)
    yr_loadings = pca.loadings(yr_functional_df)

    yr_explained_var = pca.explained_variance(yr_functional_df)
    # NOTE(review): PC1/PC2 are read from rows 1 and 2, not 0 and 1. This
    # assumes row 0 of the explained-variance table is not PC1 -- confirm
    # against PCA_Analysis.explained_variance.
    yr_pc1_val = round(yr_explained_var["Explained Variance"].iloc[1],2)
    yr_pc2_val = round(yr_explained_var["Explained Variance"].iloc[2],2)

    yr_top_attr = pca.top_attributes(yr_loadings, 5)
    # Rename the 'PC1'/'PC2' keys so every output column carries the source
    # file label and the rounded explained-variance value.
    new_keys = {'PC1': f'{file_label}_PC1_{yr_pc1_val}', 'PC2': f'{file_label}_PC2_{yr_pc2_val}'}
    renamed_top_attr = {new_keys[key]: value for key, value in yr_top_attr.items()}
    yr_top_load_df = pd.DataFrame(renamed_top_attr)
    yr_all_top_load_dfs.append(yr_top_load_df)

    # func seasonal pca analysis (same steps as the yearly analysis)
    szn_pca_df = pca.pca_analysis(szn_functional_df)
    szn_loadings = pca.loadings(szn_functional_df)

    szn_explained_var = pca.explained_variance(szn_functional_df)
    szn_pc1_val = round(szn_explained_var["Explained Variance"].iloc[1],2)
    szn_pc2_val = round(szn_explained_var["Explained Variance"].iloc[2],2)

    szn_top_attr = pca.top_attributes(szn_loadings, 5)
    new_keys = {'PC1': f'{file_label}_PC1_{szn_pc1_val}', 'PC2': f'{file_label}_PC2_{szn_pc2_val}'}
    renamed_top_attr = {new_keys[key]: value for key, value in szn_top_attr.items()}
    szn_top_load_df = pd.DataFrame(renamed_top_attr)
    szn_all_top_load_dfs.append(szn_top_load_df)

# Create the results folder first so to_csv does not fail on a fresh data directory.
func_results_dir = os.path.join(directory, "pca_results")
os.makedirs(func_results_dir, exist_ok=True)

# Place the per-file results side by side and order the columns by name, so the
# output does not depend on the order in which glob returned the files.
yr_out_put_df = pd.concat(yr_all_top_load_dfs, axis = 1)
yr_out_put_df = yr_out_put_df.reindex(sorted(yr_out_put_df.columns), axis=1)
yr_out_put_df.to_csv(os.path.join(func_results_dir, "yr_func_top_attributes.csv"), index=False)

szn_out_put_df = pd.concat(szn_all_top_load_dfs, axis = 1)
szn_out_put_df = szn_out_put_df.reindex(sorted(szn_out_put_df.columns), axis=1)
szn_out_put_df.to_csv(os.path.join(func_results_dir, "szn_func_top_attributes.csv"), index=False)