In [5]:
from pca import PCA_Analysis
from dotenv import load_dotenv
import os
import pandas as pd
import glob
from collections import Counter

# Load variables from the .env file, read the two folder settings used by the
# cells below, and create the shared PCA helper.
load_dotenv()
# Root folder of the input data (joined with "structural_attributes" /
# "functional_attributes" below). os.getenv returns None when "path" is unset.
# NOTE(review): environment variable names are case-insensitive on Windows, so
# "path" may resolve to the system PATH there — consider a more specific name.
directory = os.getenv("path")
# Folder the generated CSV files are written to; None when "output_path" is unset.
output_dir = os.getenv("output_path")
# Single PCA_Analysis instance reused by every cell in the notebook.
pca = PCA_Analysis()

Structural attributes (time-invariant)

In [25]:
def structural_pca() -> dict:
    """Run PCA on every time-invariant structural attribute file.

    Each ``*.csv`` file in ``<directory>/structural_attributes/140_TI_variables``
    is restricted to the stations listed in ``stations_list.csv`` (column
    ``stations``) and passed to the shared ``pca`` helper.

    Returns:
        dict: maps the attribute type parsed from each file name (the second
        ``_``-separated token, cut at its first ``.``) to the value returned by
        ``pca.top_attributes(loadings, 5)`` for that file. Keys are inserted in
        sorted file-name order.
    """
    struct_root = os.path.join(directory, "structural_attributes")
    struct_path = os.path.join(struct_root, "140_TI_variables")
    stations_list = pd.read_csv(os.path.join(struct_root, "stations_list.csv")).stations.tolist()

    # os.listdir returns entries in arbitrary order; sorting makes the key order
    # of the returned dict (and any column order derived from it) reproducible.
    struct_files = sorted(name for name in os.listdir(struct_path) if name.endswith(".csv"))

    all_top_load_dfs, all_top_attr = [], {}

    for file in struct_files:
        # e.g. "leb_soil.csv" -> "soil"; a file name without "_" raises IndexError.
        attr_type = file.split("_")[1].split(".")[0]
        input_struct_data = pd.read_csv(os.path.join(struct_path, file))

        # Keep only the stations of interest.
        input_struct_data = input_struct_data[input_struct_data["station_id"].isin(stations_list)]

        # station_id plus every column from position 4 onward.
        # NOTE(review): presumably the first four columns are identifiers/metadata
        # that must not enter the PCA — confirm against the input files.
        struct_data = pd.concat([input_struct_data["station_id"], input_struct_data.iloc[:, 4:]], axis=1)
        struct_data = struct_data.set_index("station_id")

        loadings = pca.loadings(struct_data)

        explained_var = pca.explained_variance(struct_data)
        # NOTE(review): PC1/PC2 are read from positions 1 and 2 of the column;
        # confirm that position 0 of PCA_Analysis.explained_variance is not PC1.
        pc1_val = round(explained_var["Explained Variance"].iloc[1], 2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2], 2)

        top_attr = pca.top_attributes(loadings, 5)
        all_top_attr[attr_type] = top_attr

        # Per-file summary table whose column names carry the explained variance.
        # The renaming requires top_attr to contain no keys other than PC1/PC2.
        new_keys = {'PC1': f'{attr_type}_PC1_{pc1_val}', 'PC2': f'{attr_type}_PC2_{pc2_val}'}
        renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
        all_top_load_dfs.append(pd.DataFrame(renamed_top_attr))

    # all_top_load_dfs is only needed for the optional export, currently disabled:
    # pd.concat(all_top_load_dfs, axis=1).to_csv(
    #     os.path.join(directory, "pca_results", "TI_top_attributes.csv"), index=False)
    return all_top_attr

Crop inventories (time-variable)

In [26]:
def crop_pca() -> dict:
    """Run PCA on every crop inventory file.

    Reads each ``*.csv`` file in
    ``<directory>/structural_attributes/crop_inventories``, uses its first
    column as the index, and passes the frame to the shared ``pca`` helper.

    Returns:
        dict: maps each file's base name (cut at its first ``.``) to the value
        returned by ``pca.top_attributes(loadings, 5)`` for that file.

    Raises:
        ValueError: if the folder contains no ``*.csv`` file.
    """
    crops_path = os.path.join(directory, "structural_attributes", "crop_inventories")
    # glob does not guarantee an order; sort for a reproducible processing order.
    crop_inventories_files = sorted(glob.glob(os.path.join(crops_path, "*.csv")))
    if not crop_inventories_files:
        # Previously surfaced as pandas' "No objects to concatenate" ValueError.
        raise ValueError(f"No crop inventory CSV files found in {crops_path}")

    all_crop_top_load_dfs, all_top_attr = [], {}

    for path in crop_inventories_files:
        # os.path.basename handles the platform's path separator, unlike
        # splitting on "/", which breaks with backslash-separated paths.
        inventory_name = os.path.basename(path).split(".")[0]

        crop_yr_df = pd.read_csv(path)
        crop_yr_df = crop_yr_df.set_index(crop_yr_df.columns[0])

        # crop pca analysis
        loadings = pca.loadings(crop_yr_df)

        explained_var = pca.explained_variance(crop_yr_df)
        # NOTE(review): PC1/PC2 are read from positions 1 and 2 of the column;
        # confirm that position 0 of PCA_Analysis.explained_variance is not PC1.
        pc1_val = round(explained_var["Explained Variance"].iloc[1], 2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2], 2)

        top_attr = pca.top_attributes(loadings, 5)
        all_top_attr[inventory_name] = top_attr

        # Summary table columns carry the dataset name and explained variance.
        # The renaming requires top_attr to contain no keys other than PC1/PC2.
        new_keys = {'PC1': f'{inventory_name}_PC1_{pc1_val}', 'PC2': f'{inventory_name}_PC2_{pc2_val}'}
        renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
        all_crop_top_load_dfs.append(pd.DataFrame(renamed_top_attr))

    # Combined table, kept for the optional export below (currently disabled).
    crop_out_put_df = pd.concat(all_crop_top_load_dfs, axis=1)
    crop_out_put_df = crop_out_put_df.reindex(sorted(crop_out_put_df.columns), axis=1)
    # crop_out_put_df.to_csv(os.path.join(directory, "pca_results", "crop_top_attributes.csv"), index=False)
    return all_top_attr


Combining the time-invariant (TI: soil, land use/land cover, terrain) and time-variable (TV: crop inventory) datasets

In [27]:
# selecting top attributes from computed pca
def select_top_attr(all_top_attr, attr_type, n_first=4, n_second=2) -> list:
    """Pick the leading attributes of the first two components of one PCA result.

    Args:
        all_top_attr: mapping of dataset key -> mapping of component -> ranked
            attribute names (as returned by the ``*_pca`` functions).
        attr_type: key of the dataset to select from.
        n_first: how many attributes to take from the first component (default 4).
        n_second: how many attributes to take from the second component (default 2).

    Returns:
        list: the selected attribute names without duplicates, in ranking order
        (first component first). The order is deterministic, so datasets built
        from it have a stable column order across runs; a plain ``set`` would
        reorder strings differently from one interpreter session to the next.

    Raises:
        KeyError: if ``attr_type`` is not in ``all_top_attr``.
        IndexError: if the result holds fewer than two components.
    """
    # Components are addressed by position, i.e. by the mapping's insertion order.
    ranked_per_component = list(all_top_attr[attr_type].values())

    # Slicing tolerates components with fewer ranked attributes than requested.
    candidates = list(ranked_per_component[0][:n_first]) + list(ranked_per_component[1][:n_second])

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(candidates))

In [28]:
# Generate new datasets with selected attributes
def generate_struct_df() -> pd.DataFrame:
    """Build one table of the PCA-selected structural attributes.

    For every attribute type returned by ``structural_pca()``, reads
    ``leb_<type>.csv`` from ``140_TI_variables``, keeps the stations listed in
    ``stations_list.csv`` and the columns chosen by ``select_top_attr``, and
    joins the pieces column-wise.

    Returns:
        pd.DataFrame: selected attribute columns of all types side by side,
        with a fresh 0..n-1 index (``station_id`` is not included).
    """
    struct_root = os.path.join(directory, "structural_attributes")
    struct_pca = structural_pca()

    # The station list does not depend on the attribute type: read it once
    # instead of once per loop iteration.
    stations_list = pd.read_csv(os.path.join(struct_root, "stations_list.csv")).stations.tolist()

    attr_collection = []
    for key in struct_pca:
        attributes = select_top_attr(struct_pca, key)
        att_file = pd.read_csv(os.path.join(struct_root, "140_TI_variables", f"leb_{key}.csv"))

        stations_df = att_file[att_file["station_id"].isin(stations_list)]
        # Index by station_id so the column-wise concat pairs rows by station
        # rather than by raw row position in each file. When all files list the
        # stations in the same order the result is unchanged.
        attr_collection.append(stations_df.set_index("station_id")[attributes])

    # NOTE(review): callers attach station ids by position afterwards, which
    # presumes the files list stations in the same order as stations_list.csv.
    return pd.concat(attr_collection, axis=1).reset_index(drop=True)

In [29]:
# Generate crop + struct attributes combined dataset.
def generate_struct_crop_df(year:int) -> pd.DataFrame:
    """Combine the selected structural attributes with one year's crop attributes.

    Args:
        year: year of the crop inventory to use; matched against the integer
            after the last ``_`` in each crop inventory file name.

    Returns:
        pd.DataFrame: the frame from ``generate_struct_df()`` with the
        PCA-selected crop columns of ``year`` appended column-wise.

    Raises:
        ValueError: if no crop inventory exists for ``year`` (this previously
            failed with ``UnboundLocalError`` on the return statement), or if a
            crop inventory name does not end in ``_<integer>``.
    """
    crops_pca = crop_pca()

    matching_keys = [key for key in crops_pca if int(key.split("_")[-1]) == year]
    if not matching_keys:
        raise ValueError(f"No crop inventory found for year {year}")
    # If several inventories share the year, the last one wins, as before.
    key = matching_keys[-1]

    attributes = select_top_attr(crops_pca, key)
    path = os.path.join(directory, "structural_attributes", "crop_inventories", f"{key}.csv")
    att_file = pd.read_csv(path)[attributes]

    # Both frames carry a default 0..n-1 index, so rows are paired by position.
    # NOTE(review): assumes the crop file lists the same stations in the same
    # order as the structural table — confirm.
    return pd.concat([generate_struct_df(), att_file], axis=1)

# Preview the combined structural + crop attribute table for 2011.
generate_struct_crop_df(2011)

Unnamed: 0,std_topsoil_silt,median_topsoil_sand,median_topsoil_silt,std_topsoil_sand,std_topsoil_clay,%water,%cropland,%forest,%grassland,%wetland,...,mean_elevation,std_elevation,median_elevation,max_elevation,tomatoes2011,grapes2011,winterwheat2011,soyabeans2011,oats2011,springwheat2011
0,8.170080,26,50,15.565180,8.705036,1.392658,68.998906,15.832885,0.437804,0.664702,...,408.804259,62.273206,411,539,0.0,0.000000,0.000000,9.805613,0.000000,0.000000
1,5.104138,17,50,12.031182,6.927044,0.057743,84.426076,10.223111,0.280158,0.260911,...,456.647797,18.732971,462,508,0.0,0.000000,0.000000,18.084754,0.000000,0.000000
2,5.926256,17,50,13.566371,9.597431,0.549254,80.612989,11.655630,0.220120,0.249661,...,354.024238,36.743025,363,441,0.0,0.000000,0.000000,12.255434,0.000000,0.000000
3,7.847713,26,50,13.724722,8.844053,2.085602,74.802225,15.943270,0.799634,1.717514,...,490.430777,14.320739,487,539,0.0,0.000000,0.000000,7.356624,0.000000,0.000000
4,2.991387,50,36,7.125058,4.134774,0.867342,61.204512,26.616951,0.340075,0.477796,...,385.497491,36.667031,384,506,0.0,0.000000,0.000000,11.329457,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,2.715522,31,56,3.409538,5.111560,0.341540,20.926085,62.425168,0.744922,4.021247,...,391.824850,93.604877,404,598,0.0,0.064596,0.158610,0.230181,0.033966,0.000303
129,3.155841,31,51,4.228345,6.100140,0.862061,38.923298,49.756332,0.727606,4.273898,...,465.915367,70.777109,473,630,0.0,0.000000,0.358078,0.885742,0.077918,0.000000
130,3.521046,31,51,4.476393,4.893305,0.989909,43.565460,39.755651,0.727843,8.006014,...,404.317641,95.489068,407,630,0.0,0.000000,0.567742,1.098605,0.120242,0.000000
131,7.280444,23,51,7.571920,3.999675,0.699812,45.486890,28.492574,0.528546,17.236124,...,314.382768,106.358210,278,630,0.0,0.000673,1.364272,2.435316,0.137531,0.000000


In [176]:

def functional_pca() -> dict:
    """Run PCA on the yearly ("YR") river-flow metrics of every river-flow file.

    Reads each ``*.csv`` file in
    ``<directory>/functional_attributes/133_riverflow``, uses its first column
    as the index, keeps the columns whose name contains ``YR`` except those
    starting with one of the excluded flow prefixes, fills missing values with
    the column median, and passes the frame to the shared ``pca`` helper.

    Returns:
        dict: maps each file's base name (cut at its first ``.``) to the value
        returned by ``pca.top_attributes(loadings, 5)`` for that file.

    Raises:
        ValueError: if the folder contains no ``*.csv`` file.
    """
    functional_path = os.path.join(directory, "functional_attributes", "133_riverflow")
    # glob does not guarantee an order; sort for a reproducible processing order.
    functional_files = sorted(glob.glob(os.path.join(functional_path, "*.csv")))
    if not functional_files:
        # Previously surfaced as pandas' "No objects to concatenate" ValueError.
        raise ValueError(f"No river-flow CSV files found in {functional_path}")

    # Column-name prefixes excluded from the yearly PCA.
    yr_remove_list = ["YR-MaxFlow", "YR-MinFlow", "YR-MedianFlow", "YR-Q95Flow", "YR-Q5Flow"]

    yr_all_top_load_dfs, all_top_attr = [], {}

    for file in functional_files:
        # os.path.basename handles the platform's path separator, unlike
        # splitting on "/", which breaks with backslash-separated paths.
        dataset_name = os.path.basename(file).split(".")[0]

        functional_df = pd.read_csv(file)
        functional_df = functional_df.set_index(functional_df.columns[0])

        # func year
        yr_functional_df = functional_df.loc[:, functional_df.columns.str.contains('YR')]
        yr_functional_df = yr_functional_df.loc[:, ~yr_functional_df.columns.str.startswith(tuple(yr_remove_list))]
        yr_functional_df = yr_functional_df.fillna(yr_functional_df.median())

        # func year pca analysis
        yr_loadings = pca.loadings(yr_functional_df)

        yr_explained_var = pca.explained_variance(yr_functional_df)
        # NOTE(review): PC1/PC2 are read from positions 1 and 2 of the column;
        # confirm that position 0 of PCA_Analysis.explained_variance is not PC1.
        yr_pc1_val = round(yr_explained_var["Explained Variance"].iloc[1], 2)
        yr_pc2_val = round(yr_explained_var["Explained Variance"].iloc[2], 2)

        yr_top_attr = pca.top_attributes(yr_loadings, 5)
        all_top_attr[dataset_name] = yr_top_attr

        # Summary table columns carry the dataset name and explained variance.
        # The renaming requires yr_top_attr to contain no keys other than PC1/PC2.
        new_keys = {'PC1': f'{dataset_name}_PC1_{yr_pc1_val}', 'PC2': f'{dataset_name}_PC2_{yr_pc2_val}'}
        renamed_top_attr = {new_keys[key]: value for key, value in yr_top_attr.items()}
        yr_all_top_load_dfs.append(pd.DataFrame(renamed_top_attr))

    # Combined table, kept for the optional export below (currently disabled).
    yr_out_put_df = pd.concat(yr_all_top_load_dfs, axis=1)
    yr_out_put_df = yr_out_put_df.reindex(sorted(yr_out_put_df.columns), axis=1)
    # yr_out_put_df.to_csv(os.path.join(directory, "pca_results", "yr_func_top_attributes.csv"), index=False)
    return all_top_attr


In [182]:
# generate new dataset for AP classification (riverflow + structural attributes)
# One CSV per year (2011-2020) is written to output_dir as "<year>_struct_flow.csv".

# The functional PCA and the station list do not depend on the year, so they are
# computed once here instead of once per loop iteration.
functional_top_attr = functional_pca()
stations_list = pd.read_csv(os.path.join(directory, "structural_attributes", "stations_list.csv")).stations.tolist()

for i in range(2011, 2021):
    # River-flow columns selected by the PCA of that year's file ("<year>.csv").
    attributes = select_top_attr(functional_top_attr, str(i))
    functional_path = os.path.join(directory, "functional_attributes", "133_riverflow", f"{i}.csv")
    functional_df = pd.read_csv(functional_path)
    functional_df = functional_df[attributes]

    # Structural + crop attributes next to the river-flow attributes; rows are
    # paired by position.
    output_df = pd.concat([generate_struct_crop_df(i), functional_df], axis=1)

    # Station ids are attached by position as well.
    # NOTE(review): this presumes every input lists the stations in the same
    # order as stations_list.csv — confirm.
    output_df = pd.concat([pd.DataFrame(stations_list, columns=["station_id"]), output_df], axis=1).set_index("station_id")
    output_df.to_csv(os.path.join(output_dir, f"{i}_struct_flow.csv"))

Test: PCA on the yearly and seasonal functional metrics

In [23]:
# Ad-hoc check: run the PCA helper on a single 2011 file and print the variance
# explained by the first two components together with their top attributes.
# NOTE(review): hard-coded absolute path to a local Desktop file; this cell only
# runs on the author's machine — consider a path relative to `directory`.
file_path = r"/Users/mugisha/Desktop/2011.csv"
struct_data = pd.read_csv(file_path)

# Fill missing values in each column with that column's median.
# NOTE(review): unlike the PCA functions above, the first column is not moved to
# the index here, so it takes part in the median and the PCA — confirm intended.
struct_data = struct_data.fillna(struct_data.median())

loadings = pca.loadings(struct_data)

explained_var = pca.explained_variance(struct_data)


# NOTE(review): PC1/PC2 are read from positions 1 and 2 of the column; confirm
# that position 0 of PCA_Analysis.explained_variance is not PC1.
pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

# Top 5 attributes per component, as ranked by the PCA helper.
top_attr = pca.top_attributes(loadings, 5)

print(f"PC1: {pc1_val}, PC2: {pc2_val}")
print(f"top_attr {top_attr}")


# all_top_attr[attr_type] = top_attr
# new_keys = {'PC1': f'{attr_type}_PC1_{pc1_val}', 'PC2': f'{attr_type}_PC2_{pc2_val}'}
# renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
# top_load_df = pd.DataFrame(renamed_top_attr)
# all_top_load_dfs.append(top_load_df)

PC1: 0.39, PC2: 0.21
top_attr {'PC1': ['Specific-NGM-MedianFlow-2011', 'Specific-YR-MedianFlow-2011', 'Specific-GM-MedianFlow-2011', 'Specific-NGM-MinFlow-2011', 'Specific-GM-Q95-2011'], 'PC2': ['NGM-CVQ-2011', 'YR-CVQ-2011', 'YR-RBI-2011', 'GM-RBI-2011', 'NGM-RBI-2011']}


Archive: earlier functional-attribute PCA (yearly and seasonal), kept commented out for reference

In [6]:
# functional_path = os.path.join(directory, "functional_attributes/133_riverflow")
# functional_files = glob.glob(f"{functional_path}/*.csv")

# yr_func_dict, szn_func_dict = {}, {}
# yr_all_top_load_dfs, szn_all_top_load_dfs = [], []

# for file in functional_files:
#     functional_df = pd.read_csv(file)
#     functional_df = functional_df.set_index(functional_df.columns[0])

#     # func year
#     yr_functional_df = functional_df.loc[:, functional_df.columns.str.contains('YR')]
#     yr_remove_list = ["YR-MaxFlow", "YR-MinFlow","YR-MedianFlow","YR-Q95Flow","YR-Q5Flow"]
#     yr_functional_df = yr_functional_df.loc[:, ~yr_functional_df.columns.str.startswith(tuple(yr_remove_list))]
#     yr_functional_df = yr_functional_df.fillna(yr_functional_df.median())

#     # func seasonal
#     szn_functional_df = functional_df.loc[:, functional_df.columns.str.contains('GM|NGM')]
#     szn_remove_list = ["GM-MaxFlow", "GM-MinFlow","GM-MedianFlow","GM-Q95Flow","GM-Q5Flow",
#                         "NGM-MaxFlow", "NGM-MinFlow","NGM-MedianFlow","NGM-Q95Flow","NGM-Q5Flow"]
#     szn_functional_df = szn_functional_df.loc[:, ~szn_functional_df.columns.str.startswith(tuple(szn_remove_list))]
#     szn_functional_df = szn_functional_df.fillna(szn_functional_df.median())

#     # func year pca analysis
#     yr_pca_df = pca.pca_analysis(yr_functional_df)
#     yr_loadings = pca.loadings(yr_functional_df)

#     yr_explained_var = pca.explained_variance(yr_functional_df)
#     yr_pc1_val = round(yr_explained_var["Explained Variance"].iloc[1],2)
#     yr_pc2_val = round(yr_explained_var["Explained Variance"].iloc[2],2)

#     yr_top_attr = pca.top_attributes(yr_loadings, 5)
#     new_keys = {'PC1': f'{file.split("/")[-1].split(".")[0]}_PC1_{yr_pc1_val}', 'PC2': f'{file.split("/")[-1].split(".")[0]}_PC2_{yr_pc2_val}'}
#     renamed_top_attr = {new_keys[key]: value for key, value in yr_top_attr.items()}
#     yr_top_load_df = pd.DataFrame(renamed_top_attr)
#     yr_all_top_load_dfs.append(yr_top_load_df)

#     # func seasonal pca analysis
#     szn_pca_df = pca.pca_analysis(szn_functional_df)
#     szn_loadings = pca.loadings(szn_functional_df)

#     szn_explained_var = pca.explained_variance(szn_functional_df)
#     szn_pc1_val = round(szn_explained_var["Explained Variance"].iloc[1],2)
#     szn_pc2_val = round(szn_explained_var["Explained Variance"].iloc[2],2)

#     szn_top_attr = pca.top_attributes(szn_loadings, 5)
#     new_keys = {'PC1': f'{file.split("/")[-1].split(".")[0]}_PC1_{szn_pc1_val}', 'PC2': f'{file.split("/")[-1].split(".")[0]}_PC2_{szn_pc2_val}'}
#     renamed_top_attr = {new_keys[key]: value for key, value in szn_top_attr.items()}
#     szn_top_load_df = pd.DataFrame(renamed_top_attr)
#     szn_all_top_load_dfs.append(szn_top_load_df)

# yr_out_put_df = pd.concat(yr_all_top_load_dfs, axis = 1)
# yr_out_put_df = yr_out_put_df.reindex(sorted(yr_out_put_df.columns), axis=1)
# yr_out_put_df.to_csv(os.path.join(directory, "pca_results", "yr_func_top_attributes.csv"), index=False)

# szn_out_put_df = pd.concat(szn_all_top_load_dfs, axis = 1)
# szn_out_put_df = szn_out_put_df.reindex(sorted(szn_out_put_df.columns), axis=1)
# szn_out_put_df.to_csv(os.path.join(directory, "pca_results", "szn_func_top_attributes.csv"), index=False)