In [1]:
from pca import PCA_Analysis
from dotenv import load_dotenv
import os
import pandas as pd
import glob
from tqdm import tqdm

# load .env file and pca class
load_dotenv()
directory = os.getenv("path")
output_dir = os.getenv("output_dir")
pca = PCA_Analysis()

structural attributes (Time-invariant)

In [None]:
def structural_pca() -> dict:
    struct_path = os.path.join(directory, "data","raw_datasets", "structural_attributes", "140_TI_variables")
    stations_list = pd.read_csv(os.path.join(directory,  "data","raw_datasets", "structural_attributes", "stations_list.csv")).stations.tolist()


    struct_files = os.listdir(struct_path)
    struct_files = [file for file in struct_files if file.endswith(".csv")]

    all_top_load_dfs, all_top_attr = [], {}

    for file in struct_files:
        attr_type = file.split("_")[1].split(".")[0]
        input_struct_data = pd.read_csv(os.path.join(struct_path, file))

        input_struct_data = input_struct_data[input_struct_data["station_id"].isin(stations_list)]

        struct_data = pd.concat([input_struct_data["station_id"], input_struct_data.iloc[:, 4:]], axis = 1)
        struct_data = struct_data.set_index("station_id")

        # data = pca.pca_analysis(struct_data)
        loadings = pca.loadings(struct_data)

        explained_var = pca.explained_variance(struct_data)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_attr = pca.top_attributes(loadings, 5)
        all_top_attr[attr_type] = top_attr
        new_keys = {'PC1': f'{attr_type}_PC1_{pc1_val}', 'PC2': f'{attr_type}_PC2_{pc2_val}'}
        renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
        top_load_df = pd.DataFrame(renamed_top_attr)
        all_top_load_dfs.append(top_load_df)
    # return  pd.concat(all_top_load_dfs, axis = 1)
    # out_put_df.to_csv(os.path.join(directory, "pca_results", "TI_top_attributes.csv"), index=False)
    return all_top_attr

structural_pca()

crop inventories (Time-variable)

In [None]:
def crop_pca() -> dict:
    crops_path = os.path.join(directory, "data", "raw_datasets", "structural_attributes", "crop_inventories")

    crop_inventories_files = glob.glob(f"{crops_path}/*.csv")

    all_crop_top_load_dfs, all_top_attr  = [], {}

    for path in crop_inventories_files:
        crop_yr_df = pd.read_csv(path)
        crop_yr_df = crop_yr_df.set_index(crop_yr_df.columns[0])

        # crop pca analysis
        # pca_df = pca.pca_analysis(crop_yr_df)
        loadings = pca.loadings(crop_yr_df)

        explained_var = pca.explained_variance(crop_yr_df)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_attr = pca.top_attributes(loadings, 5)
        all_top_attr[path.split("/")[-1].split(".")[0]] = top_attr
        new_keys = {'PC1': f'{path.split("/")[-1].split(".")[0]}_PC1_{pc1_val}', 'PC2': f'{path.split("/")[-1].split(".")[0]}_PC2_{pc2_val}'}
        renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
        top_load_df = pd.DataFrame(renamed_top_attr)
        all_crop_top_load_dfs.append(top_load_df)

    # crop_out_put_df = pd.concat(all_crop_top_load_dfs, axis = 1)
    # crop_out_put_df = crop_out_put_df.reindex(sorted(crop_out_put_df.columns), axis=1)
    # crop_out_put_df.to_csv(os.path.join(directory, "pca_results", "crop_top_attributes.csv"), index=False)

    return all_top_attr

print(crop_pca())

Combining TI (soil, lucl, terrain) and TV (crop inventory) datasets 

In [4]:
# selecting top attributes from computed pca
def select_top_attr(all_top_attr, attribute) -> list:
    selected_attr = []
    top_attr = all_top_attr[attribute]
    attrs_list = list(top_attr.values())

    for i in range(4):
        selected_attr.append(attrs_list[0][i])
    for i in range(2):
        selected_attr.append(attrs_list[1][i])
    return list(set(selected_attr))

In [5]:
# Generate new datasets with selected attributes
def generate_struct_df() -> pd.DataFrame:
    attr_collection = []
    struct_pca = structural_pca()
    for key in struct_pca.keys():
        attributes = select_top_attr(struct_pca, key)
        path = os.path.join(directory, "data", "raw_datasets", "structural_attributes", "140_TI_variables", f"leb_{key}.csv")
        stations_list = pd.read_csv(os.path.join(directory,  "data","raw_datasets", "structural_attributes", "stations_list.csv")).stations.tolist()
        # input_struct_data = input_struct_data[input_struct_data["station_id"].isin(stations_list)]

        att_file = pd.read_csv(path)
        _133_stations_df = att_file[att_file["station_id"].isin(stations_list)]
        _133_stations_df = _133_stations_df[attributes]
        attr_collection.append(_133_stations_df)

    return pd.concat(attr_collection, axis=1).reset_index(drop=True)

In [None]:
# Generate crop + struct attributes combined dataset.
def generate_struct_crop_df(year:int) -> pd.DataFrame:
    crops_pca = crop_pca()

    for key in crops_pca.keys():
        if year == int(key.split("_")[-1]):
            attributes = select_top_attr(crops_pca, key)
            path = os.path.join(directory, "data", "raw_datasets", "structural_attributes", "crop_inventories", f"{key}.csv")
            attr_file = pd.read_csv(path)
            attr_file = attr_file[attributes]
            output_df = pd.concat([generate_struct_df(), attr_file], axis=1)
        else:
            pass

    return output_df

generate_struct_crop_df(2011)

feature engineer (functional attributes - riverflow metrics)

In [None]:
# metrics to drop.
drop_metrics = [
    "Station Name",
    "Country",
    "Watershed-Area",
    "Latitude",
    "Longitude",
    "YR-MaxFlow",
    "GM-MaxFlow",
    "NGM-MaxFlow",
    "YR-MinFlow",
    "GM-MinFlow",
    "NGM-MinFlow",
    "YR-MedianFlow",
    "GM-MedianFlow",
    "NGM-MedianFlow",
    "YR-Q95Flow",
    "GM-Q95Flow",
    "NGM-Q95Flow",
    "YR-Q5Flow",
    "GM-Q5Flow",
    "NGM-Q5Flow",
]

def functional_pca(drop_metrics: list = drop_metrics):
    functional_path = os.path.join(directory, "data", "raw_datasets", "functional_attributes", "133_riverflow")
    functional_files = glob.glob(f"{functional_path}/*.csv")

    all_top_load_dfs, all_top_metrics = [], {}

    for file in functional_files:
        df = pd.read_csv(file)
        df = df.set_index(df.columns[0])

        # drop metrics and checking for missing values.
        functional_df = df.loc[:, ~df.columns.str.startswith(tuple(drop_metrics))]
        functional_df = functional_df.fillna(functional_df.median())

        # pca analysis start here.
        # pca_df = pca.pca_analysis(functional_df)
        pca_loadings = pca.loadings(functional_df)

        explained_var = pca.explained_variance(functional_df)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_metrics = pca.top_attributes(pca_loadings, 5)
        all_top_metrics[file.split("/")[-1].split(".")[0]] = top_metrics
        new_keys = {'PC1': f'{file.split("/")[-1].split(".")[0]}_PC1_{pc1_val}', 'PC2': f'{file.split("/")[-1].split(".")[0]}_PC2_{pc2_val}'}
        renamed_top_metrics = {new_keys[key]: value for key, value in top_metrics.items()}
        yr_top_load_df = pd.DataFrame(renamed_top_metrics)
        all_top_load_dfs.append(yr_top_load_df)

    # out_put_df = pd.concat(all_top_load_dfs, axis = 1)
    # out_put_df =out_put_df.reindex(sorted(out_put_df.columns), axis=1)

    return all_top_metrics

functional_pca()


In [None]:
# pca for climate indices
def climate_pca():
    climate_path = os.path.join(directory, "data", "raw_datasets", "climate_indices")
    climate_files = glob.glob(f"{climate_path}/*.csv")

    all_top_load_dfs, all_top_metrics = [], {}

    for file in climate_files:
        df = pd.read_csv(file)
        df.columns.values[0] = "station_id"
        df = df.set_index(df.columns[0])

        # drop metrics and checking for missing values.
        climate_df = df.fillna(df.median())

    #     # pca analysis start here.
    #     # pca_df = pca.pca_analysis(climate_df)
        pca_loadings = pca.loadings(climate_df)

        explained_var = pca.explained_variance(climate_df)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_metrics = pca.top_attributes(pca_loadings, 5)
        all_top_metrics[file.split("/")[-1].split(".")[0]] = top_metrics
        new_keys = {'PC1': f'{file.split("/")[-1].split(".")[0]}_PC1_{pc1_val}', 'PC2': f'{file.split("/")[-1].split(".")[0]}_PC2_{pc2_val}'}
        renamed_top_metrics = {new_keys[key]: value for key, value in top_metrics.items()}
        yr_top_load_df = pd.DataFrame(renamed_top_metrics)
        all_top_load_dfs.append(yr_top_load_df)

    # out_put_df = pd.concat(all_top_load_dfs, axis = 1)
    # out_put_df =out_put_df.reindex(sorted(out_put_df.columns), axis=1)

    return all_top_metrics

climate_pca()

In [None]:
# generate new dataset for AP classification (riverflows metrics - year and seasonal).
def selected_func_metrics() -> dict:
    stations_list = pd.read_csv(
        os.path.join(
            directory, "data", "raw_datasets", "structural_attributes", "stations_list.csv"
        )
    ).stations.tolist()

    func_metrics = {}

    for i in range(2011, 2021):
        attributes = select_top_attr(functional_pca(), str(i))

        functional_path = os.path.join(
            directory, "data", "raw_datasets", "functional_attributes", "133_riverflow", f'{i}.csv'
        )
        functional_df = pd.read_csv(functional_path)
        functional_df = functional_df[attributes]
        functional_df['station_id'] = stations_list
        functional_df = functional_df[["station_id"] + [col for col in functional_df.columns if col != "station_id"]]
        func_metrics[str(i)] = functional_df
        dir = os.path.join(output_dir, "func")
        functional_df.to_csv(dir +  f"/{i}_func_metrics.csv", index=False)

    return func_metrics

selected_func_metrics()

In [None]:
# generate new dataset for AP classification (riverflows metrics - climate indices).
def selected_func_climate_attrs() -> pd.DataFrame:
    for i in tqdm(range(2011, 2021)):
        df = pd.read_csv(
            os.path.join(
                directory,
                "data",
                "raw_datasets",
                "climate_indices",
                f"climate_indices_{str(i)}.csv",
            )
        )

        stations_list = pd.read_csv(
        os.path.join(
                directory, "data", "raw_datasets", "structural_attributes", "stations_list.csv"
            )
        ).stations.tolist()

        stations_list = sorted(stations_list)

        df.columns.values[0] = "station_id"
        df['station_id'] = df['station_id'].apply(lambda val: val.lstrip("0") if val.startswith("04") else val)

        climate_df = df[df["station_id"].isin(stations_list)]

        # sort the climate_df
        climate_df = climate_df.sort_values("station_id")
        climate_df.index = range(len(climate_df))

        attributes = select_top_attr(climate_pca(), f"climate_indices_{str(i)}")
        climate_df = climate_df[attributes]

        output_df = pd.concat([selected_func_metrics()[str(i)], climate_df], axis=1)

        output_df = output_df[["station_id"] + [col for col in output_df.columns if col != "station_id"]]

        # output_df.to_csv(os.path.join(output_dir, "func_climate", f"{i}_func_climate_attrs.csv"), index=False)
        output_df.to_csv(
            os.path.join(output_dir, "func_climate", f"{i}_func_climate_attrs.csv")
        )

    return output_df

selected_func_climate_attrs()

In [None]:
# generate new dataset for AP classification (riverflow metrics + structural attributes)
def selected_func_struct_attrs() -> dict:
    func_struct_attrs = {}
    for i in range(2011, 2021):
        functional_df = selected_func_metrics()[str(i)]
        output_df = pd.concat([generate_struct_crop_df(i), functional_df], axis=1)
        output_df = output_df[
            ["station_id"] + [col for col in output_df.columns if col != "station_id"]
        ]
        func_struct_attrs[str(i)] = output_df

    return func_struct_attrs

selected_func_struct_attrs()

In [78]:
# generate new dataset for AP classification (riverflow metrics + structural attributes + climate indices)
def selected_func_struct_climate_attrs() -> pd.DataFrame:
    for i in tqdm(range(2011, 2021)):
        df = pd.read_csv(
            os.path.join(
                directory,
                "data",
                "raw_datasets",
                "climate_indices",
                f"climate_indices_{str(i)}.csv",
            )
        )

        stations_list = pd.read_csv(
        os.path.join(
                directory, "data", "raw_datasets", "structural_attributes", "stations_list.csv"
            )
        ).stations.tolist()

        stations_list = sorted(stations_list)

        df.columns.values[0] = "station_id"
        df['station_id'] = df['station_id'].apply(lambda val: val.lstrip("0") if val.startswith("04") else val)

        climate_df = df[df["station_id"].isin(stations_list)]

        # sort the climate_df
        climate_df = climate_df.sort_values("station_id")
        climate_df.index = range(len(climate_df))

        attributes = select_top_attr(climate_pca(), f"climate_indices_{str(i)}")
        climate_df = climate_df[attributes]

        output_df = pd.concat(
            [selected_func_struct_attrs()[str(i)], climate_df], axis=1)

        output_df = output_df[
            ["station_id"] + [col for col in output_df.columns if col != "station_id"]
        ]

        output_df.to_csv(os.path.join(output_dir, "all_attributes", f"{i}_func_struct_climate_attrs.csv"), index=False)

    return output_df

# print(selected_func_struct_climate_attrs())
selected_func_struct_climate_attrs()

100%|██████████| 10/10 [01:12<00:00,  7.25s/it]
