In [1]:
from pca import PCA_Analysis
from dotenv import load_dotenv
import os
import pandas as pd
import glob
from tqdm import tqdm

# load .env file and pca class
load_dotenv()
directory = os.getenv("path")
output_dir = os.getenv("output_dir")
pca = PCA_Analysis()

structural attributes (Time-invariant)

In [2]:
def structural_pca() -> dict:
    struct_path = os.path.join(directory, "data","raw_datasets", "structural_attributes", "140_TI_variables")
    stations_list = pd.read_csv(os.path.join(directory,  "data","raw_datasets", "structural_attributes", "stations_list.csv")).stations.tolist()


    struct_files = os.listdir(struct_path)
    struct_files = [file for file in struct_files if file.endswith(".csv")]

    all_top_load_dfs, all_top_attr = [], {}

    for file in struct_files:
        attr_type = file.split("_")[1].split(".")[0]
        input_struct_data = pd.read_csv(os.path.join(struct_path, file))

        input_struct_data = input_struct_data[input_struct_data["station_id"].isin(stations_list)]

        struct_data = pd.concat([input_struct_data["station_id"], input_struct_data.iloc[:, 4:]], axis = 1)
        struct_data = struct_data.set_index("station_id")

        # data = pca.pca_analysis(struct_data)
        loadings = pca.loadings(struct_data)

        explained_var = pca.explained_variance(struct_data)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_attr = pca.top_attributes(loadings, 5)
        all_top_attr[attr_type] = top_attr
        new_keys = {'PC1': f'{attr_type}_PC1_{pc1_val}', 'PC2': f'{attr_type}_PC2_{pc2_val}'}
        renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
        top_load_df = pd.DataFrame(renamed_top_attr)
        all_top_load_dfs.append(top_load_df)
    # return  pd.concat(all_top_load_dfs, axis = 1)
    # out_put_df.to_csv(os.path.join(directory, "pca_results", "TI_top_attributes.csv"), index=False)
    return all_top_attr

structural_pca()

{'soil': {'PC1': ['std_topsoil_sand',
   'median_topsoil_sand',
   'std_topsoil_silt',
   'median_topsoil_silt',
   'std_topsoil_clay'],
  'PC2': ['std_topsoil_clay',
   'median_topsoil_sand',
   'median_soil_depth',
   'std_topsoil_sand',
   'std_topsoil_silt']},
 'lulcstats': {'PC1': ['%cropland',
   '%grassland',
   '%wetland',
   '%water',
   '%urban'],
  'PC2': ['%forest', '%grassland', '%water', '%urban', '%wetland']},
 'terrain': {'PC1': ['max_elevation',
   'range_elevation',
   'std_elevation',
   'mean_elevation',
   'median_elevation'],
  'PC2': ['median_elevation',
   'mean_elevation',
   'max_slope',
   'range_elevation',
   'std_elevation']}}

crop inventories (Time-variable)

In [3]:
def crop_pca() -> dict:
    crops_path = os.path.join(directory, "data", "raw_datasets", "structural_attributes", "crop_inventories")

    crop_inventories_files = glob.glob(f"{crops_path}/*.csv")

    all_crop_top_load_dfs, all_top_attr  = [], {}

    for path in crop_inventories_files:
        crop_yr_df = pd.read_csv(path)
        crop_yr_df = crop_yr_df.set_index(crop_yr_df.columns[0])

        # crop pca analysis
        # pca_df = pca.pca_analysis(crop_yr_df)
        loadings = pca.loadings(crop_yr_df)

        explained_var = pca.explained_variance(crop_yr_df)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_attr = pca.top_attributes(loadings, 5)
        all_top_attr[path.split("/")[-1].split(".")[0]] = top_attr
        new_keys = {'PC1': f'{path.split("/")[-1].split(".")[0]}_PC1_{pc1_val}', 'PC2': f'{path.split("/")[-1].split(".")[0]}_PC2_{pc2_val}'}
        renamed_top_attr = {new_keys[key]: value for key, value in top_attr.items()}
        top_load_df = pd.DataFrame(renamed_top_attr)
        all_crop_top_load_dfs.append(top_load_df)

    # crop_out_put_df = pd.concat(all_crop_top_load_dfs, axis = 1)
    # crop_out_put_df = crop_out_put_df.reindex(sorted(crop_out_put_df.columns), axis=1)
    # crop_out_put_df.to_csv(os.path.join(directory, "pca_results", "crop_top_attributes.csv"), index=False)

    return all_top_attr

print(crop_pca())

{'crop_inventory_2012': {'PC1': ['grapes2012', 'springwheat2012', 'oats2012', 'tomatoes2012', 'sod2012'], 'PC2': ['soyabeans2012', 'winterwheat2012', 'tomatoes2012', 'corn2012', 'springwheat2012']}, 'crop_inventory_2013': {'PC1': ['grapes2013', 'springwheat2013', 'tomatoes2013', 'fallow2013', 'winterwheat2013'], 'PC2': ['soyabeans2013', 'corn2013', 'winterwheat2013', 'fallow2013', 'tomatoes2013']}, 'crop_inventory_2011': {'PC1': ['oats2011', 'springwheat2011', 'grapes2011', 'tomatoes2011', 'peas2011'], 'PC2': ['soyabeans2011', 'winterwheat2011', 'corn2011', 'tomatoes2011', 'fallow2011']}, 'crop_inventory_2014': {'PC1': ['springwheat2014', 'grapes2014', 'winterwheat2014', 'tomatoes2014', 'potatoes2014'], 'PC2': ['soyabeans2014', 'corn2014', 'fallow2014', 'winterwheat2014', 'peas2014']}, 'crop_inventory_2015': {'PC1': ['tomatoes2015', 'grapes2015', 'springwheat2015', 'potatoes2015', 'winterwheat2015'], 'PC2': ['potatoes2015', 'winterwheat2015', 'grapes2015', 'soyabeans2015', 'canola2015'

Combining TI (soil, lucl, terrain) and TV (crop inventory) datasets 

In [4]:
# selecting top attributes from computed pca
def select_top_attr(all_top_attr, attribute) -> list:
    selected_attr = []
    top_attr = all_top_attr[attribute]
    attrs_list = list(top_attr.values())

    for i in range(4):
        selected_attr.append(attrs_list[0][i])
    for i in range(2):
        selected_attr.append(attrs_list[1][i])
    return list(set(selected_attr))

In [5]:
# Generate new datasets with selected attributes
def generate_struct_df() -> pd.DataFrame:
    attr_collection = []
    struct_pca = structural_pca()
    for key in struct_pca.keys():
        attributes = select_top_attr(struct_pca, key)
        path = os.path.join(directory, "data", "raw_datasets", "structural_attributes", "140_TI_variables", f"leb_{key}.csv")
        stations_list = pd.read_csv(os.path.join(directory,  "data","raw_datasets", "structural_attributes", "stations_list.csv")).stations.tolist()
        # input_struct_data = input_struct_data[input_struct_data["station_id"].isin(stations_list)]

        att_file = pd.read_csv(path)
        _133_stations_df = att_file[att_file["station_id"].isin(stations_list)]
        _133_stations_df = _133_stations_df[attributes]
        attr_collection.append(_133_stations_df)

    return pd.concat(attr_collection, axis=1).reset_index(drop=True)

In [6]:
# Generate crop + struct attributes combined dataset.
def generate_struct_crop_df(year:int) -> pd.DataFrame:
    crops_pca = crop_pca()

    for key in crops_pca.keys():
        if year == int(key.split("_")[-1]):
            attributes = select_top_attr(crops_pca, key)
            path = os.path.join(directory, "data", "raw_datasets", "structural_attributes", "crop_inventories", f"{key}.csv")
            attr_file = pd.read_csv(path)
            attr_file = attr_file[attributes]
            output_df = pd.concat([generate_struct_df(), attr_file], axis=1)
        else:
            pass

    return output_df

generate_struct_crop_df(2011)

Unnamed: 0,std_topsoil_silt,std_topsoil_clay,median_topsoil_sand,std_topsoil_sand,median_topsoil_silt,%forest,%water,%wetland,%cropland,%grassland,...,std_elevation,mean_elevation,range_elevation,max_elevation,soyabeans2011,oats2011,tomatoes2011,winterwheat2011,grapes2011,springwheat2011
0,8.170080,8.705036,26,15.565180,50,15.832885,1.392658,0.664702,68.998906,0.437804,...,62.273206,408.804259,280,539,9.805613,0.000000,0.0,0.000000,0.000000,0.000000
1,5.104138,6.927044,17,12.031182,50,10.223111,0.057743,0.260911,84.426076,0.280158,...,18.732971,456.647797,105,508,18.084754,0.000000,0.0,0.000000,0.000000,0.000000
2,5.926256,9.597431,17,13.566371,50,11.655630,0.549254,0.249661,80.612989,0.220120,...,36.743025,354.024238,194,441,12.255434,0.000000,0.0,0.000000,0.000000,0.000000
3,7.847713,8.844053,26,13.724722,50,15.943270,2.085602,1.717514,74.802225,0.799634,...,14.320739,490.430777,102,539,7.356624,0.000000,0.0,0.000000,0.000000,0.000000
4,2.991387,4.134774,50,7.125058,36,26.616951,0.867342,0.477796,61.204512,0.340075,...,36.667031,385.497491,211,506,11.329457,0.000000,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,2.715522,5.111560,31,3.409538,56,62.425168,0.341540,4.021247,20.926085,0.744922,...,93.604877,391.824850,415,598,0.230181,0.033966,0.0,0.158610,0.064596,0.000303
129,3.155841,6.100140,31,4.228345,51,49.756332,0.862061,4.273898,38.923298,0.727606,...,70.777109,465.915367,338,630,0.885742,0.077918,0.0,0.358078,0.000000,0.000000
130,3.521046,4.893305,31,4.476393,51,39.755651,0.989909,8.006014,43.565460,0.727843,...,95.489068,404.317641,365,630,1.098605,0.120242,0.0,0.567742,0.000000,0.000000
131,7.280444,3.999675,23,7.571920,51,28.492574,0.699812,17.236124,45.486890,0.528546,...,106.358210,314.382768,455,630,2.435316,0.137531,0.0,1.364272,0.000673,0.000000


feature engineer (functional attributes - riverflow metrics)

In [7]:
# metrics to drop.
drop_metrics = [
    "Station Name",
    "Country",
    "Watershed-Area",
    "Latitude",
    "Longitude",
    "YR-MaxFlow",
    "GM-MaxFlow",
    "NGM-MaxFlow",
    "YR-MinFlow",
    "GM-MinFlow",
    "NGM-MinFlow",
    "YR-MedianFlow",
    "GM-MedianFlow",
    "NGM-MedianFlow",
    "YR-Q95Flow",
    "GM-Q95Flow",
    "NGM-Q95Flow",
    "YR-Q5Flow",
    "GM-Q5Flow",
    "NGM-Q5Flow",
]

def functional_pca(drop_metrics: list = drop_metrics):
    functional_path = os.path.join(directory, "data", "raw_datasets", "functional_attributes", "133_riverflow")
    functional_files = glob.glob(f"{functional_path}/*.csv")

    all_top_load_dfs, all_top_metrics = [], {}

    for file in functional_files:
        df = pd.read_csv(file)
        df = df.set_index(df.columns[0])

        # drop metrics and checking for missing values.
        functional_df = df.loc[:, ~df.columns.str.startswith(tuple(drop_metrics))]
        functional_df = functional_df.fillna(functional_df.median())

        # pca analysis start here.
        # pca_df = pca.pca_analysis(functional_df)
        pca_loadings = pca.loadings(functional_df)

        explained_var = pca.explained_variance(functional_df)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_metrics = pca.top_attributes(pca_loadings, 5)
        all_top_metrics[file.split("/")[-1].split(".")[0]] = top_metrics
        new_keys = {'PC1': f'{file.split("/")[-1].split(".")[0]}_PC1_{pc1_val}', 'PC2': f'{file.split("/")[-1].split(".")[0]}_PC2_{pc2_val}'}
        renamed_top_metrics = {new_keys[key]: value for key, value in top_metrics.items()}
        yr_top_load_df = pd.DataFrame(renamed_top_metrics)
        all_top_load_dfs.append(yr_top_load_df)

    # out_put_df = pd.concat(all_top_load_dfs, axis = 1)
    # out_put_df =out_put_df.reindex(sorted(out_put_df.columns), axis=1)

    return all_top_metrics

functional_pca()


{'2020': {'PC1': ['Specific-NGM-MedianFlow-2020',
   'Specific-YR-MedianFlow-2020',
   'Specific-GM-Q95-2020',
   'Specific-GM-MedianFlow-2020',
   'Specific-YR-Q95-2020'],
  'PC2': ['NGM-RBI-2020',
   'YR-RBI-2020',
   'GM-RBI-2020',
   'YR-CVQ-2020',
   'GM-CVQ-2020']},
 '2019': {'PC1': ['Specific-YR-MedianFlow-2019',
   'Specific-GM-MedianFlow-2019',
   'Specific-GM-Q95-2019',
   'Specific-NGM-MedianFlow-2019',
   'Specific-GM-MaxFlow-2019'],
  'PC2': ['YR-CVQ-2019',
   'NGM-CVQ-2019',
   'YR-RBI-2019',
   'NGM-RBI-2019',
   'GM-CVQ-2019']},
 '2018': {'PC1': ['Specific-YR-MedianFlow-2018',
   'Specific-NGM-MedianFlow-2018',
   'Specific-GM-MedianFlow-2018',
   'Specific-NGM-Q5-2018',
   'Specific-NGM-Q95-2018'],
  'PC2': ['YR-RBI-2018',
   'NGM-RBI-2018',
   'GM-RBI-2018',
   'GM-CVQ-2018',
   'YR-CVQ-2018']},
 '2015': {'PC1': ['Specific-NGM-MedianFlow-2015',
   'Specific-YR-MedianFlow-2015',
   'Specific-GM-Q95-2015',
   'Specific-GM-MedianFlow-2015',
   'Specific-YR-Q95-2015'],
  

In [8]:
# pca for climate indices
def climate_pca():
    climate_path = os.path.join(directory, "data", "raw_datasets", "climate_indices")
    climate_files = glob.glob(f"{climate_path}/*.csv")

    all_top_load_dfs, all_top_metrics = [], {}

    for file in climate_files:
        df = pd.read_csv(file)
        df.columns.values[0] = "station_id"
        df = df.set_index(df.columns[0])

        # drop metrics and checking for missing values.
        climate_df = df.fillna(df.median())

    #     # pca analysis start here.
    #     # pca_df = pca.pca_analysis(climate_df)
        pca_loadings = pca.loadings(climate_df)

        explained_var = pca.explained_variance(climate_df)
        pc1_val = round(explained_var["Explained Variance"].iloc[1],2)
        pc2_val = round(explained_var["Explained Variance"].iloc[2],2)

        top_metrics = pca.top_attributes(pca_loadings, 5)
        all_top_metrics[file.split("/")[-1].split(".")[0]] = top_metrics
        new_keys = {'PC1': f'{file.split("/")[-1].split(".")[0]}_PC1_{pc1_val}', 'PC2': f'{file.split("/")[-1].split(".")[0]}_PC2_{pc2_val}'}
        renamed_top_metrics = {new_keys[key]: value for key, value in top_metrics.items()}
        yr_top_load_df = pd.DataFrame(renamed_top_metrics)
        all_top_load_dfs.append(yr_top_load_df)

    # out_put_df = pd.concat(all_top_load_dfs, axis = 1)
    # out_put_df =out_put_df.reindex(sorted(out_put_df.columns), axis=1)

    return all_top_metrics

climate_pca()

{'climate_indices_2018': {'PC1': ['AnnualTotalSnowFall_2018',
   'AnnualNumberRainOnSWEdays_2018',
   'AnnualMaxSWE_2018',
   'AnnualSumOfRainOnSWE_2018',
   'AnnualMinTmin_2018'],
  'PC2': ['AnnualTotalPrcp_2018',
   'AnnualMaxTmax_2018',
   'AnnualMinTmin_2018',
   'AnnualSumOfRainOnSWE_2018',
   'AnnualNumberRainOnSWEdays_2018']},
 'climate_indices_2019': {'PC1': ['AnnualSumOfRainOnSWE_2019',
   'AnnualNumberRainOnSWEdays_2019',
   'AnnualTotalSnowFall_2019',
   'AnnualMaxSWE_2019',
   'AnnualMaxTmax_2019'],
  'PC2': ['AnnualMinTmin_2019',
   'AnnualTotalPrcp_2019',
   'AnnualMaxTmax_2019',
   'AnnualMaxSWE_2019',
   'AnnualNumberRainOnSWEdays_2019']},
 'climate_indices_2020': {'PC1': ['AnnualTotalSnowFall_2020',
   'AnnualMaxSWE_2020',
   'AnnualNumberRainOnSWEdays_2020',
   'AnnualSumOfRainOnSWE_2020',
   'AnnualMinTmin_2020'],
  'PC2': ['AnnualTotalPrcp_2020',
   'AnnualMaxTmax_2020',
   'AnnualMinTmin_2020',
   'AnnualMaxSWE_2020',
   'AnnualSumOfRainOnSWE_2020']},
 'climate_ind

In [9]:
# generate new dataset for AP classification (riverflows metrics - year and seasonal).
def selected_func_metrics() -> dict:
    stations_list = pd.read_csv(
        os.path.join(
            directory, "data", "raw_datasets", "structural_attributes", "stations_list.csv"
        )
    ).stations.tolist()

    func_metrics = {}

    for i in range(2011, 2021):
        attributes = select_top_attr(functional_pca(), str(i))

        functional_path = os.path.join(
            directory, "data", "raw_datasets", "functional_attributes", "133_riverflow", f'{i}.csv'
        )
        functional_df = pd.read_csv(functional_path)
        functional_df = functional_df[attributes]
        functional_df['station_id'] = stations_list
        functional_df = functional_df[["station_id"] + [col for col in functional_df.columns if col != "station_id"]]
        func_metrics[str(i)] = functional_df
        # dir = os.path.join(output_dir, "func")
        # functional_df.to_csv(dir +  f"/{i}_func_metrics.csv")

    return func_metrics

selected_func_metrics()

{'2011':     station_id  Specific-NGM-MedianFlow-2011  Specific-NGM-MinFlow-2011  \
 0      02GA003                      5.828355                   1.909926   
 1      02GA005                      0.139449                   0.023755   
 2      02GA010                      6.077592                   1.352264   
 3      02GA014                      2.388208                   0.235868   
 4      02GA015                      0.906573                   0.269391   
 ..         ...                           ...                        ...   
 128    4215500                      0.291444                   0.025949   
 129    4216418                      0.609942                   0.152486   
 130    4217000                      2.914681                   0.357351   
 131    4218000                     56.850563                   5.873827   
 132    4218518                      0.478461                   0.108237   
 
      NGM-CVQ-2011  Specific-YR-MedianFlow-2011  YR-CVQ-2011  \
 0          0.

In [10]:
# generate new dataset for AP classification (riverflow metrics + structural attributes)
def selected_func_struct_attrs() -> dict:
    func_struct_attrs = {}
    for i in range(2011, 2021):
        functional_df = selected_func_metrics()[str(i)]
        output_df = pd.concat([generate_struct_crop_df(i), functional_df], axis=1)
        output_df = output_df[
            ["station_id"] + [col for col in output_df.columns if col != "station_id"]
        ]
        func_struct_attrs[str(i)] = output_df

    return func_struct_attrs

selected_func_struct_attrs()

{'2011':     station_id  std_topsoil_silt  std_topsoil_clay  median_topsoil_sand  \
 0      02GA003          8.170080          8.705036                   26   
 1      02GA005          5.104138          6.927044                   17   
 2      02GA010          5.926256          9.597431                   17   
 3      02GA014          7.847713          8.844053                   26   
 4      02GA015          2.991387          4.134774                   50   
 ..         ...               ...               ...                  ...   
 128    4215500          2.715522          5.111560                   31   
 129    4216418          3.155841          6.100140                   31   
 130    4217000          3.521046          4.893305                   31   
 131    4218000          7.280444          3.999675                   23   
 132    4218518          2.129150          1.064575                   23   
 
      std_topsoil_sand  median_topsoil_silt    %forest    %water   %wetland  \

In [None]:
# generate new dataset for AP classification (riverflow metrics + structural attributes + climate indices)
def selected_func_struct_climate_attrs() -> pd.DataFrame:
    for i in tqdm(range(2011, 2021)):
        df = pd.read_csv(
            os.path.join(
                directory,
                "data",
                "raw_datasets",
                "climate_indices",
                f"climate_indices_{str(i)}.csv",
            )
        )

        stations_list = pd.read_csv(
        os.path.join(
                directory, "data", "raw_datasets", "structural_attributes", "stations_list.csv"
            )
        ).stations.tolist()

        df.columns.values[0] = "station_id"
        df['station_id'] = df['station_id'].apply(lambda val: val.lstrip("0") if val.startswith("04") else val)

        climate_df = df[df["station_id"].isin(stations_list)]

        attributes = select_top_attr(climate_pca(), f"climate_indices_{str(i)}")
        climate_df = climate_df[attributes]

        print(climate_df.duplicated().sum())

        output_df = pd.concat(
            [selected_func_struct_attrs()[str(i)], climate_df], axis=1)

        output_df = output_df[
            ["station_id"] + [col for col in output_df.columns if col != "station_id"]
        ]

        output_df.to_csv(os.path.join(output_dir, "all_attributes", f"{i}_func_struct_climate_attrs.csv"), index=False)

    return output_df

print(selected_func_struct_climate_attrs())