In [1]:
import pandas as pd
import numpy as np

In [130]:

# Build the full (all columns kept) positive-sample table from four CSVs.
# No key column is used: the frames are pasted together column-wise by row
# position, so the result is only valid when every file has the same number
# of rows. Row ORDER cannot be checked here and is assumed to match.

# --- Step 1: Base dataframe ---
df_pos = pd.read_csv("df_pos_calculatedarea.csv")

# --- Step 2: Expansion rates ---
# Only the per-year expansion column is kept from each file, renamed so the
# 5-year and 10-year values can sit side by side.
exp5 = pd.read_csv("pos_expansion5y.csv")[["expansion_ha_peryr"]] \
          .rename(columns={"expansion_ha_peryr": "5y_expansion_rate"})
exp10 = pd.read_csv("pos_expansion10y.csv")[["expansion_ha_peryr"]] \
           .rename(columns={"expansion_ha_peryr": "10y_expansion_rate"})

# --- Step 3: Glacier features ---
glac = pd.read_csv("glac_pos_list_correct_slope.csv")

glac = glac[[
    "glacier_area_ha",
    "slope_glac_to_lake",
    "glacier_contact",
    "glacier_touch_count",
    "nearest_glacier_dist_m",
    "glacier_elev_m"
]]

# --- Step 4: Guard the positional join ---
# pd.concat(axis=1) pads shorter frames with NaN rows instead of failing, which
# silently produces rows that mix real and missing data. Fail loudly instead.
row_counts = {
    "df_pos_calculatedarea.csv": len(df_pos),
    "pos_expansion5y.csv": len(exp5),
    "pos_expansion10y.csv": len(exp10),
    "glac_pos_list_correct_slope.csv": len(glac),
}
if len(set(row_counts.values())) != 1:
    raise ValueError(f"Row counts differ between sources, cannot join by position: {row_counts}")

# Concatenate side-by-side with base
df_pos = pd.concat([df_pos.reset_index(drop=True),
                    exp5.reset_index(drop=True),
                    exp10.reset_index(drop=True)], axis=1)

# Concatenate glacier features
uncleaned_ml_pos = pd.concat([df_pos.reset_index(drop=True),
                              glac.reset_index(drop=True)], axis=1)

# --- Final check ---
print("Final shape:", uncleaned_ml_pos.shape)
print(uncleaned_ml_pos.head())




Final shape: (241, 20)
   Unnamed: 0  Elevation_m  GLOF     Lake_type Lake_type_simplified  Latitude  \
0           0         3466     1  water pocket                other    42.522   
1           1         3269     1           ice                  ice    42.207   
2           2         4779     1           ice                  ice    35.677   
3           3         4594     1  water pocket                other    29.307   
4           4         3301     1  water pocket                other    36.039   

   Longitude  Year_final  id  is_supraglacial  Lake_area_calculated_ha  \
0     74.609        1984   0                0                      NaN   
1     79.846        1984   1                0                      NaN   
2     77.193        1984   2                0                      NaN   
3     83.967        1985   3                0                      NaN   
4     74.633        1985   4                0                      NaN   

   Lake_area_ha  5y_expansion_rate  10y_expan

In [106]:

# Persist the negative frame to disk; the row index is not written.
uncleaned_ml_neg.to_csv(path_or_buf="uncleaned_ml_neg.csv", index=False)

In [123]:
# Assemble the trimmed positive-sample table from four CSVs.
# No key column is used: the frames are pasted together column-wise by row
# position, so the result is only valid when every file has the same number
# of rows. Row ORDER cannot be checked here and is assumed to match.

# --- Step 1: Load the base dataframe ---
df_pos = pd.read_csv("df_pos_calculatedarea.csv")

# Drop unwanted columns ("Unnamed: 0" is the index column saved into the CSV)
cols_to_drop = ["Lake_type", "id", "Lake_area_ha", "Unnamed: 0"]
df_pos = df_pos.drop(columns=cols_to_drop)

# --- Step 2: Load expansion rates ---
exp5 = pd.read_csv("pos_expansion5y.csv")
exp10 = pd.read_csv("pos_expansion10y.csv")

# Keep only 'expansion_ha_peryr' from each and rename
exp5 = exp5[["expansion_ha_peryr"]].rename(columns={"expansion_ha_peryr": "5y_expansion_rate"})
exp10 = exp10[["expansion_ha_peryr"]].rename(columns={"expansion_ha_peryr": "10y_expansion_rate"})

# --- Step 3: Load glacier features ---
glac = pd.read_csv("glac_pos_list_correct_slope.csv")

# Keep only the required glacier columns
needed_glacier_cols = [
    "glacier_area_ha",
    "slope_glac_to_lake",
    "glacier_contact",
    "glacier_touch_count",
    "nearest_glacier_dist_m",
    "glacier_elev_m"
]
glac = glac[needed_glacier_cols]

# --- Step 4: Guard the positional join, then concatenate ---
# pd.concat(axis=1) pads shorter frames with NaN rows instead of failing, which
# silently produces rows that mix real and missing data. Fail loudly instead.
source_row_counts = {
    "df_pos_calculatedarea.csv": len(df_pos),
    "pos_expansion5y.csv": len(exp5),
    "pos_expansion10y.csv": len(exp10),
    "glac_pos_list_correct_slope.csv": len(glac),
}
if len(set(source_row_counts.values())) != 1:
    raise ValueError(
        f"Row counts differ between sources, cannot join by position: {source_row_counts}"
    )

# Concatenate expansion rates horizontally
df_pos = pd.concat([df_pos, exp5, exp10], axis=1)

# Concatenate glacier features to main df
df_pos = pd.concat([df_pos, glac], axis=1)

# --- Step 5: Rename final dataframe ---
# Note: this is an alias, not a copy; both names refer to the same object.
uncleaned_ml_pos = df_pos

# Quick check
uncleaned_ml_pos.shape


(241, 16)

In [64]:
# --- Column order shared by the positive and negative frames ---
# Built from small groups so each family of columns is easy to spot;
# the label column ("GLOF") is deliberately last.
final_order = (
    # position and observation year
    ["Longitude", "Latitude", "Year_final"]
    # lake descriptors
    + ["Lake_area_calculated_ha", "Elevation_m", "Lake_type_simplified", "is_supraglacial"]
    # glacier descriptors
    + [
        "glacier_area_ha",
        "slope_glac_to_lake",
        "glacier_contact",
        "glacier_touch_count",
        "nearest_glacier_dist_m",
        "glacier_elev_m",
    ]
    # expansion rates
    + ["5y_expansion_rate", "10y_expansion_rate"]
    # label
    + ["GLOF"]
)



In [65]:
# Select the columns of both frames in the shared `final_order` sequence.
# A column missing from either frame raises KeyError.

# negative frame first
uncleaned_ml_neg = uncleaned_ml_neg.loc[:, final_order]

# then the positive frame
uncleaned_ml_pos = uncleaned_ml_pos.loc[:, final_order]

In [66]:
# Preview the first five rows of the negative frame.
uncleaned_ml_neg.head(n=5)

Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF
0,72.901,36.412,2019,19.373351,4256,moraine,0,735.52788,0.0,True,4,0.0,4706.0,-20.935474,0.925453,0
1,73.048,36.263,2019,18.463383,4618,other,0,12.913846,0.386765,False,0,726.539168,4899.0,-0.014481,-2.158693,0
2,73.407,36.642,2019,14.050228,2748,moraine,0,4204.096644,0.0,True,3,0.0,4413.0,0.417908,-4.397514,0
3,73.384,36.131,2019,8.558558,4503,other,0,12.377344,0.291547,False,0,607.10722,4681.0,-0.116048,0.11587,0
4,73.423,36.116,2019,11.896605,4437,other,0,31.006463,0.456346,False,0,703.413916,4764.0,-0.551315,-1.320284,0


In [67]:
# Preview the first five rows of the positive frame.
uncleaned_ml_pos.head(n=5)

Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF
0,74.609,42.522,1984.0,,3466.0,other,0.0,536.125805,0.0,True,2,0.0,3985.0,,,1.0
1,79.846,42.207,1984.0,,3269.0,ice,0.0,396.118156,2.959854,False,0,446.305779,4590.0,,,1.0
2,77.193,35.677,1984.0,,4779.0,ice,0.0,9444.515664,0.71855,False,0,1036.810675,5524.0,,,1.0
3,83.967,29.307,1985.0,,4594.0,other,0.0,,,False,0,,,,,1.0
4,74.633,36.039,1985.0,,3301.0,other,0.0,6998.712273,0.0,True,3,0.0,4575.0,,,1.0


In [76]:
# --- Step 1: Remove repeated column labels from the positive frame ---
# Every label that appears more than once is reduced to its first occurrence
# (not just Lake_area_calculated_ha). The negative frame is left untouched.
uncleaned_ml_pos = uncleaned_ml_pos.loc[:, ~uncleaned_ml_pos.columns.duplicated(keep="first")]

# --- Step 2: Enforce one dtype schema on both frames ---
# Nullable "Int64" is used for integer-like columns so missing values survive the cast.
dtype_map = {
    # numeric (floats)
    **dict.fromkeys(
        [
            "Longitude",
            "Latitude",
            "Lake_area_calculated_ha",
            "glacier_area_ha",
            "slope_glac_to_lake",
            "nearest_glacier_dist_m",
            "glacier_elev_m",
            "5y_expansion_rate",
            "10y_expansion_rate",
        ],
        "float",
    ),
    # integers (nullable), followed by the 0/1 flags
    **dict.fromkeys(
        [
            "Year_final",
            "Elevation_m",
            "glacier_touch_count",
            "is_supraglacial",
            "glacier_contact",
        ],
        "Int64",
    ),
    # categorical
    "Lake_type_simplified": "category",
    # label
    "GLOF": "Int64",
}

# Cast the positive frame first, then the negative one.
uncleaned_ml_pos = uncleaned_ml_pos.astype(dtype_map)
uncleaned_ml_neg = uncleaned_ml_neg.astype(dtype_map)




In [81]:
# Show the first ten rows of the negative frame.
uncleaned_ml_neg.head(10)

Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF
0,72.901,36.412,2019,19.373351,4256,moraine,0,735.52788,0.0,1,4,0.0,4706.0,-20.935474,0.925453,0
1,73.048,36.263,2019,18.463383,4618,other,0,12.913846,0.386765,0,0,726.539168,4899.0,-0.014481,-2.158693,0
2,73.407,36.642,2019,14.050228,2748,moraine,0,4204.096644,0.0,1,3,0.0,4413.0,0.417908,-4.397514,0
3,73.384,36.131,2019,8.558558,4503,other,0,12.377344,0.291547,0,0,607.10722,4681.0,-0.116048,0.11587,0
4,73.423,36.116,2019,11.896605,4437,other,0,31.006463,0.456346,0,0,703.413916,4764.0,-0.551315,-1.320284,0
5,73.449,36.112,2019,16.831355,4427,other,0,10.582141,0.355723,0,0,567.858127,4693.0,-1e-06,-0.333334,0
6,73.465,36.108,2019,16.831355,4488,other,0,10.582141,0.355723,0,0,567.858127,4693.0,-1e-06,0.769022,0
7,73.461,36.086,2019,16.831355,4577,other,0,10.582141,0.355723,0,0,567.858127,4693.0,-1e-06,0.769022,0
8,73.328,36.024,2019,8.715218,4559,other,0,11.846031,0.211689,0,0,760.549757,4714.0,0.058102,0.232406,0
9,73.312,36.0,2019,9.153732,4615,other,0,2.851505,0.196036,0,0,505.008072,4713.0,-0.377781,0.043584,0


In [82]:
# Show the first ten rows of the positive frame.
uncleaned_ml_pos.head(10)

Unnamed: 0,Longitude,Latitude,Year_final,Lake_area_calculated_ha,Elevation_m,Lake_type_simplified,is_supraglacial,glacier_area_ha,slope_glac_to_lake,glacier_contact,glacier_touch_count,nearest_glacier_dist_m,glacier_elev_m,5y_expansion_rate,10y_expansion_rate,GLOF
0,74.609,42.522,1984,,3466,other,0,536.125805,0.0,1,2,0.0,3985.0,,,1
1,79.846,42.207,1984,,3269,ice,0,396.118156,2.959854,0,0,446.305779,4590.0,,,1
2,77.193,35.677,1984,,4779,ice,0,9444.515664,0.71855,0,0,1036.810675,5524.0,,,1
3,83.967,29.307,1985,,4594,other,0,,,0,0,,,,,1
4,74.633,36.039,1985,,3301,other,0,6998.712273,0.0,1,3,0.0,4575.0,,,1
5,76.867,41.975,1985,,3627,moraine,0,38.457305,0.4188,0,0,768.862677,3949.0,,,1
6,86.586,27.874,1985,,4368,moraine,0,652.480729,0.0,1,2,0.0,4977.0,,,1
7,79.846,42.207,1985,,3269,ice,0,396.118156,2.959854,0,0,446.305779,4590.0,,,1
8,74.879,36.414,1986,,2501,other,0,2339.676629,9.981328,0,0,198.570765,4483.0,,,1
9,88.027,27.586,1986,,4437,other,0,29403.154647,0.0,1,4,0.0,5350.0,,,1


In [75]:
# Summarize the positive frame: index range, per-column non-null counts and dtypes.
uncleaned_ml_pos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   Longitude                241 non-null    float64 
 1   Latitude                 241 non-null    float64 
 2   Year_final               269 non-null    Int64   
 3   Lake_area_calculated_ha  181 non-null    float64 
 4   Elevation_m              269 non-null    Int64   
 5   Lake_type_simplified     241 non-null    category
 6   is_supraglacial          241 non-null    boolean 
 7   glacier_area_ha          267 non-null    float64 
 8   slope_glac_to_lake       267 non-null    float64 
 9   glacier_contact          269 non-null    boolean 
 10  glacier_touch_count      269 non-null    Int64   
 11  nearest_glacier_dist_m   267 non-null    float64 
 12  glacier_elev_m           267 non-null    float64 
 13  5y_expansion_rate        128 non-null    float64 
 14  10y_expans

In [87]:
# List the distinct values (NaN included) found in the simplified lake-type column.
uncleaned_ml_pos["Lake_type_simplified"].unique()

['other', 'ice', 'moraine', NaN]
Categories (3, object): ['ice', 'moraine', 'other']

In [88]:
# Save both frames without the row index.
# Writing the index adds an unnamed leading column that reappears as
# "Unnamed: 0" on the next read_csv; index=False also matches the other
# to_csv calls in this notebook that write the same files.
uncleaned_ml_pos.to_csv("uncleaned_ml_pos.csv", index=False)
uncleaned_ml_neg.to_csv("uncleaned_ml_neg.csv", index=False)

In [129]:
# Report how many rows each positive-sample source CSV holds, one line per file.
print("Rows per source:")
for source_label, csv_name in (
    ("df_pos_calculatedarea.csv   :", "df_pos_calculatedarea.csv"),
    ("pos_expansion5y.csv         :", "pos_expansion5y.csv"),
    ("pos_expansion10y.csv        :", "pos_expansion10y.csv"),
    ("glac_pos_list_correct_slope :", "glac_pos_list_correct_slope.csv"),
):
    # Each file is re-read from disk just to count its rows.
    print(source_label, len(pd.read_csv(csv_name)))


Rows per source:
df_pos_calculatedarea.csv   : 241
pos_expansion5y.csv         : 241
pos_expansion10y.csv        : 241
glac_pos_list_correct_slope : 241


Shape after cleaning: (269, 14)


In [103]:
import pandas as pd

glac = pd.read_csv("glac_pos_list_correct_slope.csv")

# When the file carries a saved index column, keep the first row for each of
# its values and then discard the column itself.
if "Unnamed: 0" in glac:
    glac = (
        glac
        .drop_duplicates(subset=["Unnamed: 0"], keep="first")
        .drop(columns=["Unnamed: 0"])
    )

# Write the result next to the original file (no index column) and report its shape.
glac.to_csv("glac_pos_list_correct_slope_CLEAN.csv", index=False)
print("Cleaned shape:", glac.shape)


Cleaned shape: (241, 14)


In [125]:
# Re-read the glacier feature file from disk (this rebinds `glac`) and show (rows, columns).
glac = pd.read_csv(filepath_or_buffer="glac_pos_list_correct_slope.csv")
glac.shape

(241, 14)