In [1]:
import pandas as pd
import sys
import os   
import yaml


In [2]:
'python 3.12.7'

'python 3.12.7'

In [3]:
# Make the parent of the current working directory importable so that the
# project package `src` resolves from inside this notebook.
project_root = os.path.abspath(os.path.join('..'))
if project_root not in sys.path:  # re-running this cell must not add duplicates
    sys.path.append(project_root)

from src.helpers.config import ConfigHelper


# Read the path of the sample-points CSV from the project configuration.
config = ConfigHelper.instance('config')
sample_path = config.get('sample_path')

In [4]:
# Show the sample-points CSV path that was read from the configuration.
print(sample_path)

..\data\samplePoints.csv


In [5]:
      
def load_agb_csv(csv_path):
    """
    Load the agb CSV file with multiple fallback methods.

    The explicit separators (semicolon, comma, tab) are tried in that order.
    A parse is accepted only when it yields more than one column, because
    reading with the wrong separator does not raise: it returns a frame with
    a single column holding each whole line. If no explicit separator yields
    several columns, the first successful single-column parse is returned.
    Separator auto-detection and the manual-inspection fallback run only when
    every explicit attempt raised.

    Args:
        csv_path (str): Path to the CSV file

    Returns:
        pd.DataFrame: Loaded CSV data or empty DataFrame if failed
    """
    read_options = {'encoding': 'utf-8-sig', 'on_bad_lines': 'skip'}
    explicit_separators = (
        (1, 'semicolon', ';'),
        (2, 'comma', ','),
        (3, 'tab', '\t'),
    )
    single_column_df = None  # first parse that worked but did not split the lines

    # Methods 1-3: explicit separators, validated by the resulting column count
    for number, label, separator in explicit_separators:
        try:
            csv_df = pd.read_csv(csv_path, sep=separator, **read_options)
        except Exception as e:
            print(f"Method {number} ({label}) failed: {e}")
            continue

        if csv_df.shape[1] > 1:
            print(f"✓ CSV loaded successfully with {label} separator")
            return csv_df

        if single_column_df is None:
            single_column_df = csv_df
        print(f"Method {number} ({label}) produced a single column; trying next separator")

    # The file is readable but none of the tested separators splits it:
    # hand back the first successful parse so single-column files still load.
    if single_column_df is not None:
        print("✓ CSV loaded as a single column (no tested separator split the data)")
        return single_column_df

    # Method 4: Auto-detect separator (only reached when every attempt above raised)
    try:
        csv_df = pd.read_csv(csv_path, sep=None, engine='python', **read_options)
        print("✓ CSV loaded successfully with auto-detected separator")
        return csv_df
    except Exception as e:
        print(f"Method 4 (auto-detect) failed: {e}")

    # Method 5: Manual inspection and flexible loading
    try:
        print("Attempting manual inspection...")

        # Show the first five lines; zip() stops before reading any further line
        with open(csv_path, 'r', encoding='utf-8-sig') as f:
            for i, line in zip(range(1, 6), f):
                print(f"Line {i}: {repr(line[:100])}...")  # Show first 100 chars

        # Try reading with more flexible options
        csv_df = pd.read_csv(
            csv_path,
            sep=';',
            quotechar='"',
            skipinitialspace=True,
            dtype=str,  # Read everything as string first
            **read_options,
        )
        print("✓ CSV loaded with flexible options")
        return csv_df

    except Exception as e:
        print(f"All methods failed: {e}")
        print("Manual intervention required - check the CSV file structure")
        return pd.DataFrame()


In [7]:
# Load the sample-points table with the fallback loader, then preview its first rows.
df= load_agb_csv(sample_path)

df.head()

✓ CSV loaded successfully with semicolon separator


Unnamed: 0,Code,Sample Point,Type,Territory,Municipality,XETRS89,YETRS89,ZETRS89,Water range,Water range type,Basin,Section,Subgroup
0,L-A10,Litoral de Ondarroa,LITORALES,Bizkaia,Ondarroa,548439,4798291,0.0,Matxitxako-Getaria,Aguas costeras atlánticas del cantábrico orien...,Deba,Deba costa,Aguas (Fisicoquímica)#Macroinvertebrados#Fitop...
1,L-BI10,Litoral de Hondarribia,LITORALES,Gipuzkoa,Hondarribia,597007,4805570,0.0,Getaria-Higer,Aguas costeras atlánticas del cantábrico orien...,Bidasoa intracomunitario,Bidasoa costa,Aguas (Fisicoquímica)#Macroinvertebrados#Fitop...
2,L-B10,Litoral de Gorliz (Cabo Villano),LITORALES,Bizkaia,Gorliz,503617,4809354,0.0,Cantabria-Matxitxako,Aguas costeras atlánticas del cantábrico orien...,Butroe,Butroe costa,Aguas (Fisicoquímica)#Macroinvertebrados#Fitop...
3,L-B20,Litoral de Bakio,LITORALES,Bizkaia,Bakio,515916,4810520,0.0,Cantabria-Matxitxako,Aguas costeras atlánticas del cantábrico orien...,Butroe,Butroe costa,Aguas (Fisicoquímica)#Macroinvertebrados#Fitop...
4,L-D10,Litoral de Deba,LITORALES,Gipuzkoa,Deba,552500,4797285,0.0,Matxitxako-Getaria,Aguas costeras atlánticas del cantábrico orien...,Deba,Deba costa,Aguas (Fisicoquímica)#Macroinvertebrados#Fitop...


In [9]:
# Distinct values of the 'Code' column of the sample-points table.
codes=df['Code'].unique()


# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,XETRS89,YETRS89,ZETRS89
count,52.0,52.0,42.0
mean,548067.884615,4800708.0,0.0
std,34754.864216,4988.397,0.0
min,488122.0,4793827.0,0.0
25%,513969.25,4796817.0,0.0
50%,552816.0,4798918.0,0.0
75%,580712.5,4805160.0,0.0
max,597938.0,4810520.0,0.0


In [18]:
import os

# Folder holding one measurement CSV per sample-point code
csv_folder = os.path.join('..', 'data', 'measures_csv_fq')

# One DataFrame per code starting with "l-" whose measurement file exists
all_code_dfs = []

# dropna(): a missing Code is a float NaN, which has no .lower()
for code in df['Code'].dropna().unique():
    if not code.lower().startswith('l-'):
        continue
    csv_filename = f"{code.lower()}_measure_fq.csv"
    csv_path = os.path.join(csv_folder, csv_filename)
    if not os.path.exists(csv_path):
        print(f"Archivo {csv_filename} no encontrado.")
        continue
    code_df = pd.read_csv(csv_path, encoding='latin1', sep=';', on_bad_lines='skip')
    code_df['code'] = code  # tag every row with the code it was loaded for
    all_code_dfs.append(code_df)

# Stack all per-code frames into a single table and summarise it
if all_code_dfs:
    combined_df = pd.concat(all_code_dfs, ignore_index=True)
    # info() prints its own report and returns None, so it is not wrapped in print()
    combined_df.info()
    print(combined_df.describe())
    print(combined_df['code'].value_counts())
else:
    print("No se cargó ningún archivo CSV de códigos l-.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sample Point Code       354 non-null    object 
 1   Date                    354 non-null    object 
 2   Hour                    354 non-null    object 
 3   Type                    354 non-null    object 
 4   Subgroup                354 non-null    object 
 5   Parameter               354 non-null    object 
 6   Species                 354 non-null    object 
 7   Operator                354 non-null    object 
 8   Value                   354 non-null    float64
 9   Unit                    354 non-null    object 
 10  Additional information  0 non-null      float64
 11  Situation               354 non-null    object 
 12  Level                   354 non-null    object 
 13  Depth                   354 non-null    int64  
 14  code                    354 non-null    ob

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the number of rows per code; requires combined_df from a loading cell.
if 'combined_df' not in locals():
    print('No hay datos combinados para graficar.')
else:
    plt.figure(figsize=(12,6))
    # Bars are ordered from the most to the least frequent code
    codes_by_frequency = combined_df['code'].value_counts().index
    sns.countplot(data=combined_df, x='code', order=codes_by_frequency)
    plt.title('Cantidad de filas por código')
    plt.xlabel('Código')
    plt.ylabel('Cantidad de filas')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [19]:
# There are no m-... .csv files

In [20]:
import pandas as pd

# One DataFrame per sample-point code whose measurement file exists.
# `os` and `csv_folder` come from an earlier cell.
all_code_dfs = []

# dropna(): a missing Code is a float NaN, which has no .lower()
for code in df['Code'].dropna().unique():
    csv_filename = f"{code.lower()}_measure_fq.csv"
    csv_path = os.path.join(csv_folder, csv_filename)
    if not os.path.exists(csv_path):
        continue  # codes without a measurement file are skipped silently
    code_df = pd.read_csv(csv_path, encoding='latin1', sep=';', on_bad_lines='skip')
    code_df['code'] = code  # tag every row with the code it was loaded for
    all_code_dfs.append(code_df)

# Stack all per-code frames into a single table and summarise it
if all_code_dfs:
    combined_df = pd.concat(all_code_dfs, ignore_index=True)
    # info() prints its own report and returns None, so it is not wrapped in print()
    combined_df.info()
    print(combined_df.describe())
    # Example: count rows per code
    print(combined_df['code'].value_counts())
else:
    print("No se cargó ningún archivo CSV de códigos.")
    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sample Point Code       354 non-null    object 
 1   Date                    354 non-null    object 
 2   Hour                    354 non-null    object 
 3   Type                    354 non-null    object 
 4   Subgroup                354 non-null    object 
 5   Parameter               354 non-null    object 
 6   Species                 354 non-null    object 
 7   Operator                354 non-null    object 
 8   Value                   354 non-null    float64
 9   Unit                    354 non-null    object 
 10  Additional information  0 non-null      float64
 11  Situation               354 non-null    object 
 12  Level                   354 non-null    object 
 13  Depth                   354 non-null    int64  
 14  code                    354 non-null    ob

In [21]:
# Preview the first rows of the combined measurements table.
combined_df.head()

Unnamed: 0,Sample Point Code,Date,Hour,Type,Subgroup,Parameter,Species,Operator,Value,Unit,Additional information,Situation,Level,Depth,code
0,L-A10,10/03/2025,10:24,LITORALES,Sedimentos,Potencial Redox,-,=,348.0,mV,,-,-,0,L-A10
1,L-A10,10/03/2025,10:24,LITORALES,Sedimentos,% Humedad,-,=,23.62,%,,-,-,0,L-A10
2,L-A10,10/03/2025,10:24,LITORALES,Sedimentos,Níquel,-,=,11.0,mg/kg PS,,-,-,0,L-A10
3,L-A10,10/03/2025,10:24,LITORALES,Sedimentos,Zinc,-,=,63.0,mg/kg PS,,-,-,0,L-A10
4,L-A10,10/03/2025,10:24,LITORALES,Sedimentos,Cromo,-,=,14.0,mg/kg PS,,-,-,0,L-A10
