In [22]:
import pandas as pd
import numpy as np
import json

In [23]:
# Utilizando raw string para la ruta del archivo
archivo_csv = r"C:\Users\JSLV3\Documents\5to Semestre\ETL\Proyecto Water Quality\1ra parte\watersucia.csv"

# Cargar los datos desde el archivo CSV
water = pd.read_csv(archivo_csv, delimiter=';')

# Mostrar las primeras filas del DataFrame
print(water.head())



    Año NombreDepartamento  Div_dpto NombreMunicipio  Divi_muni IrcaMinimo  \
0  2010            Bolívar        13        El Guamo      13248          0   
1  2010            Bolívar        13        El Guamo      13248          0   
2  2010            Bolívar        13        El Guamo      13248          0   
3  2010            Bolívar        13        El Guamo      13248          0   
4  2010            Bolívar        13        El Guamo      13248          0   

  IrcaMaximo IrcaPromedio NombreParametroAnalisis2  MuestrasEvaluadas  \
0        100        37,32        Alcanilidad Total                 67   
1        100        37,32                 Aluminio                 67   
2        100        37,32                 Arsénico                 67   
3        100        37,32                   Cadmio                 67   
4        100        37,32                   Calcio                 67   

   MuestrasTratadas  MuestrasSinTratar  NumeroParametrosMinimo  \
0                67       

## Creating greater value from our data
### Transformations

##### We identified the analysis parameters that have the greatest influence on water pollution:

In [25]:
water['IrcaPromedio'] = water['IrcaPromedio'].str.replace(',', '.').astype(float)

In [26]:
parametros_influencia = water.groupby('NombreParametroAnalisis2')['IrcaPromedio'].mean().sort_values(ascending=False)
top_15_parametros = parametros_influencia.head(15)
top_15_parametros

NombreParametroAnalisis2
ph                               23.941989
Cromo total                      23.941989
Olor                             23.941989
Mesófilos                        23.941989
Mercurio                         23.941989
Manganeso                        23.941989
Magnesio                         23.941989
Hierro total                     23.941989
Organofosforados y carbamatos    23.941989
Alcanilidad Total                23.941989
Fosfatos                         23.941989
Fluoruros                        23.941989
Plomo                            23.941989
E.coli                           23.941989
Dureza total                     23.941989
Name: IrcaPromedio, dtype: float64

All these parameters have an average IRCA of approximately 23.94, suggesting an association with a considerable risk level. This can be useful for prioritizing which water quality parameters need more critical attention in monitoring and treatment programs.

In [27]:
water = water[water['NombreParametroAnalisis2'].isin(top_15_parametros.index)]

water.head(), water.shape

(     Año NombreDepartamento  Div_dpto NombreMunicipio  Divi_muni IrcaMinimo  \
 0   2010            Bolívar        13        El Guamo      13248          0   
 13  2010            Bolívar        13        El Guamo      13248          0   
 14  2010            Bolívar        13        El Guamo      13248          0   
 15  2010            Bolívar        13        El Guamo      13248          0   
 16  2010            Bolívar        13        El Guamo      13248          0   
 
    IrcaMaximo  IrcaPromedio NombreParametroAnalisis2  MuestrasEvaluadas  \
 0         100         37.32        Alcanilidad Total                 67   
 13        100         37.32              Cromo total                 67   
 14        100         37.32             Dureza total                 67   
 15        100         37.32                   E.coli                 67   
 16        100         37.32                Fluoruros                 67   
 
     MuestrasTratadas  MuestrasSinTratar  NumeroParametrosMi

We have filtered the dataset to only leave the rows corresponding to the top 15 analysis parameters related to water pollution, and the rest have been removed.

In [28]:
max_parametros = water['NumeroParametrosMaximo'].max()

water['cobertura_analisis'] = (
    (water['NumeroParametrosPromedio'] / max_parametros) *
    (1 - ((water['NumeroParametrosMaximo'] - water['NumeroParametrosMinimo']) / max_parametros))
)

water[['cobertura_analisis']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  water['cobertura_analisis'] = (


Unnamed: 0,cobertura_analisis
0,0.045706
13,0.045706
14,0.045706
15,0.045706
16,0.045706


##### Classification of the Water Quality Risk Index (IRCA)

In [30]:
def clasificar_irca(irca):
    try:
        # Si irca ya es numérico (float), no se intenta reemplazar comas
        if not isinstance(irca, float):
            irca = float(irca.replace(',', '.'))
        if irca == 0:
            return 'Sin información'
        elif 0.001 <= irca <= 5:
            return 'Sin riesgo'
        elif 5.001 <= irca <= 14:
            return 'Riesgo bajo'
        elif 14.001 <= irca <= 35:
            return 'Riesgo medio'
        elif 35.001 <= irca <= 80:
            return 'Riesgo alto'
        elif 80.001 <= irca <= 100:
            return 'Inviable sanitariamente'
        else:
            return 'No clasificado'
    except ValueError:
        return 'No clasificado'

water['rango_irca'] = water['IrcaPromedio'].apply(clasificar_irca)
print(water[['rango_irca','IrcaPromedio']].head())


     rango_irca  IrcaPromedio
0   Riesgo alto         37.32
13  Riesgo alto         37.32
14  Riesgo alto         37.32
15  Riesgo alto         37.32
16  Riesgo alto         37.32


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  water['rango_irca'] = water['IrcaPromedio'].apply(clasificar_irca)



The classify_irca function transforms the numerical value of the Average IRCA into descriptive categories ranging from 'No risk' to 'Sanitarily unviable', facilitating the interpretation and decision-making in water quality management. This categorization is crucial for public health and environmental analyses, as it simplifies data visualization, allows for quick comparisons between regions, and is essential for modeling and predicting water quality, resulting in more effective interventions and evidence-based policies.

##### Treatment Category

In [31]:
def categorize_treatment(row):
    if row['MuestrasTratadas'] == 0:
        return 'Sin tratamiento'
    elif row['MuestrasTratadas'] == row['MuestrasEvaluadas']:
        return 'Tratamiento completo'
    else:
        return 'Tratamiento parcial'

water['TratamientoCategoría'] = water.apply(categorize_treatment, axis=1)
water.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  water['TratamientoCategoría'] = water.apply(categorize_treatment, axis=1)


Unnamed: 0,Año,NombreDepartamento,Div_dpto,NombreMunicipio,Divi_muni,IrcaMinimo,IrcaMaximo,IrcaPromedio,NombreParametroAnalisis2,MuestrasEvaluadas,...,MuestrasSinTratar,NumeroParametrosMinimo,NumeroParametrosMaximo,NumeroParametrosPromedio,ResultadoMinimo,ResultadoMaximo,ResultadoPromedio,cobertura_analisis,rango_irca,TratamientoCategoría
0,2010,Bolívar,13,El Guamo,13248,0,100,37.32,Alcanilidad Total,67,...,0,2,7,2,23.0,23.0,23.0,0.045706,Riesgo alto,Tratamiento completo
13,2010,Bolívar,13,El Guamo,13248,0,100,37.32,Cromo total,67,...,0,2,7,2,,,,0.045706,Riesgo alto,Tratamiento completo
14,2010,Bolívar,13,El Guamo,13248,0,100,37.32,Dureza total,67,...,0,2,7,2,68.0,68.0,68.0,0.045706,Riesgo alto,Tratamiento completo
15,2010,Bolívar,13,El Guamo,13248,0,100,37.32,E.coli,67,...,0,2,7,2,0.0,1.0,13.0,0.045706,Riesgo alto,Tratamiento completo
16,2010,Bolívar,13,El Guamo,13248,0,100,37.32,Fluoruros,67,...,0,2,7,2,,,,0.045706,Riesgo alto,Tratamiento completo



The "Treatment Category" column classifies each set of water samples according to the degree of treatment they have received. This classification helps understand the management and effectiveness of treatment processes implemented in different locations. The categories are:

No treatment: Indicates that none of the evaluated samples were treated.

Partial treatment: Indicates that a portion of the evaluated samples was treated, but not all.

Complete treatment: Indicates that all evaluated samples were treated.

##### Data Cleaning and Variable Selection

In [32]:
water = water.drop(['ResultadoMinimo', 'ResultadoMaximo', 'ResultadoPromedio'], axis=1)


To obtain a deeper understanding of the methodology and justification behind the removal of the 'Minimum Result', 'Maximum Result', and 'Average Result' columns, we invite you to consult the EDA_water_quality file. This document contains our exploratory data analysis, which distills key criteria and reveals significant insights that have guided the cleaning of our dataset.

In [33]:
columnas_a_eliminar = ['MuestrasTratadas', 'MuestrasEvaluadas', 'MuestrasSinTratar',
                      'NumeroParametrosMinimo', 'NumeroParametrosMaximo']
water = water.drop(columns=columnas_a_eliminar)


water.head()

Unnamed: 0,Año,NombreDepartamento,Div_dpto,NombreMunicipio,Divi_muni,IrcaMinimo,IrcaMaximo,IrcaPromedio,NombreParametroAnalisis2,NumeroParametrosPromedio,cobertura_analisis,rango_irca,TratamientoCategoría
0,2010,Bolívar,13,El Guamo,13248,0,100,37.32,Alcanilidad Total,2,0.045706,Riesgo alto,Tratamiento completo
13,2010,Bolívar,13,El Guamo,13248,0,100,37.32,Cromo total,2,0.045706,Riesgo alto,Tratamiento completo
14,2010,Bolívar,13,El Guamo,13248,0,100,37.32,Dureza total,2,0.045706,Riesgo alto,Tratamiento completo
15,2010,Bolívar,13,El Guamo,13248,0,100,37.32,E.coli,2,0.045706,Riesgo alto,Tratamiento completo
16,2010,Bolívar,13,El Guamo,13248,0,100,37.32,Fluoruros,2,0.045706,Riesgo alto,Tratamiento completo


In [34]:
new_column_names = [
    "año", 
    "nombre_departamento", 
    "div_dpto", 
    "nombre_municipio", 
    "divi_muni", 
    "irca_minimo", 
    "irca_maximo", 
    "irca_promedio", 
    "rango_irca", 
    "nombre_parametro_analisis", 
    "numero_parametros_promedio", 
    "tratamiento_categoría",
    "cobertura_analisis"
]
water.columns = new_column_names
print(water.head())

     año nombre_departamento  div_dpto nombre_municipio  divi_muni  \
0   2010             Bolívar        13         El Guamo      13248   
13  2010             Bolívar        13         El Guamo      13248   
14  2010             Bolívar        13         El Guamo      13248   
15  2010             Bolívar        13         El Guamo      13248   
16  2010             Bolívar        13         El Guamo      13248   

   irca_minimo irca_maximo  irca_promedio         rango_irca  \
0            0         100          37.32  Alcanilidad Total   
13           0         100          37.32        Cromo total   
14           0         100          37.32       Dureza total   
15           0         100          37.32             E.coli   
16           0         100          37.32          Fluoruros   

    nombre_parametro_analisis  numero_parametros_promedio  \
0                           2                    0.045706   
13                          2                    0.045706   
14         

In [35]:
water.to_csv('water_cleaned.csv', index=False)

### Dimensional Modeling