In [2]:
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

In [3]:
# Load the semicolon-separated biological results export and parse the
# sampling start date column as datetime while reading.
analyses_I2M2 = pd.read_csv(
    "./naiades_export_I2M2/ResultatsBiologiques.CSV",
    sep=";",
    parse_dates=["DateDebutOperationPrelBio"],
)

In [4]:
# Remove the cap on displayed columns so wide frames render in full,
# then list the column labels of the loaded export.
pd.options.display.max_columns = None
analyses_I2M2.columns

Index(['CdStationMesureEauxSurface', 'LbStationMesureEauxSurface',
       'CdPointEauxSurf', 'DateDebutOperationPrelBio', 'CdSupport',
       'LbSupport', 'DtProdResultatBiologique', 'HeureResultat',
       'CdParametreResultatBiologique', 'LbLongParametre',
       'ResIndiceResultatBiologique', 'CdUniteMesure', 'SymUniteMesure',
       'CdRqIndiceResultatBiologique', 'MnemoRqAna', 'CdMethEval',
       'RefOperationPrelBio', 'CdProducteur', 'NomProducteur',
       'CdAccredRsIndiceResultatBiologique',
       'MnAccredRsIndiceResultatBiologique'],
      dtype='object')

In [5]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes.
analyses_I2M2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328 entries, 0 to 327
Data columns (total 21 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   CdStationMesureEauxSurface          328 non-null    int64         
 1   LbStationMesureEauxSurface          328 non-null    object        
 2   CdPointEauxSurf                     328 non-null    int64         
 3   DateDebutOperationPrelBio           328 non-null    datetime64[ns]
 4   CdSupport                           328 non-null    int64         
 5   LbSupport                           328 non-null    object        
 6   DtProdResultatBiologique            0 non-null      float64       
 7   HeureResultat                       0 non-null      float64       
 8   CdParametreResultatBiologique       328 non-null    int64         
 9   LbLongParametre                     328 non-null    object        
 10  ResIndiceResultatBiologiqu

In [6]:
# Show every result row recorded for station code 2001000.
analyses_I2M2.query("CdStationMesureEauxSurface == 2001000")

Unnamed: 0,CdStationMesureEauxSurface,LbStationMesureEauxSurface,CdPointEauxSurf,DateDebutOperationPrelBio,CdSupport,LbSupport,DtProdResultatBiologique,HeureResultat,CdParametreResultatBiologique,LbLongParametre,ResIndiceResultatBiologique,CdUniteMesure,SymUniteMesure,CdRqIndiceResultatBiologique,MnemoRqAna,CdMethEval,RefOperationPrelBio,CdProducteur,NomProducteur,CdAccredRsIndiceResultatBiologique,MnAccredRsIndiceResultatBiologique
0,2001000,L'AUGRABEN À BARTENHEIM,2,2020-07-07,13,Macroinvertébrés aquatiques,,,7613,Indice Invertébrés Multimétrique (I2M2),0.3387,X,X,1,Résultat > seuil de quantification et < au seu...,,7082963,13001025900021,DIRECTION REGIONALE DE L'ENVIRONNEMENT DE L'AM...,1,Analyse réalisée sous accréditation
1,2001000,L'AUGRABEN À BARTENHEIM,2,2021-08-25,13,Macroinvertébrés aquatiques,,,7613,Indice Invertébrés Multimétrique (I2M2),0.2082,X,X,1,Résultat > seuil de quantification et < au seu...,,7089758,13001025900021,DIRECTION REGIONALE DE L'ENVIRONNEMENT DE L'AM...,1,Analyse réalisée sous accréditation


In [7]:
# Rank stations by how many I2M2 results each one has, largest count first.
(
    analyses_I2M2
    .groupby('CdStationMesureEauxSurface')
    .size()
    .sort_values(ascending=False)
)

CdStationMesureEauxSurface
6408800    4
6000990    4
2045350    2
2045283    2
2045200    2
          ..
6001312    1
6001313    1
6001314    1
6001316    1
2001006    1
Length: 203, dtype: int64

In [8]:
# Average number of I2M2 results per station.
(
    analyses_I2M2
    .groupby('CdStationMesureEauxSurface')
    .size()
    .mean()
)

1.6157635467980296

In [9]:
# Exclude the results sampled in 2022 or 2023 in a single pass.
# The date column is already datetime64 (parse_dates at load time), so
# `.dt.year` applies directly. Rows with a missing date are kept, because
# a missing year is never a member of the excluded set.
analyses_I2M2 = analyses_I2M2[
    ~analyses_I2M2['DateDebutOperationPrelBio'].dt.year.isin([2022, 2023])
]

In [10]:
# Average number of I2M2 results per station once the recent dates are removed.
(
    analyses_I2M2
    .groupby('CdStationMesureEauxSurface')
    .size()
    .mean()
)

1.6059113300492611

In [11]:
# Keep only the station code, the result value and the sampling date for the
# rows sampled in 2021. Row and column selection happen in one `.loc` call.
# The date column is already datetime64 (parse_dates at load time), so
# `.dt.year` can be used without converting it again.
new_df = analyses_I2M2.loc[
    analyses_I2M2['DateDebutOperationPrelBio'].dt.year == 2021,
    ['CdStationMesureEauxSurface', 'ResIndiceResultatBiologique', 'DateDebutOperationPrelBio'],
]
print(new_df)


     CdStationMesureEauxSurface  ResIndiceResultatBiologique  \
1                       2001000                       0.2082   
2                       2001006                       0.4760   
5                       2001025                       0.1127   
6                       2001030                       0.0454   
9                       2001500                       0.3451   
..                          ...                          ...   
318                     6001313                       0.3120   
319                     6001314                       0.5280   
320                     6001316                       0.5930   
325                     6408800                       0.8900   
327                     6455540                       0.3150   

    DateDebutOperationPrelBio  
1                  2021-08-25  
2                  2021-05-26  
5                  2021-08-06  
6                  2021-08-06  
9                  2021-08-05  
..                        ...  
318    

In [12]:
# Distinct station codes still present in the 2021 subset.
# Note: this returns the array of codes, not how many there are.
new_df['CdStationMesureEauxSurface'].unique()

array([2001000, 2001006, 2001025, 2001030, 2001500, 2001738, 2001750,
       2002000, 2002800, 2003100, 2003200, 2003350, 2003400, 2003670,
       2003800, 2004000, 2004300, 2005700, 2007000, 2009000, 2009085,
       2010000, 2011000, 2013000, 2015500, 2016050, 2017000, 2017850,
       2018000, 2018500, 2018780, 2019000, 2020000, 2021000, 2022675,
       2023000, 2024000, 2025100, 2025200, 2025700, 2026200, 2026500,
       2028000, 2028300, 2028500, 2029000, 2029160, 2030200, 2030500,
       2031400, 2031650, 2032000, 2035000, 2036000, 2037400, 2037500,
       2041000, 2041230, 2041650, 2041750, 2041950, 2042000, 2042050,
       2043017, 2043500, 2043600, 2043655, 2043750, 2044100, 2044400,
       2045000, 2045050, 2045150, 2045200, 2045283, 2045350, 2045500,
       2046000, 2046550, 2047000, 2047500, 2047750, 2048960, 2048980,
       2049000, 2049500, 2049900, 2050000, 2051500, 2051600, 2051820,
       2052000, 2052500, 2054900, 2055100, 2057480, 2061500, 2061970,
       2063000, 2064