In [11]:
%%HTML
<style>
    body {
        --vscode-font-family: "Verdana"
    }
</style>

This notebook contains the exploratory analysis used to develop the preprocessing pipeline for the raw data. Comments and descriptions were originally written in Spanish. More detailed, official documentation of the preprocessing pipeline can be found in the file *'processing_tools.py'*.

In [12]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
import matplotlib.pyplot as plt # type: ignore
import os # type: ignore

In [13]:
# Load the raw query export into a DataFrame.
# NOTE(review): absolute, machine-specific Windows path; this cell only runs where that file exists.
path = r'C:\Users\marce\Escritorio\Proyectos\projections_pys\data\query_adelia.csv'
data = pd.read_csv(path)
data.head(5) # All records (more than one species and site)

Unnamed: 0,site_name,site_id,cammlr_region,longitude_epsg_4326,latitude_epsg_4326,common_name,day,month,year,season_starting,penguin_count,accuracy,count_type,vantage,reference
0,Acuna Island,ACUN,48.2,-44.637,-60.761,adelie penguin,,,1993,1993,2008.0,1.0,nests,ground,"</sub>author [1] ""N&eacute;stor R. Coria"" ..."
1,Acuna Island,ACUN,48.2,-44.637,-60.761,adelie penguin,,,1994,1994,1920.0,1.0,nests,,"</sub>author [1] ""Eric J. Woehler"" ""J. P. Crox..."
2,Acuna Island,ACUN,48.2,-44.637,-60.761,adelie penguin,,,2004,2004,1880.0,1.0,nests,ground,"</sub>author [1] ""N&eacute;stor R. Coria"" ..."
3,Acuna Island,ACUN,48.2,-44.637,-60.761,adelie penguin,25.0,2.0,2011,2010,3079.0,5.0,nests,vhr,"</sub>author [1] ""Heather J. Lynch"" ""Michelle..."
4,Acuna Island,ACUN,48.2,-44.637,-60.761,chinstrap penguin,28.0,12.0,1983,1983,4000.0,4.0,nests,ground,"</sub>author [1] ""Sally Poncet"" ""J&eacute;r&o..."


In [14]:
# Short identifiers for the species names stored in the 'common_name' column.
name_map = {
    'adelie penguin': 'adelie',
    'chinstrap penguin': 'chinstrap',
    'macaroni penguin': 'macaroni',
    'gentoo penguin': 'gentoo',
    'emperor penguin': 'emperor',
    'king penguin': 'king'
}

# Add the short identifier as a new 'name_id' column (this mutates `data` in place).
# Series.map yields NaN for any common_name that is not a key of name_map.
data['name_id'] = data['common_name'].map(name_map)
data.head(5)

Unnamed: 0,site_name,site_id,cammlr_region,longitude_epsg_4326,latitude_epsg_4326,common_name,day,month,year,season_starting,penguin_count,accuracy,count_type,vantage,reference,name_id
0,Acuna Island,ACUN,48.2,-44.637,-60.761,adelie penguin,,,1993,1993,2008.0,1.0,nests,ground,"</sub>author [1] ""N&eacute;stor R. Coria"" ...",adelie
1,Acuna Island,ACUN,48.2,-44.637,-60.761,adelie penguin,,,1994,1994,1920.0,1.0,nests,,"</sub>author [1] ""Eric J. Woehler"" ""J. P. Crox...",adelie
2,Acuna Island,ACUN,48.2,-44.637,-60.761,adelie penguin,,,2004,2004,1880.0,1.0,nests,ground,"</sub>author [1] ""N&eacute;stor R. Coria"" ...",adelie
3,Acuna Island,ACUN,48.2,-44.637,-60.761,adelie penguin,25.0,2.0,2011,2010,3079.0,5.0,nests,vhr,"</sub>author [1] ""Heather J. Lynch"" ""Michelle...",adelie
4,Acuna Island,ACUN,48.2,-44.637,-60.761,chinstrap penguin,28.0,12.0,1983,1983,4000.0,4.0,nests,ground,"</sub>author [1] ""Sally Poncet"" ""J&eacute;r&o...",chinstrap


In [15]:
# Keep only the rows whose common_name is 'adelie penguin'.
data_adelia = data[(data['common_name'] == 'adelie penguin')]
data_adelia['site_id'].unique()[0:10] # First 10 sites (not the cell's last expression, so it is evaluated but not displayed)
data_adelia.head(5)

Unnamed: 0,site_name,site_id,cammlr_region,longitude_epsg_4326,latitude_epsg_4326,common_name,day,month,year,season_starting,penguin_count,accuracy,count_type,vantage,reference,name_id
0,Acuna Island,ACUN,48.2,-44.637,-60.761,adelie penguin,,,1993,1993,2008.0,1.0,nests,ground,"</sub>author [1] ""N&eacute;stor R. Coria"" ...",adelie
1,Acuna Island,ACUN,48.2,-44.637,-60.761,adelie penguin,,,1994,1994,1920.0,1.0,nests,,"</sub>author [1] ""Eric J. Woehler"" ""J. P. Crox...",adelie
2,Acuna Island,ACUN,48.2,-44.637,-60.761,adelie penguin,,,2004,2004,1880.0,1.0,nests,ground,"</sub>author [1] ""N&eacute;stor R. Coria"" ...",adelie
3,Acuna Island,ACUN,48.2,-44.637,-60.761,adelie penguin,25.0,2.0,2011,2010,3079.0,5.0,nests,vhr,"</sub>author [1] ""Heather J. Lynch"" ""Michelle...",adelie
7,Adams Island,ADAM,58.4.1,92.549,-66.546,adelie penguin,12.0,11.0,2009,2009,76.0,5.0,nests,vhr,"</sub>author [1] ""Heather J. Lynch"" ""Michelle...",adelie


In [16]:
def filter_data(data, min_span=10):
    """
    Keep the sites whose counted years span at least ``min_span`` years and
    average repeated counts.

    Processing steps:

    1. Rows with a missing ``penguin_count`` are discarded.
    2. For every ``site_id`` the span ``max(year) - min(year)`` of the remaining
       rows is computed; only sites with a span >= ``min_span`` are kept.
    3. The result is reduced to the columns 'site_name', 'site_id', 'year',
       'penguin_count', 'count_type' and 'name_id', and ``penguin_count`` is
       averaged over rows sharing the same
       (site_name, site_id, year, count_type, name_id).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the six columns listed above.
    min_span : int, default 10
        Minimum difference between the last and the first counted year of a site.

    Returns
    -------
    pandas.DataFrame
        One row per (site_name, site_id, year, count_type, name_id) with the
        mean ``penguin_count``. The input frame is not modified.
    """
    # Only rows with an actual count take part in the span computation.
    counted = data.dropna(subset=['penguin_count'])

    # One pass over the data instead of re-filtering the frame once per site.
    # groupby skips missing site_id keys, so such rows can never be selected.
    years_by_site = counted.groupby('site_id')['year']
    span = years_by_site.max() - years_by_site.min()
    kept_sites = span[span >= min_span].index

    data_filtered = counted[counted['site_id'].isin(kept_sites)]
    data_filtered = data_filtered[['site_name', 'site_id', 'year', 'penguin_count', 'count_type', 'name_id']]

    # Several counts for the same site/year/type/species collapse to their mean.
    # groupby's default (dropna=True) drops rows with a missing value in any grouping column.
    data_filtered = data_filtered.groupby(
        ['site_name', 'site_id', 'year', 'count_type', 'name_id'], as_index=False
    )['penguin_count'].mean()

    return data_filtered

# Apply the site filter to the Adélie subset and preview the first 10 rows.
data_filtered = filter_data(data_adelia)
data_filtered.head(10)

Unnamed: 0,site_name,site_id,year,count_type,name_id,penguin_count
0,Acuna Island,ACUN,1993,nests,adelie,2008.0
1,Acuna Island,ACUN,1994,nests,adelie,1920.0
2,Acuna Island,ACUN,2004,nests,adelie,1880.0
3,Acuna Island,ACUN,2011,nests,adelie,3079.0
4,Akarui Point,AKAR,1975,adults,adelie,2.0
5,Akarui Point,AKAR,1975,chicks,adelie,56.0
6,Akarui Point,AKAR,1981,adults,adelie,211.0
7,Akarui Point,AKAR,1984,adults,adelie,187.0
8,Akarui Point,AKAR,2011,nests,adelie,106.0
9,Amphibolite Point,AMPH,1983,nests,adelie,5000.0


In [17]:
def extract_valid_chunks(years, min_span = 10, max_gap = 2):

    """
    Find the runs of years that satisfy both conditions:

    1. The run covers at least ``min_span`` years (last year - first year >= min_span).
    2. Consecutive years inside the run differ by at most ``max_gap``.

    Parameters
    ----------
    years : iterable of numbers or None
        Years with data. Duplicates and ordering do not matter. Any iterable is
        accepted (list, tuple, generator, array-like); ``None`` or an empty
        iterable yields an empty result.
    min_span : number, default 10
        Minimum difference between the last and first year of a run.
    max_gap : number, default 2
        Largest allowed difference between two consecutive years of a run.

    Returns
    -------
    list of tuple
        ``(first_year, last_year)`` for every valid run, in ascending order.
    """

    if years is None:
        return []

    # Normalise first, then test for emptiness: truth-testing the raw argument
    # fails for array-likes and is always True for iterators/generators.
    years = sorted(set(years))
    if not years:
        return []

    chunks = []
    # Only the first and the most recent year of the current run are needed.
    start = previous = years[0]

    for year in years[1:]:
        if year - previous <= max_gap:
            previous = year
            continue

        # Gap too large: store the finished run if it is long enough, then start a new one.
        if previous - start >= min_span:
            chunks.append((start, previous))
        start = previous = year

    # The last run is never closed inside the loop.
    if previous - start >= min_span:
        chunks.append((start, previous))

    return chunks


def select_good_chunks(data, min_span = 10, max_gap = 2, types = ('nests', 'adults', 'chicks')):

    """
    Split a table of counts into sub-tables, one per valid run of years.

    For every count type in ``types`` and every ``site_id`` present for that
    type, the site's years are passed to ``extract_valid_chunks``; each
    returned ``(start, end)`` range yields one DataFrame holding that site's
    rows of that count type with ``start <= year <= end``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'count_type', 'site_id' and 'year'.
    min_span : number, default 10
        Forwarded to ``extract_valid_chunks``.
    max_gap : number, default 2
        Forwarded to ``extract_valid_chunks``.
    types : iterable of str, default ('nests', 'adults', 'chicks')
        Count types to process, in this order. Rows with any other
        ``count_type`` are ignored.

    Returns
    -------
    list of pandas.DataFrame
        Ordered by count type (as given in ``types``), then by first appearance
        of the site, then by ascending start year of the run.
    """

    dataframes = []

    for count_type in types:
        data_count = data[data['count_type'] == count_type]

        # unique() keeps first-appearance order, which fixes the order of the output list.
        for site in data_count['site_id'].unique():
            data_count_site = data_count[data_count['site_id'] == site]
            good_chunks = extract_valid_chunks(
                list(data_count_site['year']), min_span = min_span, max_gap = max_gap
            )

            for start, end in good_chunks:
                # between() is inclusive on both ends by default.
                in_chunk = data_count_site['year'].between(start, end)
                dataframes.append(data_count_site[in_chunk])

    return dataframes

In [18]:
# Split the filtered Adélie data into per-site, per-count-type runs of years and count them.
subsets_adelia = select_good_chunks(data_filtered)
len(subsets_adelia)

29

In [19]:
def interpolate_two_year_gaps(data):

    """
    Fill every single-missing-year gap of a subset by linear interpolation.

    After sorting by 'year', whenever two consecutive rows are exactly two
    years apart, one row is inserted for the year in between. The new row
    copies every column from the row before the gap, except:

    - 'year' is the earlier year + 1;
    - 'penguin_count' is the mean of the counts on both sides of the gap.

    Larger gaps are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'year' and 'penguin_count'.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by 'year' with a fresh 0..n-1 index. The input
        frame is not modified.
    """

    data = data.sort_values('year').reset_index(drop=True)

    # Align every row with the row that follows it; the last row gets NaN and never matches.
    next_year = data['year'].shift(-1)
    next_count = data['penguin_count'].shift(-1)
    before_gap = (next_year - data['year']) == 2

    # Nothing to fill: return the sorted copy instead of concatenating an empty frame.
    if not before_gap.any():
        return data

    # Build all interpolated rows at once from the rows that precede a gap.
    filled = data.loc[before_gap].copy()
    filled['year'] = filled['year'] + 1
    filled['penguin_count'] = (filled['penguin_count'] + next_count[before_gap]) / 2

    data_interpolated = pd.concat([data, filled], ignore_index=True)
    data_interpolated = data_interpolated.sort_values('year').reset_index(drop=True)

    return data_interpolated


def interpolate_all_subsets(subsets):

    """
    Apply ``interpolate_two_year_gaps`` to every DataFrame in ``subsets``.

    Returns a new list with the interpolated frames, in the same order as the input.
    """

    return [interpolate_two_year_gaps(subset) for subset in subsets]

In [20]:
# Interpolate the gaps of every Adélie subset and display the first one.
subsets_adelia_interp = interpolate_all_subsets(subsets_adelia)
subsets_adelia_interp[0]

Unnamed: 0,site_name,site_id,year,count_type,name_id,penguin_count
0,Ardley Island,ARDL,1993,nests,adelie,1205.0
1,Ardley Island,ARDL,1994,nests,adelie,1095.0
2,Ardley Island,ARDL,1995,nests,adelie,1226.0
3,Ardley Island,ARDL,1996,nests,adelie,923.0
4,Ardley Island,ARDL,1997,nests,adelie,1173.0
5,Ardley Island,ARDL,1998,nests,adelie,1192.0
6,Ardley Island,ARDL,1999,nests,adelie,974.0
7,Ardley Island,ARDL,2000,nests,adelie,880.0
8,Ardley Island,ARDL,2001,nests,adelie,780.0
9,Ardley Island,ARDL,2002,nests,adelie,771.0
