In [10]:
import pandas as pd
import numpy as np

In [3]:
# Location of the raw nesting records, relative to the notebook's working directory.
NESTING_DATA_PATH = "data/zealandia_nesting_data.csv"

# Raw table; later cells read the "nsbsid", "nsbpid" and "nsoffspring" columns.
df = pd.read_csv(NESTING_DATA_PATH)

In [4]:
# Per-season summary, one row per "nsbsid" value: total offspring recorded and
# the number of distinct "nsbpid" values (reported as pairs) seen that season.
nesting_outcomes = (
    df.groupby("nsbsid")
    .agg(
        number_of_offspring=("nsoffspring", "sum"),
        number_of_pairs=("nsbpid", "nunique"),
    )
    .sort_index()
)

# Average offspring per distinct pair, rounded to two decimals.
offspring_ratio = nesting_outcomes["number_of_offspring"].div(
    nesting_outcomes["number_of_pairs"]
)
nesting_outcomes["offspring_per_pair"] = offspring_ratio.round(2)

# Share of pairs in each season whose summed offspring count is above zero.
# Offspring are summed per (season, pair) first, so a pair that appears on
# several rows in one season is only counted once.
offspring_by_pair = df.groupby(["nsbsid", "nsbpid"])["nsoffspring"].sum()
pair_had_offspring = offspring_by_pair > 0
nesting_outcomes["proportion_with_1+_offspring"] = (
    pair_had_offspring.groupby(level="nsbsid").mean().round(2)
)

In [5]:
def prev_year(nsbsid):
    """Return the season label that is one year earlier than ``nsbsid``.

    The label is split on "/", every part is parsed as an integer and
    decremented by one, and the parts are re-joined with "/".
    For example ``"2015/2016"`` becomes ``"2014/2015"``.

    Raises ``ValueError`` if any part is not an integer literal.
    """
    shifted_parts = (str(int(part) - 1) for part in nsbsid.split("/"))
    return "/".join(shifted_parts)

# Tag every record with the label of the season before its own. Grouping on
# this column later lines a pair's record up with the preceding season.
df["prev_nsbsid"] = df["nsbsid"].map(prev_year)

In [6]:
def _rows_per_pair(season_column, label):
    """Count rows of ``df`` per (season_column, "nsbpid") pair, named ``label``."""
    return df.groupby([season_column, "nsbpid"]).size().rename(label)


# Rows per (season, pair): marks the seasons in which each pair was recorded.
year = _rows_per_pair("nsbsid", "year")

# Same count keyed by the *previous* season label, so an entry at
# (season, pair) means the pair has a record in the season after it.
subsequent_year = _rows_per_pair("prev_nsbsid", "subsequent_year")

In [7]:
# Boolean table indexed by (season, pair): "year" is True when the pair was
# recorded in that season, "subsequent_year" when it was recorded in the next.
# Only rows where the pair was recorded in the season itself are kept.
present_in_year = (
    pd.concat([year, subsequent_year], axis=1)
    .notna()
    .loc[lambda presence: presence["year"]]
)

In [11]:
# Fraction of each season's pairs that were recorded again in the following
# season, rounded to two decimals.
season_of_row = present_in_year.index.get_level_values(0)
return_rate = (
    present_in_year.groupby(season_of_row)["subsequent_year"]
    .mean()
    .round(2)
)

# A season can only have a meaningful return rate if the data contains records
# for the season after it. The first index level of `subsequent_year` holds
# exactly those seasons. Blank out only the others (e.g. the latest season),
# instead of replacing every 0 with NaN, which would also hide a genuine 0%
# return rate in a season that does have follow-up data.
seasons_with_followup = subsequent_year.index.get_level_values(0).unique()
nesting_outcomes["proportion_returning_next_year"] = return_rate.where(
    return_rate.index.isin(seasons_with_followup)
)

In [12]:
# Show the finished per-season summary table (one row per "nsbsid" value).
nesting_outcomes

Unnamed: 0_level_0,number_of_offspring,number_of_pairs,offspring_per_pair,proportion_with_1+_offspring,proportion_returning_next_year
nsbsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014/2015,44,30,1.47,0.8,0.57
2015/2016,43,33,1.3,0.76,0.45
2016/2017,53,37,1.43,0.78,0.51
2017/2018,55,34,1.62,0.94,0.32
2018/2019,58,32,1.81,0.84,0.5
2019/2020,42,31,1.35,0.77,0.45
2020/2021,56,31,1.81,0.77,0.39
2021/2022,77,40,1.92,0.82,0.45
2022/2023,68,41,1.66,0.78,
