# Semana 2, tarea 11 

In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

sns.set_theme(style="whitegrid", context="talk")

In [3]:
# Raw-data CSV; the relative path is resolved against the working directory.
RAW_DATA_CSV = "Terminos_lagoon_TA_DIC_2023_RawData.csv"
df = pd.read_csv(RAW_DATA_CSV)

# Accepted column names for each response variable; the first one that is
# present in the data is the one used.
CANDIDATES_DIC = ["dic_micromol_kg"]
CANDIDATES_TEMP = ["temp_c"]

In [4]:
def first_existing(cols, candidates):
    """Return the first entry of ``candidates`` that is contained in ``cols``.

    Candidates are tried in the order given. Raises ``ValueError`` when none
    of them is found.
    """
    _absent = object()  # sentinel, so that any candidate value can be a match
    match = next((name for name in candidates if name in cols), _absent)
    if match is _absent:
        raise ValueError(f"No encontré ninguna de estas columnas: {candidates}")
    return match

# Resolve which column holds each response variable.
DIC_COL = first_existing(cols=df.columns, candidates=CANDIDATES_DIC)
TEMP_COL = first_existing(cols=df.columns, candidates=CANDIDATES_TEMP)

# --- Grouping factors of the two-way design ---
FACTOR1, FACTOR2 = "estuary", "area"

# Normalize the factor labels: cast to str and trim surrounding whitespace so
# that labels differing only by stray spaces fall into the same group.
# (This does not produce a pandas "category" dtype; the columns hold strings.)
for _factor_col in (FACTOR1, FACTOR2):
    df[_factor_col] = df[_factor_col].astype(str).str.strip()
del _factor_col

In [5]:
def run_two_way_anova(data, response, factor1, factor2):
    """
    Fit a two-way ANOVA with interaction via statsmodels and print its table.

    An OLS model ``response ~ C(factor1) + C(factor2) + C(factor1):C(factor2)``
    is fitted on ``data`` and summarized with ``anova_lm(..., typ=2)``.

    Returns the fitted model and the ANOVA table as a ``(model, table)`` tuple.
    """
    print(f"\n=== ANOVA de dos vías para {response} ===")
    fitted = ols(
        f"{response} ~ C({factor1}) + C({factor2}) + C({factor1}):C({factor2})",
        data=data,
    ).fit()
    table = sm.stats.anova_lm(fitted, typ=2)
    print(table)
    return fitted, table

In [7]:
def run_tukey(data, response, factor, alpha=0.05):
    """
    Run a Tukey HSD test of ``response`` across the levels of one factor.

    Rows where ``response`` or ``factor`` is missing are dropped before the
    test. ``pairwise_tukeyhsd`` does not discard missing values on its own, so
    a single NaN in the response would turn the affected group means into NaN.
    The formula interface used for the ANOVA drops incomplete rows by default;
    dropping them here as well keeps both analyses on comparable rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both columns.
    response : str
        Name of the numeric response column.
    factor : str
        Name of the grouping column.
    alpha : float, default 0.05
        Family-wise error rate passed to ``pairwise_tukeyhsd``.

    Returns
    -------
    The results object returned by ``pairwise_tukeyhsd``.
    """
    print(f"\n>>> Tukey HSD para {response} según {factor}")

    # Keep only rows where both the response and the group label are present.
    complete = data[[response, factor]].dropna()
    n_dropped = len(data) - len(complete)
    if n_dropped:
        # Report the exclusion so the sample size behind the test is visible.
        print(f"(Se omitieron {n_dropped} filas con valores faltantes)")

    tukey = pairwise_tukeyhsd(
        endog=complete[response], groups=complete[factor], alpha=alpha
    )
    print(tukey.summary())
    return tukey

In [14]:
if __name__ == "__main__":
    # Dissolved inorganic carbon: two-way ANOVA first, then a post-hoc Tukey
    # HSD on each factor taken separately.
    model_dic, anova_dic = run_two_way_anova(
        data=df, response=DIC_COL, factor1=FACTOR1, factor2=FACTOR2
    )
    run_tukey(data=df, response=DIC_COL, factor=FACTOR1)
    run_tukey(data=df, response=DIC_COL, factor=FACTOR2)

    # Temperature: same sequence of ANOVA followed by per-factor Tukey HSD.
    model_temp, anova_temp = run_two_way_anova(
        data=df, response=TEMP_COL, factor1=FACTOR1, factor2=FACTOR2
    )
    run_tukey(data=df, response=TEMP_COL, factor=FACTOR1)
    run_tukey(data=df, response=TEMP_COL, factor=FACTOR2)


=== ANOVA de dos vías para dic_micromol_kg ===
                          sum_sq     df          F        PR(>F)
C(estuary)          1.996309e+04    1.0   0.164162  6.862183e-01
C(area)             1.328999e+07    2.0  54.643781  9.179713e-17
C(estuary):C(area)  7.537039e+05    2.0   3.098966  4.945351e-02
Residual            1.216057e+07  100.0        NaN           NaN

>>> Tukey HSD para dic_micromol_kg según estuary
     Multiple Comparison of Means - Tukey HSD, FWER=0.05     
  group1    group2  meandiff p-adj    lower    upper   reject
-------------------------------------------------------------
Candelaria Palizada  33.7863 0.7297 -159.6129 227.1855  False
-------------------------------------------------------------

>>> Tukey HSD para dic_micromol_kg según area
  Multiple Comparison of Means - Tukey HSD, FWER=0.05  
group1 group2 meandiff p-adj   lower     upper   reject
-------------------------------------------------------
 Coast  Plume 167.1587 0.1205 -32.8827  367.2002  Fa

# Interpretation
## ANOVA

The factors were estuary and area.
A two-way ANOVA was performed for DIC and temperature, followed by a Tukey post-hoc test.

### Dissolved Inorganic Carbon (DIC)

- The estuary factor, with p = 0.686, did not have a significant effect on DIC.

- The area factor, with p < 0.001 (p ≈ 9.2 × 10⁻¹⁷), had a highly significant effect on DIC.

- The interaction between the two factors, with p = 0.049, was significant at α = 0.05, but only marginally so, suggesting that the relationship between DIC and area may depend on the estuary.

### Temperature

- The estuary factor, with p = 0.0577, did not have a significant effect on temperature at α = 0.05, although the p-value is close to the threshold.

- The area factor, with p = 0.00122, had a highly significant effect on temperature.

- The interaction between the two factors, with p = 0.01512, was significant, indicating that the effect of area on temperature varies depending on the estuary.

## Tukey
### DIC

- Candelaria vs Palizada: mean difference ≈ 33.8 µmol/kg, p = 0.73, not significant. This agrees with the ANOVA results.

- Coast vs Plume: mean difference ≈ 167 µmol/kg, p = 0.12, not significant.

- Coast vs River: mean difference ≈ 821 µmol/kg, p < 0.001, significant.

- Plume vs River: mean difference ≈ 654 µmol/kg, p < 0.001, significant.

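- DIC in the river is significantly higher than in the coast or plume. Coast and plume are statistically similar. Because the estuary × area interaction was significant, these comparisons pool both estuaries, and the size of the area differences may vary between them.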

### Temperature

- Candelaria vs Palizada: mean difference = -0.3035 °C, adjusted p = 0.0887, not significant.

- Coast vs Plume: mean difference ≈ 0.5642 °C, p = 0.0206, significant.

- Coast vs River: mean difference ≈ 0.7031 °C, p = 0.0028, significant.

- Plume vs River: mean difference ≈ 0.1389 °C, p = 0.7839, not significant.
