
# 1. Data reading

Reading the "Pigments" dataset, which was provided in an email.

First load the libraries / modules.

In [0]:
# Load the needed python libraries by executing this python code (press ctrl enter)
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import pandas as pd

Load the dataset into a dataframe.

In [0]:
import pandas as pd

# Location of the pigments CSV file (raw file served from GitHub).
url = "https://raw.githubusercontent.com/Tao-Pi/CAS-Applied-Data-Science-DAVID/refs/heads/main/data%2C%20calculations%2C%20results/Pigments_Version202509051820.csv"

# Read the CSV into a dataframe, skipping blank lines in the file.
df = pd.read_csv(
    url,
    skip_blank_lines=True,
)

# Check the result: first rows, then the (rows, columns) shape.
print(df.head())
print(f"\nShape: {df.shape}")


Browse through all rows.

In [0]:
# Raise pandas' display limit to 200 rows before showing the dataframe.
pd.options.display.max_rows = 200
df

Change the column names to more descriptive names.

In [0]:
# Raw CSV header -> descriptive column label.
# 'locality' maps to itself, so that column keeps its name.
column_labels = {
    'name': 'sample',
    'RABD670': 'green pigments: index',
    'TChl-a': 'green pigments: direct concentration measurement (ug/g)',
    'locality': 'locality',
    'OC': 'organic carbon content in %',
}

# Rename in place so every later cell sees the new labels on the same df object.
df.rename(columns=column_labels, inplace=True)
df

Print some descriptive statistics, using pandas summary.

In [0]:
# Show the summary table produced by pandas' DataFrame.describe().
df.describe()

# 2. Descriptive statistics

In [0]:
# === Descriptive statistics & plots for:
#     - "green pigments: index"
#     - "green pigments: direct concentration measurement (ug/g)"
# Expects the dataframe `df` to carry these (renamed) column labels.

# Bind the short name `stats` that the statistics code below calls.
from scipy import stats

# The two measures under study, referred to as x and y from here on.
col_x, col_y = (
    "green pigments: index",
    "green pigments: direct concentration measurement (ug/g)",
)

# Pairwise-complete data: keep only rows where both measures are present.
# .copy() gives an independent dataframe rather than a slice of df.
df_pair = df[df[col_x].notna() & df[col_y].notna()].copy()

# ---------- Descriptive statistics ----------
def describe_series(s: pd.Series) -> pd.Series:
    """Summarise one column as a labelled series of descriptive statistics.

    Missing values are dropped before any statistic is computed; only the
    ``missing`` and ``missing_%`` entries refer to the full input length.

    Args:
        s: The column to summarise.

    Returns:
        A series indexed by statistic name, in this order: count, missing,
        missing_%, mean, std (ddof=1), cv (std/mean), min, q1, median, q3,
        iqr, max, skewness, kurtosis (excess), shapiro_W, shapiro_p.
        The Shapiro-Wilk entries are NaN unless 3 to 5000 values remain.
    """
    nan = float("nan")
    values = s.dropna()
    n_total = s.shape[0]
    n_valid = values.shape[0]

    mean = values.mean()
    std = values.std(ddof=1)
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)

    summary = {
        "count": n_valid,
        "missing": n_total - n_valid,
        # An empty input has no meaningful missing share.
        "missing_%": 100 * (1 - n_valid / n_total) if n_total else nan,
        "mean": mean,
        "std": std,
        # A zero mean would make the coefficient of variation undefined.
        "cv (std/mean)": nan if mean in (0, None) else std / mean,
        "min": values.min(),
        "q1": lower_quartile,
        "median": values.median(),
        "q3": upper_quartile,
        "iqr": upper_quartile - lower_quartile,
        "max": values.max(),
        "skewness": stats.skew(values, bias=False),
        "kurtosis (excess)": stats.kurtosis(values, fisher=True, bias=False),
    }

    # Shapiro-Wilk is only run for sample sizes between 3 and 5000.
    if len(values) < 3 or len(values) > 5000:
        summary["shapiro_W"] = nan
        summary["shapiro_p"] = nan
    else:
        w_statistic, p_value = stats.shapiro(values)
        summary["shapiro_W"] = w_statistic
        summary["shapiro_p"] = p_value

    return pd.Series(summary, dtype="float64")

# Side-by-side summary: one column per measure, one row per statistic.
# Each measure is summarised on its own non-missing values (full df, not df_pair).
desc_table = pd.DataFrame(
    {measure: describe_series(df[measure]) for measure in (col_x, col_y)}
)
print("\n=== Descriptive statistics ===")
print(desc_table.round(4))

# ---------- Correlation tests ----------
# Run on df_pair, i.e. only on rows where both measures are present.
pearson_r, pearson_p = stats.pearsonr(df_pair[col_x], df_pair[col_y])
spearman_r, spearman_p = stats.spearmanr(df_pair[col_x], df_pair[col_y])
kendall_tau, kendall_p = stats.kendalltau(df_pair[col_x], df_pair[col_y])

# One row per test: label, coefficient, p-value.
corr_tbl = pd.DataFrame(
    [
        ("Pearson r", pearson_r, pearson_p),
        ("Spearman ρ", spearman_r, spearman_p),
        ("Kendall τ", kendall_tau, kendall_p),
    ],
    columns=["statistic", "value", "p_value"],
)
print("\n=== Correlation tests (pairwise complete) ===")
print(corr_tbl.round(6))




In [0]:
# === Plot 01: Histogram and box plot for the measures of pigments
#     (Index and Direct concentration measurement of green pigments (µg/g)) in different samples

# The figure size is given in centimetres and converted to the inches matplotlib expects.
cm_to_inch = 1 / 2.54
fig_width = 24.2 * cm_to_inch
fig_height = 10.08 * cm_to_inch

# 2 x 2 grid: one grid column per measure, histogram on top, box plot below.
fig, axes = plt.subplots(2, 2, figsize=(fig_width, fig_height))
fig.suptitle(
    "Plot 01: Histogram and box plot for the measures of pigments\n"
    "(Index and Direct concentration measurement of green pigments (µg/g)) in different samples",
    fontsize=11,
    weight="bold"
)

col_x = "green pigments: index"
col_y = "green pigments: direct concentration measurement (ug/g)"


def _draw_distribution(hist_ax, box_ax, column, label):
    """Draw a 20-bin histogram and a vertical box plot of df[column], missing values dropped."""
    observed = df[column].dropna()

    hist_ax.hist(observed, bins=20)
    hist_ax.set_title(f"Histogram: {label}", fontsize=9)
    hist_ax.set_xlabel(column, fontsize=8)
    hist_ax.set_ylabel("Frequency", fontsize=8)

    box_ax.boxplot(observed, vert=True)
    box_ax.set_title(f"Boxplot: {label}", fontsize=9)
    box_ax.set_ylabel(column, fontsize=8)


# Left grid column: pigment index; right grid column: direct concentration.
_draw_distribution(axes[0, 0], axes[1, 0], col_x, "Green pigments index")
_draw_distribution(axes[0, 1], axes[1, 1], col_y, "Direct chlorophyll concentration (µg/g)")

# Confine the subplots to the band between 5% and 95% of the figure height
# (presumably to keep them clear of the suptitle).
plt.tight_layout(rect=[0, 0.05, 1, 0.95])
plt.show()


# Hypotheses

**H₀ (Null Hypothesis)**: The green pigment index and the direct chlorophyll concentration are not linearly correlated (population Pearson correlation ρ = 0).

**H₁ (Alternative Hypothesis)**: The green pigment index and the direct chlorophyll concentration are linearly correlated (population Pearson correlation ρ ≠ 0).

In [0]:
import scipy.stats as stats

# Drop rows with a missing value in either measure before testing.
df_clean = df.dropna(
    subset=[
        "green pigments: index",
        "green pigments: direct concentration measurement (ug/g)",
    ]
)

# x = pigment index, y = direct concentration measurement.
x = df_clean.loc[:, "green pigments: index"]
y = df_clean.loc[:, "green pigments: direct concentration measurement (ug/g)"]

# Hypothesis test: Pearson correlation between x and y.
corr, p_value = stats.pearsonr(x, y)

print("Pearson correlation coefficient:", corr)
print("p-value:", p_value)

# Decide at the 5% significance level.
alpha = 0.05
print(
    "Reject H0: There is a significant correlation."
    if p_value < alpha
    else "Fail to reject H0: No significant correlation."
)

# Scatter plot of the paired observations.
plt.scatter(x, y, alpha=0.7)
plt.title("Relationship between pigment index and direct concentration measurement")
plt.xlabel("index")
plt.ylabel("direct concentration measurement (µg/g)")

# Overlay the least-squares line; its R² is shown in the legend.
slope, intercept, r_value, p_val, std_err = stats.linregress(x, y)
plt.plot(x, intercept + slope*x, color="red", label=f"Linear fit (R²={r_value**2:.2f})")
plt.legend()
plt.show()