<a href="https://colab.research.google.com/github/RandivCosta/data-analysis/blob/main/Report.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysis Report

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from io import StringIO
import sys

In [None]:
# Read adjusted_data.csv (path is relative to the notebook's working
# directory) into the DataFrame that the report below is generated from.
data = pd.read_csv(filepath_or_buffer='adjusted_data.csv')

In [None]:
def capture_print(func):
    """Decorate ``func`` so that calling it returns whatever it printed.

    While ``func`` runs, ``sys.stdout`` is replaced by an in-memory buffer.
    The wrapper returns the buffered text as a string; the value returned by
    ``func`` itself is discarded.

    ``sys.stdout`` is restored even when ``func`` raises, so a failing call
    cannot leave later output redirected into a dead buffer. The exception
    propagates to the caller unchanged.
    """
    # Local import keeps this definition self-contained.
    from functools import wraps

    @wraps(func)  # preserve func's __name__/__doc__ on the wrapper
    def wrapper(*args, **kwargs):
        old_stdout = sys.stdout
        new_stdout = StringIO()
        sys.stdout = new_stdout
        try:
            func(*args, **kwargs)
        finally:
            # Always put the original stream back, success or failure.
            sys.stdout = old_stdout
        return new_stdout.getvalue()
    return wrapper

In [None]:
def generate_analysis_report(df):
    """Build a plain-text analysis report for the given dataset.

    The report lists a dataset overview, the variable types, a Shapiro-Wilk
    normality check of the three bacterial-count columns, and a set of
    recommended non-parametric tests, each with one worked example computed
    on the first bacterial column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Site'``, ``'Week'``, ``'Area'``,
        ``'EP-Rainfall'`` and the three bacterial-count columns named in
        ``bacterial_vars`` below. ``'Week'`` must hold strings containing a
        number (the first run of digits is used as the week number) and
        ``'Area'`` must contain the values ``'Sea Water'`` and ``'Sand'``.
        The frame is not modified.

    Returns
    -------
    str
        The report, one entry per line, joined with newlines.
    """
    report = []

    # dataset Overview
    site_names = ', '.join(df['Site'].unique())
    area_names = ', '.join(df['Area'].unique())
    report.extend([
        "="*80,
        "BEACH WATER QUALITY DATASET ANALYSIS REPORT",
        "="*80,
        "\n1. DATASET OVERVIEW",
        f"- Number of observations: {len(df)}",
        f"- Number of sites: {df['Site'].nunique()} ({site_names})",
        f"- Number of weeks: {df['Week'].nunique()}",
        f"- Sample types: {area_names}",
    ])

    # variable Types
    report.extend([
        "\n2. VARIABLE TYPES",
        "- Categorical Variables:",
        "  * Week (ordinal) - Week 1 to Week 10",
        "  * Site (nominal) - Different sampling locations",
        "  * Area (nominal) - Sea Water vs Sand",
        "\n- Continuous Numerical Variables:",
        "  * Bacterial counts (CFU/100ml): Enterococcus, Fecal Coliform, E.coli",
        "  * Physical parameters: pH, Conductivity, Salinity, Temperature",
        "  * Environmental parameters: Rainfall, Wind Speed",
    ])

    # normality Assessment
    report.append("\n3. NORMALITY ASSESSMENT")
    bacterial_vars = ['BC-Enterococcus (CFU/100ml)', 'BC-Fecal Coliform(CFU/100ml)', 'BC-E-Coli (CFU/100ml)']
    # All worked examples below use the first bacterial column.
    primary_var = bacterial_vars[0]

    for var in bacterial_vars:
        stat, p = stats.shapiro(df[var])
        report.append(f"- {var}: Shapiro-Wilk p-value = {p:.4f} {'(Normal)' if p > 0.05 else '(Non-normal)'}")

    report.append("\nCONCLUSION: Bacterial counts are typically non-normally distributed, suggesting non-parametric tests are more appropriate.")

    # recommended Statistical Tests
    report.append("\n4. RECOMMENDED STATISTICAL TESTS")

    report.extend([
        "\nA. COMPARING SEA WATER VS SAND SAMPLES",
        "- Test: Wilcoxon signed-rank test (non-parametric paired test)",
        "- Why: Compares paired measurements (same site/week) without normality assumption",
        "- Use case: Are bacterial levels significantly different between sea water and sand?",
    ])

    # example test output
    # Pair the two sample types explicitly on (Site, Week) rather than by row
    # position, so the result does not depend on how the rows are ordered.
    # Duplicate (Site, Week, Area) rows, if any, are averaged; pairs missing
    # either sample type are dropped.
    paired = df.pivot_table(
        index=['Site', 'Week'], columns='Area', values=primary_var, aggfunc='mean'
    ).dropna(subset=['Sea Water', 'Sand'])
    stat, p = stats.wilcoxon(paired['Sea Water'], paired['Sand'])
    report.append(f"\n  Example for {primary_var}: W = {stat:.1f}, p = {p:.4f}")

    report.extend([
        "\nB. COMPARING ACROSS SITES",
        "- Test: Kruskal-Wallis test (non-parametric ANOVA alternative)",
        "- Why: Compares >2 independent groups without normality assumption",
        "- Use case: Do bacterial levels differ between the three sites?",
    ])

    # example test output
    sites_data = [df[df['Site'] == site][primary_var] for site in df['Site'].unique()]
    stat, p = stats.kruskal(*sites_data)
    report.append(f"\n  Example for {primary_var}: H = {stat:.1f}, p = {p:.4f}")

    report.extend([
        "\nC. TEMPORAL TRENDS (ACROSS WEEKS)",
        "- Test: Spearman's rank correlation (non-parametric)",
        "- Why: Assesses monotonic relationships without linearity/normality assumptions",
        "- Use case: Is there a trend in bacterial levels over the 10-week period?",
    ])

    # example test output
    # Keep the numeric week in a local Series instead of adding a column to
    # the caller's DataFrame; raw string avoids the invalid-escape warning.
    week_num = df['Week'].str.extract(r'(\d+)', expand=False).astype(int)
    corr, p = stats.spearmanr(week_num, df[primary_var])
    report.append(f"\n  Example for {primary_var}: rho = {corr:.2f}, p = {p:.4f}")

    report.extend([
        "\nD. RELATIONSHIP WITH ENVIRONMENTAL FACTORS",
        "- Test: Spearman's rank correlation (non-parametric)",
        "- Why: Examines relationships between bacterial counts and continuous environmental variables",
        "- Use case: Are higher bacterial counts associated with higher rainfall or temperature?",
    ])

    # example test output
    corr, p = stats.spearmanr(df['EP-Rainfall'], df[primary_var])
    report.append(f"\n  Example between {primary_var} and Rainfall: rho = {corr:.2f}, p = {p:.4f}")

    report.extend([
        "\nE. COMPARING BACTERIAL SPECIES",
        "- Test: Friedman test (non-parametric repeated measures ANOVA)",
        "- Why: Compares multiple related samples (same physical samples tested for different bacteria)",
        "- Use case: Do the three bacterial types show different concentration patterns?",
    ])

    # example test output (using first n complete cases)
    # Drop rows missing any of the three counts first, so the rows used
    # really are complete cases; the example is capped at 30 rows.
    complete = df[bacterial_vars].dropna()
    n = min(len(complete), 30)
    stat, p = stats.friedmanchisquare(
        *(complete[var].iloc[:n] for var in bacterial_vars)
    )
    report.append(f"\n  Example: Chi-square = {stat:.1f}, p = {p:.4f}")

    # visualization Examples
    report.extend([
        "\n5. VISUALIZATION EXAMPLES",
        "- Boxplots recommended for group comparisons",
        "- Scatter plots with trend lines for correlations",
        "- Time series plots for temporal patterns",
    ])

    return "\n".join(report)

In [None]:
# Generate the text report for the loaded dataset and show it as cell output.
report = generate_analysis_report(df=data)
print(report)

BEACH WATER QUALITY DATASET ANALYSIS REPORT

1. DATASET OVERVIEW
- Number of observations: 60
- Number of sites: 3 (Site 1, Site 2, Site 3)
- Number of weeks: 10
- Sample types: Sea Water, Sand

2. VARIABLE TYPES
- Categorical Variables:
  * Week (ordinal) - Week 1 to Week 10
  * Site (nominal) - Different sampling locations
  * Area (nominal) - Sea Water vs Sand

- Continuous Numerical Variables:
  * Bacterial counts (CFU/100ml): Enterococcus, Fecal Coliform, E.coli
  * Physical parameters: pH, Conductivity, Salinity, Temperature
  * Environmental parameters: Rainfall, Wind Speed

3. NORMALITY ASSESSMENT
- BC-Enterococcus (CFU/100ml): Shapiro-Wilk p-value = 0.0076 (Non-normal)
- BC-Fecal Coliform(CFU/100ml): Shapiro-Wilk p-value = 0.0033 (Non-normal)
- BC-E-Coli (CFU/100ml): Shapiro-Wilk p-value = 0.0037 (Non-normal)

CONCLUSION: Bacterial counts are typically non-normally distributed, suggesting non-parametric tests are more appropriate.

4. RECOMMENDED STATISTICAL TESTS

A. COMPARIN