# In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter, AutoDateLocator
from dateutil import parser, tz
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import seaborn as sns
from prefixspan import PrefixSpan

# Suppress dateutil's warning about timezone abbreviations it cannot resolve.
# The filter has to match on the warning category: the `message` argument is a
# regular expression tested against the start of the warning text, and that
# text does not contain the class name, so filtering on
# message="UnknownTimezoneWarning" silenced nothing.
warnings.filterwarnings("ignore", category=parser.UnknownTimezoneWarning)

# Paths (placeholders: point these at the input CSV folder and output folder)
DATA_PATH = "your_path"
RESULT_PATH = "your_path"

# Filenames
file_names = [f"IoTpond{i}.csv" for i in range(1, 13) if i != 5]  # Exclude Pond 5

# Define timezone mapping for CET so "CET" suffixes in timestamps are understood
tzinfos = {"CET": tz.gettz("CET")}

# Load and preprocess data
def load_and_preprocess_data(file_names, data_path):
    """Load each pond CSV, clean it, and standardize its numeric columns.

    For every file the function reads the CSV, checks that all required
    columns are present, parses ``Date/Time`` into naive datetimes (rows
    without a timestamp are dropped), coerces the remaining required columns
    to numeric, replaces +/-inf with NaN, fills NaNs with the column median,
    and standardizes those columns with a ``StandardScaler`` fitted on that
    file alone. A file that fails any step is reported with ``print`` and
    skipped; it never aborts the whole run.

    Args:
        file_names: Iterable of CSV file names.
        data_path: Directory that each file name is joined onto.

    Returns:
        A list with one cleaned DataFrame per accepted file, in input order.
        The list is empty when no file is accepted. Note that positions in
        this list, not the numbers in the file names, identify ponds here.
    """
    required_columns = ['Date/Time', 'Temperature(C)', 'Turbidity(NTU)', 'Dissolved Oxygen(g/ml)',
                        'PH', 'Ammonia(g/ml)', 'Nitrate(g/ml)', 'Population', 'Fish_Length(cm)', 'Fish_Weight(g)']
    feature_columns = required_columns[1:]  # everything except the timestamp
    min_rows = 5
    datasets = []

    for file_name in file_names:
        file_path = os.path.join(data_path, file_name)
        try:
            data = pd.read_csv(file_path, low_memory=False)
        except Exception as e:  # best effort: any unreadable file is skipped
            print(f"Error loading {file_name}: {e}")
            continue

        # Validate necessary columns
        if not all(col in data.columns for col in required_columns):
            print(f"Skipping {file_name}: Missing required columns.")
            continue

        # Parse Date/Time and strip the timezone so all timestamps are naive.
        # A single unparseable value rejects the whole file.
        try:
            data['Date/Time'] = data['Date/Time'].apply(
                lambda x: parser.parse(x, tzinfos=tzinfos).replace(tzinfo=None) if pd.notna(x) else None
            )
            data.dropna(subset=['Date/Time'], inplace=True)
        except Exception as e:
            print(f"Error parsing datetime in {file_name}: {e}")
            continue

        # Convert all feature columns to numeric; unparseable cells become NaN
        for col in feature_columns:
            data[col] = pd.to_numeric(data[col], errors='coerce')

        # Replace invalid values and handle missing data
        data.replace([np.inf, -np.inf], np.nan, inplace=True)
        for col in feature_columns:
            if data[col].isna().any():
                # Assign the result back: an in-place fillna on a column
                # selection is a chained assignment that pandas deprecates
                # and that has no effect under Copy-on-Write.
                data[col] = data[col].fillna(data[col].median())

        # Check the size before scaling so an empty or tiny file is reported
        # as insufficient data rather than as a scaling error.
        if data.shape[0] < min_rows:
            print(f"Skipping {file_name}: Insufficient data after cleaning ({data.shape[0]} rows).")
            continue

        # Scale features; a fresh scaler per file keeps each pond independent
        try:
            data[feature_columns] = StandardScaler().fit_transform(data[feature_columns])
        except ValueError as e:
            print(f"Error scaling features in {file_name}: {e}")
            continue

        datasets.append(data)

    if not datasets:
        print("No valid datasets found. Please check your data files.")

    return datasets

# Load every pond once at import time and stop immediately if nothing usable
# came back.
datasets = load_and_preprocess_data(file_names, DATA_PATH)

if not datasets:
    raise ValueError("No valid datasets were loaded. Check your CSV files for errors.")

# Local anomaly detection using PrefixSpan
def detect_prefixspan_patterns(data, min_support=0.1):
    """Mine frequent item sequences from one pond's rows with PrefixSpan.

    Seven sensor columns are replaced by their quartile index. Every row is
    then turned into one sequence of ``"column=value"`` strings covering all
    columns of the frame, so columns that were not binned contribute their
    raw values as items too.

    Args:
        data: Pond DataFrame containing the seven binned columns.
        min_support: Fraction of rows a pattern must occur in; it is
            multiplied by the row count to get the threshold passed to
            ``PrefixSpan.frequent``.

    Returns:
        DataFrame with columns ``Support`` and ``Pattern`` built from the
        miner's output, restricted to patterns of length 2 to 5.
    """
    quartile_columns = ['Temperature(C)', 'Turbidity(NTU)', 'Dissolved Oxygen(g/ml)',
                        'PH', 'Ammonia(g/ml)', 'Nitrate(g/ml)', 'Population']

    # Work on a copy so the caller's frame keeps its continuous values.
    binned_data = data.copy()
    for name in quartile_columns:
        quartile_index = pd.qcut(binned_data[name], q=4, labels=False, duplicates='drop')
        binned_data[name] = quartile_index

    # One sequence per row, items in column order.
    sequences = [
        [f"{col}={row[col]}" for col in binned_data.columns]
        for _, row in binned_data.iterrows()
    ]

    miner = PrefixSpan(sequences)
    miner.minlen = 2  # shortest pattern kept
    miner.maxlen = 5  # longest pattern kept
    support_threshold = min_support * len(sequences)
    mined = miner.frequent(support_threshold)

    return pd.DataFrame(mined, columns=["Support", "Pattern"])

# Perform federated learning across ponds using parallelization
def federated_learning_parallel(datasets, min_support=0.1):
    """Mine patterns for every pond in a thread pool and stack the results.

    Each dataset is passed to ``detect_prefixspan_patterns`` and the
    resulting table is tagged with a ``Pond`` column. The label is
    ``"Pond <k>"`` where ``k`` is the dataset's 1-based position in
    ``datasets``.

    Args:
        datasets: Sequence of per-pond DataFrames.
        min_support: Forwarded unchanged to ``detect_prefixspan_patterns``.

    Returns:
        A single DataFrame holding all ponds' patterns in input order with a
        fresh integer index, or an empty DataFrame when ``datasets`` is empty.
    """
    def process_pond(indexed_pond):
        pond_index, data = indexed_pond
        patterns = detect_prefixspan_patterns(data, min_support=min_support)
        patterns["Pond"] = f"Pond {pond_index + 1}"
        return patterns

    # executor.map yields results in the order of its inputs, so pond order
    # is preserved regardless of which worker finishes first.
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_pond, enumerate(datasets)))

    # pd.concat rejects an empty list, so handle "no ponds" explicitly.
    if not results:
        return pd.DataFrame()

    # Concatenate once: growing a frame inside a loop copies it on every
    # step and depended on concatenating with an empty seed frame.
    return pd.concat(results, ignore_index=True)

# Save results to Excel
def save_results_to_excel(patterns, datasets):
    """Write the mined patterns and every pond table into one Excel workbook.

    The workbook ``FL_PrefixSpan_Patterns.xlsx`` is created under
    ``RESULT_PATH`` (the directory is created if missing). The first sheet
    holds ``patterns``; each dataset follows on its own sheet named
    ``"Pond <k>"`` by 1-based position.

    Args:
        patterns: DataFrame of mined patterns.
        datasets: Sequence of per-pond DataFrames.
    """
    os.makedirs(RESULT_PATH, exist_ok=True)
    workbook_path = os.path.join(RESULT_PATH, "FL_PrefixSpan_Patterns.xlsx")

    with pd.ExcelWriter(workbook_path) as excel_writer:
        patterns.to_excel(excel_writer, sheet_name="Global PrefixSpan Patterns", index=False)

        for pond_number, pond_frame in enumerate(datasets, start=1):
            pond_frame.to_excel(excel_writer, sheet_name=f"Pond {pond_number}", index=False)

    print(f"Results saved to {workbook_path}")

# Visualization
def plot_all_visualizations(datasets, global_patterns):
    """Render, save, and show the three summary figures for all ponds.

    Figures written to ``RESULT_PATH`` (created if missing):
      * ``Ponds_Time_Series_Adjusted.png``: fish length and weight against
        ``Date/Time``, one subplot per pond, five subplots per row.
      * ``Ponds_Boxplots_Adjusted_5cols.png``: boxplots of temperature,
        turbidity and pH, one subplot per pond, five per row.
      * ``Anomaly_Distribution.png``: bar chart of how many rows of
        ``global_patterns`` carry each ``Pond`` label. Skipped when there
        are no such rows.

    Args:
        datasets: Sequence of per-pond DataFrames containing the plotted
            columns. Nothing is drawn when it is empty.
        global_patterns: DataFrame of mined patterns with a ``Pond`` column.
    """
    num_datasets = len(datasets)
    if num_datasets == 0:
        # plt.subplots cannot build a grid with zero rows.
        print("No datasets to plot.")
        return

    os.makedirs(RESULT_PATH, exist_ok=True)
    rows = (num_datasets + 4) // 5  # ceil(num_datasets / 5)

    # Time Series
    fig, axs = plt.subplots(nrows=rows, ncols=5, figsize=(20, 10))
    axs = axs.flatten()

    for i, data in enumerate(datasets):
        axs[i].plot(data['Date/Time'], data['Fish_Length(cm)'], label="Fish Length", linestyle='-', marker='o', markersize=3)
        axs[i].plot(data['Date/Time'], data['Fish_Weight(g)'], label="Fish Weight", linestyle='--', marker='x', markersize=3)
        axs[i].set_title(f"Pond {i + 1}", fontsize=10)
        axs[i].set_xlabel("Date/Time", fontsize=8)
        axs[i].set_ylabel("Growth Metrics", fontsize=8)
        axs[i].xaxis.set_major_locator(AutoDateLocator())
        axs[i].xaxis.set_major_formatter(DateFormatter("%Y-%m-%d"))
        axs[i].tick_params(axis="x", rotation=45, labelsize=7)
        axs[i].tick_params(axis="y", labelsize=7)
        axs[i].legend(fontsize=7)

    # Drop the unused cells of the last grid row.
    for j in range(num_datasets, len(axs)):
        fig.delaxes(axs[j])

    # Finalize exactly once, after the grid is complete. Doing this inside
    # the loop above saved the figure once per unused cell and not at all
    # when the pond count filled the grid.
    plt.tight_layout()
    plot_path = os.path.join(RESULT_PATH, "Ponds_Time_Series_Adjusted.png")
    plt.savefig(plot_path)
    plt.show()
    print(f"Time series visualization saved to {plot_path}")

    # Boxplots
    fig, axs = plt.subplots(nrows=rows, ncols=5, figsize=(20, 6 * rows))
    axs = axs.flatten()

    for i, data in enumerate(datasets):
        data[['Temperature(C)', 'Turbidity(NTU)', 'PH']].boxplot(ax=axs[i], widths=0.7)
        axs[i].set_title(f"Pond {i + 1}", fontsize=10)
        axs[i].set_ylabel("Values", fontsize=8)
        axs[i].tick_params(axis="x", labelsize=7)
        axs[i].tick_params(axis="y", labelsize=7)

    for j in range(num_datasets, len(axs)):
        fig.delaxes(axs[j])

    plt.tight_layout()
    plot_path = os.path.join(RESULT_PATH, "Ponds_Boxplots_Adjusted_5cols.png")
    plt.savefig(plot_path)
    plt.show()
    print(f"Boxplots saved to {plot_path}")

    # Anomaly Distribution (number of mined pattern rows per pond)
    if 'Pond' not in global_patterns.columns or global_patterns.empty:
        # A bar plot of an empty count series raises, so skip it instead.
        print("No patterns available; skipping anomaly distribution plot.")
        return

    plt.figure(figsize=(10, 6))
    global_patterns['Pond'].value_counts().plot(kind='bar')
    plt.title("Anomaly Distribution Across Ponds")
    plt.xlabel("Ponds")
    plt.ylabel("Number of Anomalies")
    plot_path = os.path.join(RESULT_PATH, "Anomaly_Distribution.png")
    plt.savefig(plot_path)
    plt.show()
    print(f"Anomaly distribution saved to {plot_path}")

# Evaluate relationship between anomalies and growth
def evaluate_anomalies_and_growth(global_patterns, datasets):
    """Summarize, per pond, the length/weight correlation and pattern count.

    For each dataset (labelled ``"Pond <k>"`` by 1-based position) the
    function counts the rows of ``global_patterns`` with that label and takes
    the off-diagonal entry of the ``Fish_Length(cm)`` / ``Fish_Weight(g)``
    correlation matrix. The summary is written to
    ``Evaluation_PrefixSpan.xlsx`` under ``RESULT_PATH``.

    Args:
        global_patterns: DataFrame of mined patterns with a ``Pond`` column.
        datasets: Sequence of per-pond DataFrames.

    Returns:
        DataFrame with columns ``Pond``, ``Correlation`` and ``Patterns``,
        one row per dataset.
    """
    def summarize(pond_number, pond_frame):
        label = f"Pond {pond_number}"
        pond_patterns = global_patterns[global_patterns["Pond"] == label]
        growth_corr = pond_frame[["Fish_Length(cm)", "Fish_Weight(g)"]].corr()
        return {
            "Pond": label,
            "Correlation": growth_corr.iloc[0, 1],  # length vs. weight
            "Patterns": len(pond_patterns),
        }

    summary_rows = [
        summarize(pond_number, pond_frame)
        for pond_number, pond_frame in enumerate(datasets, start=1)
    ]

    evaluation_df = pd.DataFrame(summary_rows)
    evaluation_path = os.path.join(RESULT_PATH, "Evaluation_PrefixSpan.xlsx")
    evaluation_df.to_excel(evaluation_path, index=False)
    print(f"Evaluation metrics saved to {evaluation_path}")
    return evaluation_df

# Main Execution
def main():
    """Run the pipeline on the module-level ``datasets``.

    Steps, in order: mine patterns for all ponds, write the Excel workbook,
    draw the figures, then compute and print the per-pond evaluation table.
    """
    mined_patterns = federated_learning_parallel(datasets, min_support=0.1)
    save_results_to_excel(mined_patterns, datasets)
    plot_all_visualizations(datasets, mined_patterns)
    print(evaluate_anomalies_and_growth(mined_patterns, datasets))

# Refuse to run unless a non-empty `datasets` exists in this namespace.
if not ('datasets' in locals() and datasets):
    raise ValueError("Datasets not loaded. Ensure data preprocessing is complete before running the main script.")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
