In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path
from collections import defaultdict
import os
from dotenv import load_dotenv
import json
import importlib
import warnings
warnings.filterwarnings('ignore')
import utils
importlib.reload(utils)

<module 'utils' from 'c:\\Users\\hranfs\\Documents\\VS code\\PlayGround\\utils.py'>

In [2]:

class InteractivePlotGen:
    """Build interactive Plotly figures of bud-break time series and save them as HTML.

    Parameters
    ----------
    cumulative : dict
        Cumulative bud summaries. ``_BB_treatment_plot`` reads it as
        ``{treatment: DataFrame}``; ``_BB_location_plot`` reads it as
        ``{location: {treatment: DataFrame}}``. Each DataFrame is accessed
        through its ``'median'`` and ``'std'`` columns, and the index of the
        first DataFrame is used as the shared x axis.
    new : dict
        Daily-new bud summaries with the same layout as ``cumulative``.
    merge_dict : dict or False, optional
        ``{merged_name: [treatment, ...]}``. Treatments listed here are
        combined into one trace by ``_BB_treatment_plot``. Any non-dict value
        (default ``False``) disables merging.
    """

    def __init__(self, cumulative, new, merge_dict=False):
        self.cumulative = cumulative
        self.new = new
        self.merge_dict = merge_dict
        # Top-level keys (treatments or locations, depending on the data layout).
        self.specifications = list(self.cumulative.keys())

    def _treatment_merge(self, df, merge_list):
        """Combine the summaries of several treatments into one series.

        Parameters
        ----------
        df : dict
            ``{treatment: DataFrame}`` with ``'median'`` and ``'std'`` columns.
        merge_list : list
            Treatment keys of ``df`` to combine.

        Returns
        -------
        tuple
            ``(merged_median, pooled_std, error_minus)`` where
            ``merged_median`` is the row-wise median of the treatments'
            medians, ``pooled_std`` is the square root of the row-wise median
            of the squared stds, and ``error_minus`` is ``pooled_std`` clipped
            to ``merged_median`` so that ``y - error_minus >= 0``.
        """
        merged_treatments = pd.concat(
            [df[treatment]['median'] for treatment in merge_list], axis=1
        ).median(axis=1)
        std_concatenated = pd.concat(
            [df[treatment]['std'] for treatment in merge_list], axis=1
        )
        # NOTE(review): the original comment called this the "RMS method", but
        # the code takes the *median* (not the mean) of the squared stds.
        # Behaviour is kept as-is; confirm which pooling is intended.
        pooled_std = np.sqrt((std_concatenated ** 2).median(axis=1))
        error_minus = np.minimum(pooled_std, merged_treatments)
        return merged_treatments, pooled_std, error_minus

    @staticmethod
    def _error_bar_spec(upper, lower):
        """Return the Plotly ``error_y`` dict for asymmetric error bars."""
        return dict(type='data', array=upper, thickness=1, width=2,
                    visible=True, arrayminus=lower)

    @staticmethod
    def _add_summary_trace(fig, row, doy_range, summary, name):
        """Add one median line with std error bars (lower bar clipped at zero)."""
        fig.add_trace(
            go.Scatter(
                x=doy_range, y=summary['median'], mode='lines+markers',
                error_y=InteractivePlotGen._error_bar_spec(
                    summary['std'], np.minimum(summary['std'], summary['median'])),
                name=name),
            row=row, col=1)

    @staticmethod
    def _add_fit_trace(fig, row, doy_range, observed, label):
        """Fit a curve to ``observed`` and add it as a dashed black line.

        Row 1 (cumulative buds) uses ``utils.logistic_fit``; row 2 (daily new
        buds) uses ``utils.gaussian_fit``. The legend entry carries the R2
        reported by ``utils.calculate_fit_stats``.
        """
        fit_func = utils.logistic_fit if row == 1 else utils.gaussian_fit
        fitted = fit_func(np.arange(0, len(doy_range)), observed)
        stats = utils.calculate_fit_stats(observed, fitted)
        fig.add_trace(
            go.Scatter(
                x=doy_range, y=fitted, mode='lines',
                line=dict(color="black", dash="dash"),
                name=f"{label}--fitted [R2 = {stats['R2']:.2f}]"),
            row=row, col=1)

    @staticmethod
    def _annotate_threshold(fig, doy_labels, series, BB_threshold):
        """Annotate the first day on which ``series`` reaches the bud-break threshold.

        The threshold is ``BB_threshold`` times the maximum of ``series``.
        """
        BB = BB_threshold * np.max(series)
        # argmax on a boolean series gives the position of the first True.
        idx = int(np.argmax(series >= BB))
        fig.add_annotation(
            x=doy_labels[idx],
            y=series.iloc[idx],
            text=f"{doy_labels[idx]}",
            showarrow=True,
            arrowhead=2)

    @staticmethod
    def _finalise_and_save(fig, doy_range, title, output_html):
        """Apply the shared axis/layout settings and write the figure to HTML."""
        for row_num in (1, 2):
            fig.update_yaxes(title_text="median bud number", row=row_num, col=1)
        fig.update_layout(title=title, showlegend=True, height=700, width=1200)
        # One tick per available day, rotated so the labels stay readable.
        fig.update_xaxes(
            tickmode="array",
            tickvals=doy_range,
            tickangle=45,
            tickfont=dict(size=8))
        fig.write_html(output_html)

    def _BB_treatment_plot(self, BB_threshold=False, Model_fitting=False,
                           output_html="treatment_specific_BB.html"):
        """Plot cumulative and daily-new buds per treatment and save as HTML.

        Treatments listed in ``self.merge_dict`` are drawn as merged traces;
        all other treatments are drawn individually. Curve fitting and the
        threshold annotation are applied to merged traces only.

        Parameters
        ----------
        BB_threshold : float or False, optional
            Fraction of the maximum cumulative value that marks bud break.
            When not ``False``, the first day reaching it is annotated on the
            cumulative panel.
        Model_fitting : bool, optional
            When truthy, add fitted curves for each merged trace.
        output_html : str, optional
            Destination file for the interactive figure.
        """
        full_doy_range = self.cumulative[self.specifications[0]].index
        doy_labels = list(full_doy_range)
        fig = make_subplots(rows=2, cols=1,
                            subplot_titles=["Cumulative Buds", "Daily New Buds"])

        merging = isinstance(self.merge_dict, dict)
        # Treatments that are not part of any merge group are plotted on their own.
        standalone = [
            treatment for treatment in self.specifications
            if not merging
            or not any(treatment in values for values in self.merge_dict.values())
        ]

        for row, data in ((1, self.cumulative), (2, self.new)):
            if merging:
                for merge_key, merge_list in self.merge_dict.items():
                    merged, pooled_std, error_minus = self._treatment_merge(data, merge_list)
                    fig.add_trace(
                        go.Scatter(
                            x=full_doy_range, y=merged, mode='lines+markers',
                            error_y=self._error_bar_spec(pooled_std, error_minus),
                            name=f"{merge_key}"),
                        row=row, col=1)
                    if Model_fitting:
                        self._add_fit_trace(fig, row, full_doy_range, merged, merge_key)
                    # The bud-break crossing is only marked on the cumulative panel.
                    if row == 1 and BB_threshold is not False:
                        self._annotate_threshold(fig, doy_labels, merged, BB_threshold)

            for treatment in standalone:
                self._add_summary_trace(fig, row, full_doy_range, data[treatment], treatment)

        self._finalise_and_save(
            fig, full_doy_range,
            "Cumulative & Daily New Buds Over Time (All locations)",
            output_html)

    def _BB_location_plot(self, BB_threshold=False, Model_fitting=False,
                          output_html="location_specific_BB.html"):
        """Plot cumulative and daily-new buds per location/treatment and save as HTML.

        Parameters
        ----------
        BB_threshold : float or False, optional
            Fraction of each series' maximum cumulative value that marks bud
            break. When not ``False``, the first day reaching it is annotated
            on the cumulative panel.
        Model_fitting : bool, optional
            When truthy, add a logistic fit to every cumulative series and a
            Gaussian fit to every daily-new series (the same pairing
            ``_BB_treatment_plot`` uses).
        output_html : str, optional
            Destination file for the interactive figure.
        """
        first_loc_key = self.specifications[0]
        first_trtmnt_key = list(self.cumulative[first_loc_key].keys())[0]
        # The index of the first series is used as the x axis for every trace.
        full_doy_range = self.cumulative[first_loc_key][first_trtmnt_key].index
        doy_labels = list(full_doy_range)
        fig = make_subplots(rows=2, cols=1,
                            subplot_titles=["Cumulative Buds", "Daily New Buds"])

        for row, data in ((1, self.cumulative), (2, self.new)):
            for loc in self.specifications:
                for treatment, summary in data[loc].items():
                    self._add_summary_trace(fig, row, full_doy_range, summary,
                                            f'{loc}: {treatment}')

                if Model_fitting:
                    for treatment, summary in data[loc].items():
                        self._add_fit_trace(fig, row, full_doy_range,
                                            summary['median'], treatment)

                # The bud-break crossing is only marked on the cumulative panel.
                if row == 1 and BB_threshold is not False:
                    for treatment, summary in data[loc].items():
                        self._annotate_threshold(fig, doy_labels,
                                                 summary['median'], BB_threshold)

        self._finalise_and_save(
            fig, full_doy_range,
            "Cumulative & Daily New Buds Over Time",
            output_html)

    def _origin_predict_plot(self, output_html="original vs predicted.html"):
        """Scatter observed against fitted values with a 1:1 line and save as HTML.

        Reads ``self.fitted_poly`` and ``self.all_trtmnts``; both are indexed
        with the keys ``'cumulative'`` and ``'new'``. Neither attribute is set
        by ``__init__``, so the caller must assign them before calling this
        method.

        Raises
        ------
        AttributeError
            If ``fitted_poly`` or ``all_trtmnts`` has not been assigned.
        """
        missing = [attr for attr in ('fitted_poly', 'all_trtmnts')
                   if not hasattr(self, attr)]
        if missing:
            raise AttributeError(
                "_origin_predict_plot requires these attributes to be set on "
                f"the instance first: {', '.join(missing)}")

        fig = make_subplots(rows=2, cols=1,
                            subplot_titles=["Cumulative Buds", "Daily New Buds"])

        for row, key, identity_name in ((1, 'cumulative', 'Cumulative 1:1'),
                                        (2, 'new', 'New 1:1')):
            observed = self.all_trtmnts[key]
            fitted = self.fitted_poly[key]

            fig.add_trace(go.Scatter(
                x=fitted, y=observed.values, mode='markers',
                name='Original vs Fitted', marker=dict(color='blue')), row=row, col=1)
            fig.add_trace(go.Scatter(
                x=observed.values, y=observed.values, mode='lines',
                name=identity_name, marker=dict(color='red')), row=row, col=1)

            # The fit statistics are computed here; the original body read them
            # from names that were never defined in this scope.
            # Assumes calculate_fit_stats returns both 'R2' and 'NRMSE' -- TODO confirm.
            stats = utils.calculate_fit_stats(observed, fitted)
            fitted_values = np.asarray(fitted)
            fig.add_annotation(
                text=f"R2 = {stats['R2']:.2f}<br>NRMSE = {stats['NRMSE']:.2f}",
                x=fitted_values[int(len(fitted_values) / 3)],  # a third of the way along
                y=max(fitted_values),                          # near the top of the panel
                showarrow=False,
                xref=f"x{row}",
                yref=f"y{row}",
                font=dict(size=10, color="black"),
                align="left",
                bordercolor="black",
                borderwidth=1,
                bgcolor="white")

        for row_num in (1, 2):
            fig.update_yaxes(title_text="Original", title_font=dict(size=10),
                             row=row_num, col=1)
            fig.update_xaxes(title_text="Predicted", title_font=dict(size=10),
                             row=row_num, col=1)
        fig.update_layout(title="Original vs. Fitted Data", template="plotly_white")

        fig.write_html(output_html)

In [3]:
# Read variables from a local .env file into the process environment, then
# pick up the folder that holds the input workbooks (None if FOLDER_PATH is unset).
load_dotenv()
FOLDER_PATH = os.getenv('FOLDER_PATH')
# Standardised column names used in the concatenated DataFrame
CULTIVAR = "Cultivar" 
TREATMENT = "Treatment"
LOCATION = "Location"
# File name of the Excel workbook that load_filtered_csv() writes the observations to
output_excel = 'ObsAllData.xlsx'

def load_filtered_csv():
    """Load, filter and concatenate the bud observations listed in the config.

    Reads ``config_MultiDataFiles.json``, whose top-level keys are workbook
    file names located under ``FOLDER_PATH``. For every configured sheet the
    rows matching the configured cultivar and treatment are selected, date
    columns are renamed to day-of-year, and rows containing NaN are dropped.

    Side effects
    ------------
    * Writes the per-sheet observation table to ``output_excel``
      (sheet ``'ObsAllData'``).
    * Sets the module-level globals ``all_treatments`` and ``all_locations``.
    * Prints a warning for every skipped file or sheet.

    Returns
    -------
    pandas.DataFrame
        All filtered sheets concatenated, with ``CULTIVAR``, ``TREATMENT``,
        ``LOCATION`` and one column per observed day of year.

    Raises
    ------
    RuntimeError
        If the ``FOLDER_PATH`` environment variable is not set.
    ValueError
        If no sheet produced any data.
    """
    global all_treatments, all_locations

    with open("config_MultiDataFiles.json", "r") as file:
        config = json.load(file)

    if FOLDER_PATH is None:
        raise RuntimeError(
            "FOLDER_PATH is not set; define it in the environment or in a .env file")

    filtered_dfs = []
    excel_observs = []

    for file_name, file_info in config.items():
        file_path = os.path.join(FOLDER_PATH, file_name)
        if not Path(file_path).exists():
            print(f"Warning: {file_name} not found in {FOLDER_PATH}")
            continue

        # Open the workbook once per file and make sure the handle is released.
        with pd.ExcelFile(file_path, engine="openpyxl") as xls:
            for sheet_info in file_info['sheets']:
                sheet_name = sheet_info['sheet_name']
                location = sheet_info['location']
                max_observed_buds = sheet_info['max_observed_buds']
                # Only the first {column: value} pair of each mapping is used.
                cultivar_col, cultivar_name = list(sheet_info['cultivar'].items())[0]
                treatment_col, treatment_name = list(sheet_info['treatments'].items())[0]

                raw_df = pd.read_excel(xls, sheet_name=sheet_name)

                if cultivar_col not in raw_df.columns or treatment_col not in raw_df.columns:
                    print(f"Warning: Missing required column(s) in {file_name} - sheet: {sheet_name}")
                    continue

                # Identify the date columns once and rename them to day of year (DOY).
                date_columns = [col for col in raw_df.columns if utils.is_date_column(col)]
                doy_column = [pd.to_datetime(col).dayofyear for col in date_columns]
                df = raw_df.rename(columns=dict(zip(date_columns, doy_column)))

                selected = (df[cultivar_col].isin([cultivar_name])
                            & df[treatment_col].isin([treatment_name]))
                columns_to_select = [cultivar_col] + [treatment_col] + doy_column

                # Build a new frame in one chain rather than mutating a slice of df.
                filtered_df = (
                    df.loc[selected, columns_to_select]
                    .rename(columns={cultivar_col: CULTIVAR, treatment_col: TREATMENT})
                    .assign(**{LOCATION: location})
                    .dropna()
                    .reset_index(drop=True)
                )

                if filtered_df.empty:
                    # Nothing left to summarise; the first-row lookups below would fail.
                    print(f"Warning: No complete rows for {cultivar_name} / {treatment_name} "
                          f"in {file_name} - sheet: {sheet_name}")
                    continue

                filtered_dfs.append(filtered_df)

                # Observation table for this sheet: experiment data now, with
                # model predictions (APSIM) to be filled in later.
                PBB, BB, BudBurstDOY = utils.BB_specifications(filtered_df[doy_column], max_observed_buds)
                treatment_label = filtered_df[TREATMENT].iloc[0]
                excel_observ = pd.DataFrame(data={
                    'SimulationName': f'{sheet_name}{treatment_label.replace(" ", "")}',
                    'Clock.Today': date_columns,
                    'DOY': doy_column,
                    LOCATION: filtered_df[LOCATION].iloc[0],
                    TREATMENT: treatment_label,
                    'KiwiFruit.Phenology.BrokenBuds': BB,
                    'KiwiFruit.Phenology.ProportionBB': PBB,
                    'KiwiFruit.Phenology.BudBurstDOY': BudBurstDOY})

                excel_observs.append(excel_observ)

    if not filtered_dfs:
        raise ValueError("No data loaded: every configured file or sheet was missing or empty")

    # Report sheets that carry the same column label more than once.
    for i, df in enumerate(filtered_dfs):
        duplicate_columns = df.columns[df.columns.duplicated()]
        if not duplicate_columns.empty:
            print(f"Duplicate columns in DataFrame {i}: {duplicate_columns.tolist()}")

    final_df = pd.concat(filtered_dfs, ignore_index=True, sort=True).reset_index(drop=True)
    excel_observs_df = pd.concat(excel_observs, ignore_index=True, sort=False)

    excel_observs_df.to_excel(output_excel, sheet_name='ObsAllData', index=False)

    all_treatments = final_df[TREATMENT].unique()
    all_locations = final_df[LOCATION].unique()

    return final_df

def _median_std_frame(values, day_columns, observed_std):
    """Build a per-day summary with 'median' and 'std' columns.

    ``median`` is the column median of ``values`` for every day in
    ``day_columns``, rounded to 2 decimals. ``std`` starts as NaN and is
    filled only for the days present in ``observed_std``.
    """
    summary = pd.DataFrame(
        {'median': [round(values[day].median(), 2) for day in day_columns],
         'std': [np.nan for _ in day_columns]},
        index=day_columns)
    for day in observed_std.index:
        # Write through .loc: chained ``summary['std'][day] = ...`` does not
        # update the frame when pandas Copy-on-Write is active.
        if day in summary.index:
            summary.loc[day, 'std'] = round(observed_std.loc[day], 2)
    return summary


def _summarise_subset(subset):
    """Return ``(cumulative_summary, daily_new_summary)`` for one slice of observations.

    Medians are taken over the interpolated full day range; standard
    deviations are taken over the originally observed days only.
    """
    # Each utils helper receives its own copy: the original code passed a
    # freshly filtered frame to every call, and the helpers are not visible here.
    numeric_columns, _ = utils.numeric_nonNumeric_col(subset.copy())
    interpolated = utils.interpolate_full_range(subset.copy())
    numeric_columns_interpol, _ = utils.numeric_nonNumeric_col(interpolated)

    observed_std = subset[numeric_columns].std(ddof=0).dropna()
    cumulative = _median_std_frame(interpolated, numeric_columns_interpol, observed_std)

    daily_diff_interpol = utils.find_daily_diff(interpolated)          # all days -> median
    daily_diff = utils.find_daily_diff(subset.copy().dropna(axis=1))   # observed days -> std
    new = _median_std_frame(daily_diff_interpol, numeric_columns_interpol,
                            daily_diff.std(ddof=0))
    return cumulative, new


def cumulative_and_daily(final_df):
    """Summarise cumulative and daily-new bud counts per treatment and per location.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Concatenated observations with ``TREATMENT`` and ``LOCATION`` columns
        plus numeric per-day columns.

    Returns
    -------
    tuple
        ``(cumulative_daily_treatment, new_daily_treatment,
        cumulative_daily_location, new_daily_location)``. The first two map
        ``treatment -> DataFrame``; the last two map
        ``location -> {treatment -> DataFrame}``. Every DataFrame is indexed
        by day and has ``'median'`` and ``'std'`` columns.
    """
    # Derive the groups from the frame itself rather than from globals that
    # only exist after load_filtered_csv() has run.
    treatments = final_df[TREATMENT].unique()
    locations = final_df[LOCATION].unique()

    cumulative_daily_treatment = defaultdict(dict)
    new_daily_treatment = defaultdict(dict)
    cumulative_daily_location = defaultdict(dict)
    new_daily_location = defaultdict(dict)

    # All locations pooled, one summary per treatment.
    for trtmnt in treatments:
        subset = final_df[final_df[TREATMENT].isin([trtmnt])]
        cumulative_daily_treatment[trtmnt], new_daily_treatment[trtmnt] = _summarise_subset(subset)

    # One summary per (location, treatment) pair that actually has data.
    for loc in locations:
        for trtmnt in treatments:
            subset = final_df[final_df[LOCATION].isin([loc]) & final_df[TREATMENT].isin([trtmnt])]
            if subset.empty:
                continue
            # The day columns are recomputed for this slice inside
            # _summarise_subset; the original reused the value left over from
            # the last iteration of the treatment loop.
            cumulative_daily_location[loc][trtmnt], new_daily_location[loc][trtmnt] = _summarise_subset(subset)

    return cumulative_daily_treatment, new_daily_treatment, cumulative_daily_location, new_daily_location


In [4]:
# Load and filter the configured workbooks. This call also writes the
# observation workbook and sets the all_treatments / all_locations globals.
final_df = load_filtered_csv()
# Per-treatment and per-location summaries of cumulative and daily-new buds.
cumulative_daily_treatment, new_daily_treatment, cumulative_daily_location, new_daily_location = cumulative_and_daily(final_df)

# Treatment-specific plot (currently disabled). Uncomment to merge the listed
# treatments into three groups and plot them with fitted curves.
# treatments_merge =  {'Control (merged)':['Con1', 'Con2', 'Control early', 'Control late', 'Control'], 'HC early (merged)':['HCT1', 'Hi-Cane early', 'HiCane 1'], 'HC late (merged)':['HCT5', 'Hi-Cane late', 'HiCane 5']}
# plotter = InteractivePlotGen(cumulative_daily_treatment, new_daily_treatment, treatments_merge)
# plotter._BB_treatment_plot(Model_fitting=True, BB_threshold=0.05)

# Location-specific plot: writes location_specific_BB.html and annotates the
# first day on which each cumulative median reaches 5% of its maximum.
plotter = InteractivePlotGen(cumulative_daily_location, new_daily_location)
plotter._BB_location_plot(BB_threshold=0.05)
