In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path
from collections import defaultdict
from scipy.optimize import curve_fit
from scipy.special import beta as beta_func  # Beta function
import os
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from dotenv import load_dotenv
import json
import warnings
warnings.filterwarnings('ignore')

In [84]:
def numeric_nonNumeric_col(df):
    '''
    Separate the column labels of ``df`` into integer labels and string labels.

    Parameters
    ----------
    df : object exposing a ``columns`` iterable (e.g. a pandas DataFrame)

    Returns
    -------
    tuple of (list, list)
        ``(numeric_cols, non_numeric_cols)``, each in the original column order.
        Labels that are neither integers nor strings appear in neither list.
    '''
    # NumPy integer scalars (e.g. np.int64) are not instances of the built-in
    # ``int``, so they are accepted explicitly; otherwise such labels would be
    # silently dropped from both lists.
    numeric_cols = [col for col in df.columns if isinstance(col, (int, np.integer))]
    non_numeric_cols = [col for col in df.columns if isinstance(col, str)]
    return numeric_cols, non_numeric_cols
def interpolate_full_range(df):
    '''
    Expand the integer-labelled columns of ``df`` to every integer between the
    smallest and largest label, and fill the gaps row by row.

    Interior gaps are filled by linear interpolation along each row. Missing
    values before a row's first valid entry take that first value, and missing
    values after its last valid entry take that last value.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose integer column labels are the positions to interpolate
        over; string-labelled columns are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        Interpolated integer columns followed by the string-labelled columns,
        with the row index reset to 0..n-1.

    Raises
    ------
    ValueError
        If ``df`` has no integer-labelled columns.
    '''
    numeric_cols, nonNumeric_cols = numeric_nonNumeric_col(df)
    if not numeric_cols:
        raise ValueError("interpolate_full_range: no integer-labelled columns to interpolate")

    # Every integer position from the first to the last observed one
    full_range = np.arange(min(numeric_cols), max(numeric_cols) + 1)

    # Reindexing both inserts the missing positions and puts the columns in
    # ascending order, so "leading"/"trailing" below refer to position on the
    # axis rather than to the column order of the input.
    df_numeric = df[numeric_cols].reindex(columns=full_range)

    # limit_direction='both' makes the linear method also fill leading and
    # trailing NaNs with the nearest valid value, which replaces the manual
    # edge-filling pass and avoids assigning into the caller's frame.
    df_interpolated = df_numeric.interpolate(method='linear', axis=1, limit_direction='both')

    # Combine back with categorical columns
    return pd.concat([df_interpolated, df[nonNumeric_cols]], axis=1).reset_index(drop=True)

def logistic_fit(x, y, initial_guess=(8, 1, 4)):
    '''
    Fit the logistic curve ``L / (1 + exp(-k * (x - x0)))`` to the points
    ``(x, y)`` and return the fitted curve evaluated at ``x``.

    Parameters
    ----------
    x : array-like
        Sample positions (lists are accepted).
    y : array-like
        Observed values, same length as ``x``.
    initial_guess : sequence of 3 floats, optional
        Starting values for ``(L, k, x0)``. Defaults to ``(8, 1, 4)``.

    Returns
    -------
    numpy.ndarray
        Model values at ``x`` for the fitted parameters.

    Notes
    -----
    All three parameters are constrained to be non-negative. Errors raised by
    ``scipy.optimize.curve_fit`` (for example when the fit does not converge)
    are propagated unchanged.
    '''
    def logistic(x, L, k, x0):
        return L / (1 + np.exp(-k * (x - x0)))

    # Coerce once so the final evaluation also works when a list is passed
    x = np.asarray(x, dtype=float)

    # Fit the curve; bounds keep L, k and x0 >= 0
    popt, _ = curve_fit(logistic, x, y, p0=list(initial_guess), bounds=(0, np.inf))

    # Use the fitted parameters to compute the model predictions
    return logistic(x, *popt)

def beta_fit(x,y):
    '''
    Fit an amplitude-scaled beta density to ``(x, y)`` and return the fitted
    curve evaluated at ``x``.

    The model is ``A * t**(alpha-1) * (1-t)**(beta-1) / B(alpha, beta)`` with
    ``t = x / len(x)``; ``t`` lies in [0, 1) when ``x`` is ``0..len(x)-1``.
    The fit starts from ``(A, alpha, beta) = (7, 3, 3)`` and is constrained to
    ``0 <= A <= 20`` and ``1 <= alpha, beta <= 15``.
    '''
    start = [7, 3, 3]
    lower_limits, upper_limits = [0, 1, 1], [20, 15, 15]

    def scaled_beta_density(pos, amplitude, a, b):
        # Rescale positions by the number of samples
        frac = pos / len(pos)
        return amplitude * (frac**(a-1)) * ((1-frac)**(b-1)) / beta_func(a, b)

    # Bounded least-squares fit
    params, _cov = curve_fit(scaled_beta_density, x, y, p0=start, bounds=(lower_limits, upper_limits))

    # Evaluate the fitted model at the input positions
    return scaled_beta_density(x, *params)

def gaussian_fit(x,y):
    '''
    Fit ``A * exp(-(x - mu)**2 / (2 * sigma**2))`` to ``(x, y)`` and return
    the fitted curve evaluated at ``x``.

    The fit starts from ``A = max(y)``, ``mu = median(x)``, ``sigma = 1`` and
    constrains all three parameters to be non-negative.
    '''
    def bell(pos, height, centre, width):
        return height * np.exp(-((pos - centre)**2) / (2 * width**2))

    # Data-driven starting point for (A, mu, sigma)
    start = [np.max(y), np.median(x), 1]
    limits = ([0, 0, 0], [np.inf, np.inf, np.inf])

    # Bounded least-squares fit
    params, _cov = curve_fit(bell, x, y, p0=start, bounds=limits)

    # Evaluate the fitted model at the input positions
    return bell(x, *params)

def calculate_fit_stats(original_values, fitted_values):
    '''
    Compute goodness-of-fit metrics between observed and fitted values.

    Parameters
    ----------
    original_values : array-like
        Observed values (Series, ndarray or list).
    fitted_values : array-like
        Model values, same length as ``original_values``.

    Returns
    -------
    dict
        Keys ``"MSE"``, ``"RMSE"``, ``"MAE"``, ``"R2"`` and ``"NRMSE"``.
        ``"NRMSE"`` is RMSE divided by the range (max - min) of the observed
        values, and is NaN when that range is zero.
    '''
    # Work on plain float arrays so lists are accepted and the range below
    # does not depend on the input container providing .max()/.min()
    observed = np.asarray(original_values, dtype=float)
    predicted = np.asarray(fitted_values, dtype=float)

    mse = mean_squared_error(observed, predicted)   # Penalizes large errors
    rmse = np.sqrt(mse)                             # Same unit as the data
    mae = mean_absolute_error(observed, predicted)  # Mean absolute error
    r2 = r2_score(observed, predicted)              # Closer to 1 is better

    # Normalising by a zero range (constant observations) is undefined;
    # report NaN instead of dividing by zero.
    value_range = observed.max() - observed.min()
    nrmse = rmse / value_range if value_range > 0 else float("nan")

    # Return results as a dictionary
    return {
        "MSE": mse,
        "RMSE": rmse,
        "MAE": mae,
        "R2": r2,
        "NRMSE": nrmse
    }

class InteractivePlotGen:
    '''
    Build an interactive two-row Plotly figure (cumulative buds on top, daily
    new buds below) and write it to an HTML file.

    Parameters
    ----------
    cumulative, new : mapping
        ``{treatment: {'all_loc': DataFrame}}`` where the frame has ``'mean'``
        and ``'std'`` columns. The index of the first treatment's
        ``cumulative`` frame is used as the x-axis for every trace.
    treatments_to_merge : dict or False, optional
        ``{merged_label: [treatment, ...]}``. When a dict is given, the listed
        treatments are averaged into one trace per label and a fitted curve is
        drawn for each; any other value plots every treatment separately.
    '''
    def __init__(self, cumulative, new, treatments_to_merge=False):
        self.cumulative = cumulative
        self.new = new
        self.treatments_to_merge = treatments_to_merge
        self.treatments = list(self.cumulative.keys())
        # x-axis values are taken from the first treatment only
        self.full_doy_range = self.cumulative[self.treatments[0]]['all_loc'].index

    def _add_merged_traces(self, fig, per_treatment, row, fit_func):
        '''
        Add, for every merge group, the averaged data trace, its fitted curve
        and an R2/NRMSE annotation to subplot ``row``.

        ``fit_func(x, y)`` must return the fitted values at ``x``.
        '''
        # The fit runs on 0..n-1 sample positions, not on the index values
        x_positions = np.arange(0, len(self.full_doy_range))
        # Annotations sit 1/20 of the way along the x-axis
        label_x = self.full_doy_range[int(len(self.full_doy_range)/20)]

        for rank, (merge_key, merge_list) in enumerate(self.treatments_to_merge.items(), start=1):
            summaries = [per_treatment[treatment]['all_loc'] for treatment in merge_list]
            # Unweighted mean of the per-treatment means
            merged_mean = pd.concat([summary['mean'] for summary in summaries], axis=1).mean(axis=1)
            std_concatenated = pd.concat([summary['std'] for summary in summaries], axis=1)
            pooled_std = np.sqrt((std_concatenated**2).mean(axis=1)) # RMS method
            fig.add_trace(go.Scatter(x=self.full_doy_range, y=merged_mean, mode='lines+markers', error_y=dict(type='data', array=pooled_std, thickness=1, width=2, visible=True), name=f"{merge_key}"), row=row, col=1)

            # Fit the model curve to the merged series
            fitted = fit_func(x_positions, merged_mean)
            stats = calculate_fit_stats(merged_mean, fitted)
            fig.add_trace(
                go.Scatter(x=self.full_doy_range, y=fitted, mode='lines', line=dict(color="black", dash="dash"), name=f'{merge_key}--fitted'),
                row=row, col=1)
            fig.add_annotation(
                text=f"R2 = {stats['R2']:.2f}<br>NRMSE = {stats['NRMSE']:.2f}",
                x=label_x,
                # Each successive group is placed 20% lower so the boxes do not overlap
                y=(1- 0.2*rank)*max(fitted+pooled_std),
                showarrow=False,
                xref=f"x{row}",  # x-axis of this subplot
                yref=f"y{row}",  # y-axis of this subplot
                font=dict(size=6, color="black"),
                align="left",
                bordercolor="black",
                borderwidth=1,
                bgcolor="white")

    def _add_treatment_traces(self, fig, per_treatment, row):
        '''Add one mean +/- std trace per treatment to subplot ``row``.'''
        for treatment in self.treatments:
            summary = per_treatment[treatment]['all_loc']
            fig.add_trace(
                go.Scatter(x=self.full_doy_range, y=summary['mean'], mode='lines+markers', error_y=dict(type='data', array=summary['std'], thickness=1, width=2, visible=True), name=treatment),
                row=row, col=1)

    def _BB_treatment_plot(self, output_path="bud_num_interactive_plot.html"):
        '''
        Draw the cumulative and daily-new subplots and save them as HTML.

        In merge mode the cumulative series is fitted with ``logistic_fit``
        and the daily-new series with ``gaussian_fit``.

        Parameters
        ----------
        output_path : str, optional
            Destination of the HTML file.
        '''
        # Create 2 subplots for cumulative and new buds over time
        fig = make_subplots(rows=2, cols=1, subplot_titles=["Cumulative Buds", "Daily New Buds"])

        merge_mode = isinstance(self.treatments_to_merge, dict)
        panels = [(self.cumulative, 1, logistic_fit), (self.new, 2, gaussian_fit)]
        for per_treatment, row, fit_func in panels:
            if merge_mode:
                self._add_merged_traces(fig, per_treatment, row, fit_func)
            else:
                self._add_treatment_traces(fig, per_treatment, row)

        # Set y-axis labels for each subplot
        for row_num in (1, 2):
            fig.update_yaxes(title_text="Mean bud number", row=row_num, col=1)

        # Update layout
        fig.update_layout(
            title="Cumulative & Daily New Buds Over Time",
            showlegend=True,
            height=700, width=1200)

        fig.update_xaxes(
            tickmode="array",
            tickvals=self.full_doy_range,  # Use every x value as a tick
            tickangle=45,                  # Rotate for better visibility
            tickfont=dict(size=8))

        # Save as interactive HTML
        fig.write_html(output_path)
    
    
    # def _BB_location_plot(self):
    #     # Create 2 subplots for cumulative and new buds over time
    #     fig = make_subplots(rows=2, cols=1, subplot_titles=[f"Cumulative Buds", f"Daily New Buds"])
    #     # Add first subplot (Cumulative Sum)
    #     [fig.add_trace(
    #         go.Scatter(x=self.full_doy_range, y=self.cumulative[treatment]['all_loc']['mean'], mode='lines+markers', error_y=dict(type='data', array=self.cumulative[treatment]['all_loc']['std'], thickness=1, width=2, visible=True), name=treatment), row=1, col=1) for treatment in self.treatments]

        
    
    
    
    # def _origin_predict_plot(self):
    #     # Create 2 subplots for cumulative and new buds (original vs predicted)
    #     fig = make_subplots(rows=2, cols=1, subplot_titles=["Cumulative Buds", "Daily New Buds"])

    #     fig.add_trace(go.Scatter(
    #         x=self.fitted_poly['cumulative'], y=self.all_trtmnts['cumulative'].values, mode='markers',
    #         name='Original vs Fitted', marker=dict(color='blue')), row=1, col=1)
    #     fig.add_trace(go.Scatter(
    #         x=self.all_trtmnts['cumulative'].values, y=self.all_trtmnts['cumulative'].values, mode='lines',
    #         name='Cumulative 1:1', marker=dict(color='red')), row=1, col=1)
        
    #     # Second subplot
    #     fig.add_trace(go.Scatter(
    #         x=self.fitted_poly['new'], y=self.all_trtmnts['new'].values, mode='markers',
    #         name='Original vs Fitted', marker=dict(color='blue')), row=2, col=1)
    #     fig.add_trace(go.Scatter(
    #         x=self.all_trtmnts['new'].values, y=self.all_trtmnts['new'].values, mode='lines',
    #         name='New 1:1', marker=dict(color='red')), row=2, col=1)
        
        # Add stats as an annotation
        # fig.add_annotation(
        #     text=f"R2 = {stats_cumulative['R2']:.2f}<br>NRMSE = {stats_cumulative['NRMSE']:.2f}",
        #     x=fitted_cumulative[int(len(fitted_cumulative)/3)],  # Position at 1/3 of x-axis
        #     y=max(fitted_cumulative),      # Position near the top of y-axis
        #     showarrow=False,
        #     xref="x1",  # Referencing x-axis for subplot 1
        #     yref="y1",  # Referencing y-axis for subplot 1
        #     font=dict(size=10, color="black"),
        #     align="left",
        #     bordercolor="black",
        #     borderwidth=1,
        #     bgcolor="white")
        
        # fig.add_annotation(
        #     text=f"R2 = {stats_new['R2']:.2f}<br>NRMSE = {stats_new['NRMSE']:.2f}",
        #     x=fitted_new[int(len(fitted_new)/3)],  # Position at 1/3 of x-axis
        #     y=max(fitted_new),      # Position near the top of y-axis
        #     showarrow=False,
        #     xref="x2",  # Referencing x-axis for subplot 2
        #     yref="y2",  # Referencing y-axis for subplot 2
        #     font=dict(size=10, color="black"),
        #     align="left",
        #     bordercolor="black",
        #     borderwidth=1,
        #     bgcolor="white")
        
    #     # Set x-axis & y-axis labels for each subplot
    #     [fig.update_yaxes(title_text="Original", title_font=dict(size=10), row=row_num, col=1) for row_num in [1,2]]
    #     [fig.update_xaxes(title_text="Predicted", title_font=dict(size=10), row=row_num, col=1) for row_num in [1,2]]
    #     # Customize layout
    #     fig.update_layout(title="Original vs. Fitted Data", template="plotly_white")

    #     # Save as interactive HTML file
    #     fig.write_html("original vs predicted.html")

In [85]:
# Load environment variables
load_dotenv()
FOLDER_PATH = os.getenv('FOLDER_PATH')
if FOLDER_PATH is None:
    # Fail here with a clear message instead of a TypeError later, when the
    # value is joined with a file name.
    raise EnvironmentError("FOLDER_PATH is not set; define it in the environment or in a .env file")

# Define the expected column names for the final concatenated DataFrame
CULTIVAR = "Cultivar"
TREATMENT = "Treatment"
LOCATION = "Location"

# Select the interpolation method
# NOTE(review): this name is not referenced anywhere else in the visible cells;
# interpolate_full_range uses 'linear' directly - confirm whether it should be wired in.
interpolation_method = 'linear'#, 'akima', 'pchip', 'quadratic'

def is_date_column(col):
    '''
    Return True when the column label ``col`` represents a calendar date.

    Date-like objects (``datetime``, ``Timestamp``, ``datetime64``) and strings
    that ``pd.to_datetime`` can parse count as dates. Numbers, ``None`` and
    anything that parses to ``NaT`` do not.
    '''
    # pd.to_datetime would read a bare number as an offset from the epoch and
    # succeed, so numeric labels must be rejected before trying to parse.
    if col is None or isinstance(col, (bool, int, float, np.number)):
        return False
    try:
        parsed = pd.to_datetime(col)
    except Exception:
        # Unparseable label; KeyboardInterrupt/SystemExit are not swallowed
        return False
    # NaT is not a Timestamp instance, so missing values are rejected too
    return isinstance(parsed, pd.Timestamp)

with open("config_MultiDataFiles.json", "r") as file:
    config = json.load(file)

# Initialize an empty list to store filtered DataFrames
filtered_dfs = []

# Process each file mentioned in the config
for file_name, file_info in config.items():
    file_path = os.path.join(FOLDER_PATH, file_name)

    # Open each workbook once and make sure it is closed again, instead of
    # re-opening it for every sheet and never releasing the handle.
    with pd.ExcelFile(file_path, engine="openpyxl") as xls:
        # Process each sheet
        for sheet_info in file_info['sheets']:
            # Extract sheet names, cultivars, and treatments
            sheet_name = sheet_info['sheet_name']
            location = sheet_info['location']
            # Only the first (column, value) pair of each mapping is used
            cultivar_col, cultivar_name = list(sheet_info['cultivar'].items())[0]
            treatment_col, treatment_name = list(sheet_info['treatments'].items())[0]

            df = pd.read_excel(xls, sheet_name=sheet_name)

            # Ensure columns exist before filtering
            if cultivar_col not in df.columns or treatment_col not in df.columns:
                print(f"Warning: Missing required column(s) in {file_name} - sheet: {sheet_name}")
                continue

            # Rename the Dates to Days of Year (DOY); the mapping is built once
            # and reused for both the rename and the column selection.
            doy_by_date = {col: pd.to_datetime(col).dayofyear for col in df.columns if is_date_column(col)}
            doy_column = list(doy_by_date.values())
            df = df.rename(columns=doy_by_date)

            # Apply filtering for the given cultivar & treatments
            selected_rows = df[df[cultivar_col].isin([cultivar_name]) & df[treatment_col].isin([treatment_name])]

            # Keep only the identifying columns and the DOY columns
            columns_to_select = [cultivar_col, treatment_col] + doy_column

            # Chained, non-inplace operations so nothing is assigned into a
            # filtered slice of ``df``.
            filtered_df = (
                selected_rows[columns_to_select]
                .rename(columns={cultivar_col: CULTIVAR, treatment_col: TREATMENT})
                .assign(**{LOCATION: location})   # Add the LOCATION column
                .dropna()                         # Remove rows with any NaN values
                .reset_index(drop=True)
            )

            # Store the filtered DataFrame
            filtered_dfs.append(filtered_df)

# Concatenate all filtered DataFrames
if not filtered_dfs:
    raise ValueError("No sheet listed in config_MultiDataFiles.json produced any data")
final_df = pd.concat(filtered_dfs, ignore_index=True, sort=True)



# Calculate cumulative/new daily buds (mean & std per day of year) for every
# treatment, over all locations ('all_loc') and for each location separately.

def _mean_std_by_day(frame, days):
    '''Per-day mean and population std (ddof=0), rounded to 2 decimals, indexed by day.'''
    values = frame[days]
    return pd.DataFrame(
        {'mean': values.mean().round(2).to_numpy(),
         'std': values.std(ddof=0).round(2).to_numpy()},
        index=days)


def _summarise_buds(rows, days=None):
    '''
    Interpolate ``rows`` over the full day range and summarise them.

    Returns ``(interpolated, days, cumulative_stats, new_stats)``. ``days`` is
    taken from the interpolated frame unless it is supplied by the caller.
    '''
    interpolated = interpolate_full_range(rows)
    if days is None:
        days, _ = numeric_nonNumeric_col(interpolated)

    # Daily differences between consecutive interpolated days; the first day
    # has no predecessor and keeps its cumulative value.
    increments = interpolated[days].copy()
    increments.iloc[:, 1:] = increments.iloc[:, 1:].values - increments.iloc[:, :-1].values

    return interpolated, days, _mean_std_by_day(interpolated, days), _mean_std_by_day(increments, days)


cumulative_daily = defaultdict(dict)
new_daily = defaultdict(dict)
final_df_treatment_loc_specific = defaultdict(dict)

all_treatments = final_df[TREATMENT].unique()
all_locations = final_df[LOCATION].unique()

for trtmnt in all_treatments:
    is_treatment = final_df[TREATMENT].isin([trtmnt])

    # All locations pooled
    interpolated, numeric_columns, cumulative_stats, new_stats = _summarise_buds(final_df[is_treatment])
    final_df_treatment_loc_specific[trtmnt]['all_loc'] = interpolated
    cumulative_daily[trtmnt]['all_loc'] = cumulative_stats
    new_daily[trtmnt]['all_loc'] = new_stats

    # Each location on its own
    for loc in all_locations:
        loc_rows = final_df[is_treatment & final_df[LOCATION].isin([loc])]
        if loc_rows.empty:  # this treatment was not recorded at this location
            continue
        # The day columns of the pooled frame are reused for the per-location stats
        interpolated, _, cumulative_stats, new_stats = _summarise_buds(loc_rows, days=numeric_columns)
        final_df_treatment_loc_specific[trtmnt][loc] = interpolated
        cumulative_daily[trtmnt][loc] = cumulative_stats
        new_daily[trtmnt][loc] = new_stats
    


In [86]:
# Each merged label maps to the treatment names that are averaged under it
merge_dict = {
    'Control (merged)': ['Con1', 'Control early', 'Control'],
    'HC early (merged)': ['HCT1', 'Hi-Cane early', 'HiCane 1'],
    'HC late (merged)': ['HCT5', 'Hi-Cane late', 'HiCane 5'],
}
plotter = InteractivePlotGen(cumulative=cumulative_daily, new=new_daily, treatments_to_merge=merge_dict)
plotter._BB_treatment_plot()