## Sandbox notebook


In [None]:
import os
import time 
import shutil  
import numpy as np
import pandas as pd
pd.set_option("display.precision", 20)

from script_01_00 import generate_initial_variables
from script_01_01 import generate_raw_confounds
from script_01_02 import generate_nonlin_confounds

from nets.nets_load_match import nets_load_match
from nets.nets_inverse_normal import nets_inverse_normal 
from nets.nets_normalise import nets_normalise 
from nets.nets_demean import nets_demean
from nets.nets_deconfound_multiple import nets_deconfound_multiple

from duplicate.duplicate_categorical import duplicate_categorical
from duplicate.duplicate_demedian_norm_by_site import duplicate_demedian_norm_by_site

from preproc.datenum import datenum
from preproc.switch_type import switch_type
from preproc.days_in_year import days_in_year
from preproc.filter_columns_by_site import filter_columns_by_site

from memmap.MemoryMappedDF import MemoryMappedDF
from memmap.read_memmap_df import read_memmap_df
from memmap.addBlockToMmap import addBlockToMmap

In [None]:
data_dir = '/well/win/projects/ukbiobank/fbp/confounds/data/72k_data/'

# Output directory (will eventually be equal to data_dir)
out_dir = '/well/nichols/users/inf852/confounds/data/'

In [None]:

# Precomputed filenames
p_fname = os.path.join(os.getcwd(),'saved_memmaps','p.npz')
ve_fname = os.path.join(os.getcwd(),'saved_memmaps','ve.npz')

# Read in precomputed p and ve
p = read_memmap_df(p_fname)
ve = read_memmap_df(ve_fname)

In [None]:

from scipy.stats import scoreatpercentile
# Get the average and maximum variance explained
avg_ve = ve[:,:].mean()
max_ve = ve[:,:].max()

# Get percentage thresholds
thr_for_avg = scoreatpercentile(avg_ve, 95)
thr_for_ve = max(0.75, scoreatpercentile(ve[:,:].dropna().values.flatten(), 99.9))

In [None]:
thr_for_ve

In [None]:
flattened_ve=ve[:,:].values.flatten()
flattened_ve = flattened_ve[~np.isnan(flattened_ve)]
thr_for_ve = max(0.75, scoreatpercentile(flattened_ve, 99.9))

In [None]:
thr_for_ve

In [None]:

# Read in precomputed memmaps
IDPs = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','IDPs.npz'))
IDPs_deconf = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','IDPs_deconf.npz'))
nonIDPs = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','nonIDPs.npz'))
misc = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','misc.npz'))
confounds = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','confounds.npz'))
nonlinear_confounds = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','nonlinear_confounds.npz'))
# p1 = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','p.npz'))
# nonlinear_confounds = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','nonlinear_confounds_reduced.npz'))
# IDPs_deconf_ct = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','IDPs_deconf_ct.npz'))
# confounds_with_ct = read_memmap_df(os.path.join(os.getcwd(),'saved_memmaps','confounds_with_ct.npz'))


In [None]:
nonlinear_confounds.shape

In [None]:

# NOTE(review): the operand that should sit between the two `+` signs (the
# value for the iframe `src` attribute) is missing. As written, the second `+`
# is parsed as a unary plus applied to a string, so evaluating this cell
# raises a TypeError. Fill in the intended source before running.
'<iframe src="' + + '" width="100%" height="500"></iframe>'

In [None]:
def ascii_loading_bar(percentage):
    """
    Render a fixed-width ASCII progress bar for a percentage value.

    Args:
        percentage (float): Progress value. Values outside the range
            [0, 100] are clamped to that range.

    Returns:
        str: A bar of 50 cells (filled cells first, then empty cells)
            wrapped in square brackets, followed by a space and the clamped
            percentage formatted to one decimal place.
    """
    # Total number of cells in the bar; change this to resize the bar
    width = 50

    # Clamp the input into [0, 100]
    clamped = max(0, min(100, percentage))

    # Number of filled cells, truncated towards zero
    n_filled = int(width * (clamped / 100))

    # Filled cells followed by however many empty cells complete the bar
    cells = '█' * n_filled + '░' * (width - n_filled)

    return f"[{cells}] {clamped:.1f}%"

print(ascii_loading_bar(21))

In [None]:
def my_log(message, mode='a', filename='file.html'):
    """
    Write a message to an HTML log file, creating the page if needed.

    The file is always left as a complete HTML page: a fixed header, one
    ``<p>...</p>`` line per message, then the closing ``</body>`` and
    ``</html>`` tags. The message is inserted verbatim (it is not
    HTML-escaped) and is expected to fit on a single line.

    Args:
        message (str): The message to be written.
        mode (str, optional): 'a' appends the message as a new entry. 'r'
            replaces the most recent entry (or appends if there is no entry
            yet). Any other value leaves the existing entries unchanged.
            Defaults to 'a'.
        filename (str, optional): The name of the HTML file. Defaults to
            'file.html'.

    Returns:
        None
    """
    # Page header, written once when the file is created (or found empty)
    header = [
        '<!DOCTYPE html>\n<html>\n<head>\n<title>Confounds Log</title>\n',
        '<style>\nbody { font-family: Arial, sans-serif; margin: 20px; background-color: #e6f0ff; }\n',
        'h1 { color: #333; position: sticky; top: 0; background-color: #e6f0ff; padding: 10px; }\n',
        'hr { border: none; border-top: 1px solid #ccc; margin: 10px 0; }\n</style>\n</head>\n<body>\n<h1>Confounds Log</h1>\n<hr>\n',
    ]

    # Closing tags, re-added after the entries on every write
    footer = ['</body>\n', '</html>']

    # The line holding this message
    entry = f'<p>{message}</p>\n'

    # Read the current contents; a missing file is treated as empty
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except FileNotFoundError:
        lines = []

    # Strip the closing tags (and trailing blank lines) so that new entries
    # are placed inside <body> rather than after </html>
    while lines and lines[-1].strip() in ('</body>', '</html>', ''):
        lines.pop()

    # Nothing left means a new or empty file: start from the header
    if not lines:
        lines = list(header)

    if mode == 'a':
        lines.append(entry)
    elif mode == 'r':
        # Replace the latest entry if there is one, otherwise just add it
        if lines[-1].startswith('<p>'):
            lines[-1] = entry
        else:
            lines.append(entry)

    # Rewrite the whole file; opening with 'w' truncates it, so a shorter
    # replacement cannot leave stale bytes at the end
    with open(filename, 'w', encoding='utf-8') as file:
        file.writelines(lines + footer)
            
for i in range(100):
    my_log('test' + str(i), mode='a', filename=os.path.join(os.getcwd(),'tmp3.html'))

In [None]:
from datetime import datetime
print(str(datetime.now()))

## draft 16

In [None]:

# Convert input to memory mapped dataframes if it isn't already
all_conf = switch_type(all_conf, out_type='MemoryMappedDF')
IDPs = switch_type(IDPs, out_type='MemoryMappedDF')

# Confound groups we are interested in.
conf_name = ['AGE', 'AGE_SEX', 'HEAD_SIZE',  'TE', 'STRUCT_MOTION', 
             'DVARS', 'HEAD_MOTION', 'HEAD_MOTION_ST', 'TABLE_POS', 
             'EDDY_QC']

# Get all the confounds in the group
conf_group = all_conf.get_groups(conf_name)

# Get the subject ids
sub_ids = IDPs.index

# -------------------------------------------------------------------------
# Estimate the block size (number of columns we want to allow in memory at
# a given time).
# -------------------------------------------------------------------------
# Developer note: The below is only a rough estimate, but is fairly robust
# as it is a little conservative. The rule of thumb is to take the maximum
# amount of memory (MAXMEM), divide it by the number of subjects we have,
# divide by 64 (the bit-width of a float64, the largest element type we
# expect) and then divide by 8 (as we may want to make several copies of
# whatever we load in, but we rarely make more than 8). Note that MAXMEM is
# in bytes and a float64 occupies 8 bytes, so dividing by 64 makes the
# estimate conservative by a further factor of 8. The resulting number
# should be roughly the number of columns of a dataframe we are able to
# load in at a time. This doesn't need to be perfect as often python can
# handle more - it is just a precaution, and does improve efficiency
# substantially.
# -------------------------------------------------------------------------

# Rough estimate of maximum memory (bytes)
MAXMEM = 2**32

# Get the number of subjects
n_sub = len(sub_ids)

# Block size computation
blksize = int(MAXMEM/n_sub/8/64)

# -------------------------------------------------------------------------

# Read in the IDs for site
site_ids = nets_load_match(os.path.join(data_dir, 'ID_SITE.txt'), sub_ids)

# Get the unique site ids
unique_site_ids = np.unique(site_ids)

# Initialise inds_per_site as a list to hold the row indices for each site
inds_per_site = []

# Loop over each value in site ids
for site_id in unique_site_ids:

    # Find the row indices where every element in a row of site_ids matches
    # the current site_id
    indices = np.where((site_ids == site_id).all(axis=1))[0]

    # Append the found indices to the inds_per_site list
    inds_per_site.append(indices)

# Delete the indices
del indices

# -------------------------------------------------------------------------

# Initialise empty array to store results
conf_nonlin = pd.DataFrame(index=conf_group.index)

# Load in confounds
conf = all_conf[:,:]

# Site number
for site_index in (unique_site_ids + 1):
    
    # Subset the confounds to a specific site
    conf_group_site = filter_columns_by_site(conf_group, site_index)

    # Get indices for the current site
    site_indices = inds_per_site[site_index-1] 

    # Reduce to just the indices we're interested in
    conf_group_site = conf_group_site.iloc[site_indices, :]

    # Get all the confounds at the site
    all_conf_site = conf.iloc[site_indices, :]

    # Get index
    site_index = all_conf_site.index

    # Get conf_group_site squared
    conf_group_site_squared = nets_normalise(conf_group_site**2)
    conf_group_site_squared.columns = [f"{col}_squared" for col in conf_group_site_squared.columns]

    # Get conf_group_site inverse normalised
    conf_group_site_inormal = nets_inverse_normal(conf_group_site);
    conf_group_site_inormal.columns = [f"{col}_inormal" for col in conf_group_site_inormal.columns]

    # Get conf_group_site squared inverse normalised
    conf_group_site_squared_inormal = nets_inverse_normal(conf_group_site_squared);
    conf_group_site_squared_inormal.columns = [f"{col}_inormal" for col in conf_group_site_squared_inormal.columns]

    # Concatenate them side by side
    conf_group_site_nonlin = pd.concat([conf_group_site_squared,
                                        conf_group_site_inormal,
                                        conf_group_site_squared_inormal], axis=1)
    conf_group_site_nonlin.index = site_index

    # Catch any nans from fully empty columns (we'll drop these later)
    conf_group_site_nonlin = conf_group_site_nonlin.fillna(0)

    # -------------------------------------------------------
    # Deconfound for this site
    # -------------------------------------------------------

    # Perform deconfounding
    conf_nonlin_deconf = nets_deconfound_multiple(conf_group_site_nonlin,
                                                  all_conf_site,
                                                  mode='svd',
                                                  blksize=blksize)
    
    # Reindex the dataframe to fill off-site values with zeros
    conf_nonlin_deconf = conf_nonlin_deconf.reindex(conf_group.index).fillna(0)

    # Drop any columns with fewer than 5 non-NaN values
    # -------------------------------------------
    na_columns = ((~conf_group_site.isna()).sum(axis=0) >= 5)

    # Columns for squared
    na_columns_squared = na_columns.copy()
    na_columns_squared.index = [column + '_squared' for column in na_columns_squared.index]
    
    # Columns for inormal
    na_columns_inormal = na_columns.copy()
    na_columns_inormal.index = [column + '_inormal' for column in na_columns_inormal.index]
    
    # Columns for squared inormal
    na_columns_squared_inormal = na_columns.copy()
    na_columns_squared_inormal.index = [column + '_squared_inormal' for column in na_columns_squared_inormal.index]
    
    # Combine
    na_columns = pd.concat((na_columns_squared,na_columns_inormal,na_columns_squared_inormal))
    
    # MARKER - replicating nets_unconfound_par threshold
    #conf_nonlin_deconf = conf_nonlin_deconf.loc[:, (conf_nonlin_deconf.abs() >1e-8).sum(axis=0) >= 5]
    conf_nonlin_deconf = conf_nonlin_deconf.loc[:, na_columns]
    print('executed newer step')
    
    # Concatenate results
    conf_nonlin = conf_nonlin.join(conf_nonlin_deconf, how='outer')

In [None]:
conf_nonlin

In [None]:
# Get names directory
names_dir = os.path.join(data_dir, '..', 'NAMES_confounds')

In [None]:
# Set the IDPs and confounds we need
IDPs = IDPs
confounds = confounds_with_ct

In [None]:

# Get the subject IDs
sub_ids = IDPs.index

# -------------------------------------------------------------------------
# Estimate the block size (number of columns we want to allow in memory at
# a given time).
# -------------------------------------------------------------------------
# Developer note: The below is only a rough estimate, but is fairly robust
# as it is a little conservative. The rule of thumb is to take the maximum
# amount of memory (MAXMEM), divide it by the number of subjects we have,
# divide by 64 (the bit-width of a float64, the largest element type we
# expect) and then divide by 8 (as we may want to make several copies of
# whatever we load in, but we rarely make more than 8). Note that MAXMEM is
# in bytes and a float64 occupies 8 bytes, so dividing by 64 makes the
# estimate conservative by a further factor of 8. The resulting number
# should be roughly the number of columns of a dataframe we are able to
# load in at a time. This doesn't need to be perfect as often python can
# handle more - it is just a precaution, and does improve efficiency
# substantially.
# -------------------------------------------------------------------------

# Rough estimate of maximum memory (bytes)
MAXMEM = 2**32

# Get the number of subjects
n_sub = len(sub_ids)

# Block size computation
blksize = int(MAXMEM/n_sub/8/64)



In [None]:

# -------------------------------------------------------------------------
# Deconfound IDPs
# -------------------------------------------------------------------------

# Switch type to reduce transfer costs
confounds = switch_type(confounds, out_type='filename')
IDPs = switch_type(IDPs, out_type='filename')

# Deconfound IDPs
IDPs_deconf = nets_deconfound_multiple(IDPs, confounds, 'nets_svd', 
                                       blksize=blksize, coincident=False,
                                       cluster_cfg=cluster_cfg)


In [None]:

# Switch IDPs back
IDPs = switch_type(IDPs, out_type='pandas') 

In [None]:

# Get day fraction (time of day)
day_fraction = nonIDPs[:,'TOD']

# Normalise day fraction
conf_acq_time_linear = nets_normalise(day_fraction)
conf_acq_time_linear = conf_acq_time_linear.fillna(0)


In [None]:
conf_acq_time_linear

In [None]:

    conf_acq_time_linear = conf_acq_time_linear.sort_values(by='TOD')

In [None]:

    # Get sorted indices
    sub_ids_sorted = conf_acq_time_linear.index

In [None]:

# Sort IDPs and IDPs_deconf based on sorted sub_ids
IDPs_sorted = IDPs.loc[sub_ids_sorted,:]
IDPs_deconf_sorted = IDPs_deconf.loc[sub_ids_sorted,:]

In [None]:

# Read in the IDs for site
site_ids = nets_load_match(os.path.join(data_dir, 'ID_SITE.txt'), sub_ids)
site_ids.index = sub_ids
site_ids

In [None]:


# Sort site ids
site_ids_sorted = site_ids.loc[sub_ids_sorted,:]
site_ids_sorted

In [None]:

# Get the unique site ids
unique_site_ids = np.unique(site_ids)

# Initialise inds_per_site_sorted as a dict mapping each site id to its row indices
inds_per_site_sorted = {}

# Loop over each value in site ids
for site_id in (unique_site_ids + 1):

    # Find the row indices where every element in a row of site_ids_sorted
    # matches the current site (site_id - 1, since site_id was offset by one)
    indices = np.where((site_ids_sorted == site_id-1).all(axis=1))[0]

    # Store the found indices in inds_per_site_sorted under the current site_id
    inds_per_site_sorted[site_id] = indices

# Delete the indices
del indices

In [None]:
num_sites = len(inds_per_site_sorted)

In [None]:
# Sigma value
sigma = 0.1
import time

In [None]:
# # Loop through sites
# for site_id in inds_per_site_sorted:

#     print('Running site ', str(site_id))
#     t1_total = time.time()

#     # Get subjects for this site
#     inds_site = inds_per_site_sorted[site_id]

#     # Get the IDPs for this site
#     IDPs_for_site = IDPs_deconf_sorted.iloc[inds_site,:]

#     # Initialise smoothed IDPs for this site
#     smoothed_IDPs_for_site = pd.DataFrame(np.zeros(IDPs_for_site.shape),
#                                           index=IDPs_for_site.index,
#                                           columns=IDPs_for_site.columns)

#     # Get IDPs for site as numpy array
#     IDPs_for_site = IDPs_for_site.values

#     # Loop through subjects within site
#     for j, sub_id in enumerate(inds_site):

#         print('Iteration ', j, '/', len(inds_site))

#         t1 = time.time()
#         # Get time delta
#         timedelta = conf_acq_time_linear.iloc[inds_site[j],:]-conf_acq_time_linear.iloc[inds_site,:]
#         t2 = time.time()
#         print('marker1: ', t2-t1)

#         t1 = time.time()
#         # Get the gaussian kernel
#         gauss_kernel = np.exp(-0.5*((timedelta/sigma)**2))
#         t2 = time.time()
#         print('marker2: ', t2-t1)

#         t1 = time.time()
#         # Handle any potential overflow
#         gauss_kernel[gauss_kernel.abs()<1e-10]=0
#         t2 = time.time()
#         print('marker3: ', t2-t1)

        
#         t1 = time.time()
#         # Get the numerator and denominator for smoothing
#         numerator = np.nansum(IDPs_for_site*gauss_kernel.values,axis=0)
#         t2 = time.time()
#         print('marker3.5: ', t2-t1)

#         t1 = time.time()
#         denominator = np.sum((1*~np.isnan(IDPs_for_site))*gauss_kernel.values,axis=0)
#         t2 = time.time()
#         print('marker4: ', t2-t1)

#         t1 = time.time()
#         # Smoothed IDPs for site
#         smoothed_IDPs_for_site.iloc[j, :] = numerator/denominator
#         t2 = time.time()
#         print('marker5: ', t2-t1)

#         print(numerator/denominator)

    
#     t2_total = time.time()
#     print('Done site ', str(site_id))
#     print('Time elapsed: ', t2_total-t1_total)


        

In [None]:
from nets.nets_smooth_multiple import nets_smooth_multiple
from nets.nets_svd import nets_svd

In [None]:
# Number of time points per block, no 8 is included here as
# we only ever construct the relevant matrix once in 
# nets_smooth_single (this is controlling the size of
# the xeval*xdata matrix)
blksize_time = int(MAXMEM/IDPs.shape[0]/64)


In [None]:
t1 = time.time()

# Dict to store smoothed IDPs and pca results
smoothed_IDPs_sorted_dict = {}
principal_components_sorted_dict = {}
esm_sorted_dict = {}

# Loop through sites
for site_id in inds_per_site_sorted:
    
    print('Smoothing confounds for site ', str(site_id))
    t1_total = time.time()
    
    # Get subjects for this site
    inds_site = inds_per_site_sorted[site_id]
    
    # Get the IDPs for this site
    IDPs_for_site = IDPs_deconf_sorted.iloc[inds_site,:]
    
    # Get the acquisition times for this site
    times_for_site = conf_acq_time_linear.iloc[inds_site,:]
    
    # Smooth the IDPs
    smoothed_IDPs_for_site = nets_smooth_multiple(times_for_site, IDPs_for_site, sigma,
                                                  blksize=blksize, blksize_time=blksize_time,
                                                  cluster_cfg=cluster_cfg)

    print('Confounds smoothed for site ', str(site_id))

    # Compute svd of IDPs
    principal_components_sorted, esm,_ = nets_svd(smoothed_IDPs_for_site.values)

    # Save results
    smoothed_IDPs_sorted_dict[site_id] = smoothed_IDPs_for_site
    principal_components_sorted_dict[site_id] = principal_components_sorted
    esm_sorted_dict[site_id] = esm

t2 = time.time()
print(t2-t1)

In [None]:
type(IDPs_deconf)

In [None]:
from nets.nets_normalise import nets_normalise

In [None]:

# Estimate the number of temporal components for each site by choosing the
# smallest number of principal components that explains at least 99% of the
# variance in the smoothed IDPs.
num_temp_comp_sorted = {}
conf_acq_time_dict = {}

# Loop through sites
for site_id in principal_components_sorted_dict:

    # Get the principal components for this site
    principal_components_site = principal_components_sorted_dict[site_id]

    # Get the smoothed IDPs for this site
    smoothed_IDPs_site = smoothed_IDPs_sorted_dict[site_id]

    # Boolean mask of the rows that contain no nan values
    non_nan_rows = ~smoothed_IDPs_site.isna().any(axis=1)

    # Record the index of the rows we keep. This must be filtered with the
    # same mask as the principal components below, otherwise constructing the
    # dataframe at the end of the loop fails whenever a row was dropped.
    site_index = smoothed_IDPs_site.index[non_nan_rows.values]

    # Filter principal components and smoothed IDPs row wise
    principal_components_site = principal_components_site[non_nan_rows.values,:]
    smoothed_IDPs_site = smoothed_IDPs_site[non_nan_rows].values

    # Total sum of squares of the smoothed IDPs (does not change with the
    # number of components, so compute it once)
    denominator = np.sum(smoothed_IDPs_site.flatten() ** 2)

    # Number of principal components we have to choose from
    n_available = principal_components_site.shape[1]

    # Variance explained (percent) by the components considered so far
    max_ve = 0

    # Current number of principal components that we have considered
    n_current = 0

    # Add principal components one at a time until we have 99% variance
    # explained. The bound on n_current stops the loop from running forever
    # if 99% cannot be reached with the available components.
    while max_ve < 99 and n_current < n_available:

        # Consider one more component
        n_current = n_current + 1

        # Get n_current principal components
        current_pcs = principal_components_site[:,:n_current]

        # Pseudo-inverse, used to project onto the span of current_pcs
        current_pcs_pinv = np.linalg.pinv(current_pcs)

        # Compute projection
        proj = current_pcs @ (current_pcs_pinv @ smoothed_IDPs_site)

        # Compute variance explained in smoothed_IDPs_site by current_pcs
        numerator = 100 * np.sum(proj.flatten() ** 2)
        max_ve = numerator / denominator

        print(n_current, max_ve)

    # Save number of components
    num_temp_comp_sorted[site_id] = n_current

    # Save new array
    principal_components_sorted_dict[site_id] = pd.DataFrame(principal_components_site[:,:n_current],
                                                             index=site_index)
    conf_acq_time_dict[site_id] = nets_normalise(principal_components_sorted_dict[site_id]).fillna(0)

    print('Estimated number of components for site ', site_id, ': ', n_current)

*Matlab had 19 for site 1, 20 for site 2, 22 for site 3 and 21 for site 4 (82 total)*

*Python gives 18 for site 1, 19 for site 2, 21 for site 3 and 20 for site 4*

In [None]:
for site_id in smoothed_IDPs_sorted_dict:

    print(principal_components_sorted_dict[site_id].shape)

In [None]:
# Construct column names for temporal components
tc_colnames = []

# Loop through sites constructing colnames and converting principal components
for site_id in principal_components_sorted_dict:

    # Site columnnames 
    tc_colnames_site = ['ACQT_Site_' + str(site_id) + '__' + str(pc_id) for pc_id in range(1,num_temp_comp_sorted[site_id]+1)]

    # Replace header on site specific dataframes
    principal_components_sorted_dict[site_id].columns = tc_colnames_site

    # Update running column names
    tc_colnames = tc_colnames + tc_colnames_site
    
# Number of estimated components in total
num_temp_comp_sorted_total = 0

# Sum values over sites
for site_id in num_temp_comp_sorted:
    num_temp_comp_sorted_total = num_temp_comp_sorted_total + num_temp_comp_sorted[site_id]
    
# Reconstruct principal components confound dataframe
conf_acq_time = pd.DataFrame(np.zeros((n_sub,num_temp_comp_sorted_total)),
                             index = sub_ids_sorted,
                             columns = tc_colnames)

# Loop through sites inserting each site's temporal components into conf_acq_time
for site_id in principal_components_sorted_dict:

    # Add in temporal components sorted
    conf_acq_time.update(principal_components_sorted_dict[site_id])

# Convert the indexing back to the original order
conf_acq_time = conf_acq_time.loc[sub_ids,:]

In [None]:
confounds

In [None]:
timedelta = conf_acq_time_linear.iloc[inds_site[j],:]-conf_acq_time_linear.iloc[inds_site,:]

In [None]:
gauss_kernel = np.exp(-0.5*((timedelta/sigma)**2))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming you have a pandas DataFrame 'df' with a single column
data = gauss_kernel.iloc[:, 0]  # Get the single column as a pandas Series

# Create a range of indices from 1 to the length of the data
indices = range(1, len(data) + 1)

# Plot the data against the indices
# plt.figure(figsize=(10, 6))  # Adjust the figure size as needed
# plt.plot(indices, data)
# plt.xlabel('Index')
# plt.ylabel('Value')
# plt.title('Dataframe Column Plot')
# plt.show()

In [None]:

# Handle any potential overflow
gauss_kernel[gauss_kernel.abs()<1e-10]=0

In [None]:
IDPs_for_site = IDPs.iloc[inds_site,:]

In [None]:
(IDPs_for_site.values*gauss_kernel.values).shape

In [None]:
# Get the numerator and denominator for smoothing
numerator = np.nansum(IDPs_for_site.values*gauss_kernel.values,axis=0)
denominator = np.sum((1*~np.isnan(IDPs_for_site.values))*gauss_kernel.values,axis=0)

# Get the smoothed IDP


In [None]:
numerator

In [None]:
denominator

In [None]:
# This code was adapted from the below answer on stack overflow
# https://stackoverflow.com/questions/24143320/gaussian-sum-filter-for-irregular-spaced-points

def gaussian_sum_smooth(xdata, ydata, xeval, sigma, null_thresh=0.6):
    """Smooth irregularly spaced data with a Gaussian-weighted average.

    xdata, ydata : array
        The x- and y-coordinates of the data points.
        Both must be 1d and of equal length.

    xeval : array
        The x-coordinates at which the smoothed curve is evaluated.

    sigma : float
        Standard deviation of the Gaussian centred on each data point.
        Larger values give a smoother curve.

    null_thresh : float
        Minimum total weight required at an evaluation point. Where the
        summed weight falls below this value, np.nan is returned instead of
        an estimate, because the estimate would rest on very little data.
        Zero means an estimate is always returned. The default of 0.6 is
        approximately the weight contributed by a single data point one
        sigma away.
    """
    # Pairwise offsets: one row per evaluation point, one column per data
    # point
    offsets = xeval[:, None] - xdata

    # Gaussian weight of every data point at every evaluation point
    # (equal to 1.0 where the offset is 0)
    kernel = np.exp(-0.5 * ((offsets / sigma) ** 2))

    # Total weight available at each evaluation point
    total_weight = kernel.sum(1)

    # Weighted sum of the data at each evaluation point
    estimate = np.dot(kernel, ydata)

    # Blank out evaluation points that are too far from any data
    estimate[total_weight < null_thresh] = np.nan

    # Normalise the weighted sums into weighted averages
    return estimate / total_weight

In [None]:
xdata = conf_acq_time_linear.iloc[inds_site,:].values

ydata = IDPs_for_site[:,1:10]

# mask = ~np.isnan(ydata)

# xdata = xdata[mask]
xeval = np.array(xdata)
# ydata = ydata[mask]

null_thresh = 0.1

In [None]:
xdata.shape, xeval.shape, ydata.shape

In [None]:
t1 = time.time()
smoothed_IDPs=gaussian_sum_smooth(xdata, ydata, xeval, sigma, null_thresh=0.6)
t2 = time.time()
print(t2-t1)

In [None]:
smoothed_IDPs.shape

In [None]:
xdata.flatten().shape, ydata[:,1].shape


In [None]:
np.isnan(smoothed_IDPs).any()

In [None]:
# This code was adapted from the below answer on stack overflow
# https://stackoverflow.com/questions/24143320/gaussian-sum-filter-for-irregular-spaced-points

def nets_smooth(xdata, ydata, xeval, sigma, null_thresh=0.6):
    """Apply a gaussian sum filter to data, ignoring NaNs in ydata.

    xdata, ydata : array
        Arrays of x- and y-coordinates of data.
        xdata must have only one dimension of size > 1 and the same number
        of observations as ydata.
        ydata is normally two dimensional (n_obs x n_variables); a one
        dimensional ydata is treated as a single variable and a one
        dimensional result is returned.

    xeval : array
        Array of x-coordinates at which to evaluate the smoothed result.

    sigma : float
        Standard deviation of the Gaussian to apply to each data point.
        Larger values yield a smoother curve.

    null_thresh : float
        For evaluation points far from data points, the estimate will be
        based on very little data. If the total weight (over all data
        points, whether or not ydata is NaN there) is below this threshold,
        np.nan is returned for every variable at this location. Zero means
        always return an estimate. The default of 0.6 corresponds to
        approximately one sigma away from the nearest datapoint.

    Returns an array with one row per value in xeval and one column per
    variable in ydata. Entries for which no non-NaN observation carries any
    weight are np.nan.
    """

    # Flatten xdata and xeval (np.asarray lets array-likes through as well)
    xdata = np.asarray(xdata).flatten()
    xeval = np.asarray(xeval).flatten()

    # Work on a float array; remember whether we were handed a single
    # variable so the result can be returned with the same dimensionality
    ydata = np.asarray(ydata, dtype=float)
    single_variable = ydata.ndim == 1
    if single_variable:
        ydata = ydata[:, None]

    # Distance between every combination of xdata and xeval
    # each row corresponds to a value in xeval
    # each col corresponds to a value in xdata
    delta_x = xeval[:, None] - xdata

    # Calculate weight of every value in delta_x using Gaussian
    # Maximum weight is 1.0 where delta_x is 0
    weights = np.exp(-0.5 * ((delta_x / sigma) ** 2))

    # Mask of observed (non-NaN) entries in ydata
    observed = ~np.isnan(ydata)

    # Temporarily replace NaNs in ydata with zeros so they contribute
    # nothing to the weighted sums (the input array is left untouched)
    ydata_wo_nans = np.where(observed, ydata, 0.0)

    # Multiply each weight by every data point, and sum over data points
    smoothed = weights @ ydata_wo_nans

    # Nullify the result when the total weight is below threshold
    # This happens at evaluation points far from any data
    # 1-sigma away from a data point has a weight of ~0.6
    nan_mask = weights.sum(1) < null_thresh
    smoothed[nan_mask] = np.nan

    # Total weight of the observed data points, for every evaluation point
    # and every variable at once (replaces a Python loop over variables)
    observed_weight = weights @ observed.astype(float)

    # Normalise by the observed weight. Where no observed data point carries
    # any weight the result is nan; silence the resulting divide warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        smoothed = smoothed / observed_weight

    # Return result
    if single_variable:
        return smoothed[:, 0]
    return smoothed


In [None]:
weights.sum(1).shape

In [None]:
smoothed.shape
for k in np.arange(smoothed.shape[1]):
    
    # Get nan mask
    non_nan_mask = ~np.isnan(ydata[:,k])
    
    # Get smoothed
    print(smoothed[:,k] / weights[:,non_nan_mask].sum(1))

In [None]:
w[ 0.23876152  0.30856143 -0.19740716 ... -0.00821319 -0.25510581
 -0.14734484]
marker5:  0.000518798828125
[ 0.23761886  0.30677943 -0.19650863 ... -0.00709767 -0.25337046
 -0.14570244]
[ 0.23742288  0.30647581 -0.19635486 ... -0.00690842 -0.25307346
 -0.14542344]

In [None]:
non_nan_mask = ~np.isnan(ydata[:,k])

In [None]:
sum(non_nan_mask)

In [None]:
weights.shape, weights.sum(1).shape

In [None]:
weights[non_nan_mask,:].shape, weights[non_nan_mask,:].sum(1).shape

In [None]:
import os

# Opening of a fresh log page, up to and including the rule under the title.
_LOG_PAGE_HEAD = (
    '<!DOCTYPE html>\n<html>\n<head>\n<title>Confounds Log</title>\n'
    '<style>\nbody { font-family: Arial, sans-serif; margin: 20px; background-color: #e6f0ff; }\n'
    'h1 { color: #333; position: sticky; top: 0; background-color: #e6f0ff; padding: 10px; }\n'
    'hr { border: none; border-top: 1px solid #ccc; margin: 10px 0; }\n</style>\n</head>\n<body>\n<h1>Confounds Log</h1>\n<hr>\n'
)


def my_log2(message, mode='a', filename=None):
    """
    Write a message to an HTML file with a header, basic formatting, and styling.

    The log is a single HTML page whose last three lines are always the most
    recent ``<p>`` entry, ``</body>`` and ``</html>``. Access is serialised
    between jobs with an exclusive ``<filename>.lock`` file, which is always
    removed again, even when writing fails.

    Args:
        message (str): The message to be written. It is inserted verbatim
            between ``<p>`` tags (no HTML escaping is applied).
        mode (str, optional): 'a' to append a new entry, 'r' to replace the
            most recent entry. Any other value leaves an existing log
            unchanged. Ignored when the log is missing or empty, in which case
            a fresh page containing the message is created. Defaults to 'a'.
        filename (str, optional): The name of the HTML file. Defaults to None
            (no output).

    Returns:
        None

    Raises:
        IndexError: If an existing, non-empty log has too few lines to hold
            the expected footer. The file is left untouched in that case.
    """
    # Imported locally so this cell does not rely on earlier notebook cells.
    import time

    # No file requested: nothing to do (and nothing to lock).
    if filename is None:
        return

    # Create the lock file, so other jobs know we are writing to this file.
    # O_EXCL makes creation fail while another job holds the lock; sleep
    # briefly between attempts rather than hammering the filesystem.
    lock_path = filename + ".lock"
    while True:
        try:
            lock_fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_RDWR)
            break
        except FileExistsError:
            time.sleep(0.01)

    try:
        entry = '<p>' + message + '</p>\n'

        # Read the current log; a missing file is treated like an empty one.
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                lines = file.readlines()
        except FileNotFoundError:
            lines = []

        # Build the new contents in memory BEFORE opening for writing, so a
        # failure here cannot leave the log truncated.
        if not lines:
            # Empty or missing log: create the HTML structure
            lines = [_LOG_PAGE_HEAD, entry, '</body>\n', '</html>']
        elif mode == 'a':
            # Grow by one line, then rewrite the footer one position lower so
            # the new entry sits after the previous one.
            lines.append('</html>')
            lines[-3] = entry
            lines[-2] = '</body>\n'
        elif mode == 'r':
            # Overwrite the most recent entry and restate the footer.
            lines[-3] = entry
            lines[-2] = '</body>\n'
            lines[-1] = '</html>'

        with open(filename, 'w', encoding='utf-8') as file:
            file.writelines(lines)
    finally:
        # Release the file lock: close our handle first, then delete the file.
        os.close(lock_fd)
        os.remove(lock_path)

In [None]:
# Scratch check of negative indexing after growing a list by one element:
# index -2 then refers to what used to be the last element.
x = list('ab')
x += ['c']
x[len(x) - 2] = 'e'
x

In [None]:
import os
import shutil
import numpy as np
import pandas as pd
from datetime import datetime

from nets.nets_load_match import nets_load_match
from nets.nets_inverse_normal import nets_inverse_normal

from preproc.datenum import datenum
from preproc.days_in_year import days_in_year

from memmap.MemoryMappedDF import MemoryMappedDF

from logio.my_log import my_log
from logio.loading import ascii_loading_bar

# Path of the HTML log file that the my_log2 calls below write to
logfile = '/well/nichols/users/inf852/confounds/log.html'

# Update log: append the stage 1 start message as a new entry.
# NOTE(review): this calls the notebook-defined my_log2, not the my_log
# imported from logio above — presumably deliberate while testing; confirm.
my_log2(str(datetime.now()) +': Stage 1: Generating Initial Variables.', mode='a', filename=logfile)


In [None]:
# filename = '/well/nichols/users/inf852/confounds/log.html'
# mode='r'
# message=str(datetime.now()) +': Quartile normalisation complete.'
# with open(filename, 'r+', encoding='utf-8') as file:
#     lines = [line.rstrip() for line in file]

# with open(filename, 'w+', encoding='utf-8') as file:
#     if not lines:  # If the file is empty, create the HTML structure
#         print('here')
#     else:
#         if mode == 'a':
#             print('here2')
#         elif mode == 'r':
#             print(lines)
#             lines[-3] = f'<p>{message}</p>\n'
#             lines[-2] = '</body>\n'
#             lines[-1] = '</html>'

# print(lines)

In [None]:

# Update log: append a new entry (mode='a')
my_log2(str(datetime.now()) +': Loaded initial variables.', mode='a', filename=logfile)



In [None]:
# Append an "in progress" entry; the next cell overwrites it on completion
my_log2(str(datetime.now()) +': Performing quartile normalisation of IDPs...', mode='a', filename=logfile)
# works up to here

In [None]:

# Update log: replace the "in progress" entry (mode='r')
my_log2(str(datetime.now()) +': Quartile normalisation complete.', mode='r', filename=logfile)


In [None]:
# Append an "in progress" entry for the next step
my_log2(str(datetime.now()) +': Loading miscellaneous variables...', mode='a', filename=logfile)


In [None]:

# Update log: replace the "in progress" entry (mode='r')
my_log2(str(datetime.now()) +': Loaded miscellaneous variables and sorted.', mode='r', filename=logfile)


In [None]:
# Append an "in progress" entry for the save step
my_log2(str(datetime.now()) +': Saving results...', mode='a', filename=logfile)


In [None]:

# Update log: replace the "in progress" entry (mode='r')
my_log2(str(datetime.now()) +': Results saved.', mode='r', filename=logfile)


In [None]:
# Append the final stage 1 entry
my_log2(str(datetime.now()) +': Stage 1: Complete.', mode='a', filename=logfile)


