In [242]:

import pickle
# Load the pickled shuffle results into `results`.
# NOTE(review): n_shuffle_offset, n_shuffle and sub_id are assigned here but are not
# referenced by any of the visible cells, and the path below is an f-string with no
# placeholders — presumably they were meant to be interpolated into the file name; confirm.
n_shuffle_offset=0
n_shuffle=20
sub_id='01'
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open(f"/path/to/shuffle.pkl",'rb') as f:
    results=pickle.load(f)

In [244]:
def parse_string(s):
    """
    Turn an underscore-separated specification string into a dictionary.

    The tokens are consumed left to right using three rules, tried in order:

    1. ``<key>_by_<value>`` (three tokens) stores ``value`` under ``key``.
    2. Any other pair ``<a>_<b>`` stores ``b`` under ``a``; ``b`` is converted
       to ``int`` when it is a valid integer literal.
    3. A single token left over at the end is stored under ``'mask_method'``.

    Note that rule 2 is purely positional: in a string such as
    ``'..._hrf_by_mean_6_runs_intersection'`` the pair ``6_runs`` is stored as
    key ``'6'`` with value ``'runs'``.

    Args:
        s (str): Input string, e.g.
            'centered_by_psc_recentered_by_psc_zscored_1_hrf_by_mean_6_runs_intersection'

    Returns:
        dict: Parsed parameters.
    """
    tokens = s.split('_')
    total = len(tokens)
    parsed = {}

    pos = 0
    while pos < total:
        remaining = total - pos

        # Rule 1: '<key>_by_<value>'
        if remaining >= 3 and tokens[pos + 1] == "by":
            parsed[tokens[pos]] = tokens[pos + 2]
            pos += 3
            continue

        # Rule 2: '<key>_<value>', integer-valued when possible
        if remaining >= 2:
            raw_value = tokens[pos + 1]
            try:
                parsed[tokens[pos]] = int(raw_value)
            except ValueError:
                parsed[tokens[pos]] = raw_value
            pos += 2
            continue

        # Rule 3: trailing unpaired token
        parsed['mask_method'] = tokens[pos]
        pos += 1

    return parsed

In [245]:
#create a dictionary of specifications and store the scores for computing p-values
def score_organizer(samples_list:list):
    """
    Regroup per-sample score dictionaries by specification.

    Every element of ``samples_list`` is a mapping from a specification key to
    a score. The first element seeds the output: each of its keys gets an entry
    whose ``'vanilla_data'`` is that first score and whose ``'mean'`` list
    starts empty. The score of every later element is appended to the
    ``'mean'`` list of the matching key.

    Args:
        samples_list (list): Sequence of ``{specification: score}`` mappings.

    Returns:
        dict: ``{specification: {'mean': [scores of samples 1..n],
        'vanilla_data': score of sample 0}}``.

    Raises:
        KeyError: If a later sample contains a specification key that the
            first sample did not have.
    """
    organized = {}
    for position, sample in enumerate(samples_list):
        is_first_sample = position == 0
        for spec_key, score in sample.items():
            if is_first_sample:
                # The first sample defines the set of specifications.
                organized[spec_key] = {'mean': [], 'vanilla_data': score}
            else:
                organized[spec_key]['mean'].append(score)
    return organized
# Regroup the unpickled `results` by specification.
# Assumes `results` is a list of {specification: score} dicts — TODO confirm against the pickle's producer.
spec_scores=score_organizer(results)

In [None]:
# Show the organized per-specification scores as this cell's output.
spec_scores

In [247]:
def parse_parameter_string(input_string):
    """
    Parse a specification string such as
    'centered_by_psc_recentered_by_psc_zscored_1_hrf_by_mean_6_runs_intersection'
    into a dictionary of named parameters.

    Recognized patterns (tokens are separated by '_'):

    * ``centered_by_<v>``   -> ``params['centered_by'] = v``
    * ``recentered_by_<v>`` -> ``params['recentered_by'] = v``
    * ``zscored_<int>``     -> ``params['zscored'] = "True"`` or ``"False"``
      (strings, not booleans), depending on whether the integer is non-zero
    * ``hrf_by_<v>``        -> ``params['hrf_by'] = v``
    * ``<n>_runs``          -> ``params['number_of_runs'] = "<n>-runs"``
    * the final token, when not consumed by a pattern above,
      -> ``params['mask_method']``

    Tokens that match nothing are skipped. A keyword without the tokens it
    needs (for example a string that ends in ``'centered'`` or ``'zscored'``,
    or that starts with ``'runs'``) is not treated as that keyword; it falls
    through to the last-token / skip handling instead of indexing outside the
    token list.

    Args:
        input_string (str): The input string to parse

    Returns:
        dict: Dictionary containing the parsed parameters

    Raises:
        ValueError: If the token following 'zscored' is not an integer literal.
    """
    params = {}
    parts = input_string.split('_')
    n_parts = len(parts)

    def is_by_triplet(index, keyword):
        # True when parts[index:index + 3] is '<keyword>', 'by', '<value>'.
        return (
            parts[index] == keyword
            and index + 2 < n_parts
            and parts[index + 1] == 'by'
        )

    i = 0
    while i < n_parts:
        token = parts[i]

        # Handle centered_by
        if is_by_triplet(i, 'centered'):
            params['centered_by'] = parts[i + 2]
            i += 3

        # Handle recentered_by
        elif is_by_triplet(i, 'recentered'):
            params['recentered_by'] = parts[i + 2]
            i += 3

        # Handle zscored (needs a following flag token)
        elif token == 'zscored' and i + 1 < n_parts:
            params['zscored'] = "True" if int(parts[i + 1]) else "False"
            i += 2

        # Handle hrf_by
        elif is_by_triplet(i, 'hrf'):
            params['hrf_by'] = parts[i + 2]
            i += 3

        # Handle runs: the count is the token just before 'runs'. Requiring
        # i > 0 stops parts[i - 1] from wrapping around to the last token.
        elif token == 'runs' and i > 0:
            params['number_of_runs'] = f"{parts[i - 1]}-runs"
            i += 1

        # Handle mask_method (last part)
        elif i == n_parts - 1:
            params['mask_method'] = token
            i += 1

        else:
            i += 1

    return params

In [None]:
# Example: show the parsed parameters for one specification string.
parse_parameter_string('centered_by_psc_recentered_by_psc_zscored_1_hrf_by_mean_6_runs_intersection')

In [249]:
# Compute a two-sided p-value for every sample of every analysis specification.
# For each specification, each score is ranked within that specification's own
# set of scores (all later samples plus the first sample, 'vanilla_data').
import numpy as np
from scipy import stats
# p_values: spec string -> {'all_samples': [p-value per score], 'vanilla_data': p-value of the first sample's score}
p_values={}
# org_spec_list: parsed parameter dicts with an added 'p-value' entry (the special-cased spec below is not included)
org_spec_list=[]
# NOTE(review): `percentiles` is created but never filled by the active code (only by commented-out lines).
percentiles={}
for spec in spec_scores.keys():
    # NOTE(review): this parse is only needed by the commented-out filter below;
    # spec_dict is parsed again near the end of the loop body before it is used.
    spec_dict=parse_parameter_string(spec)
    # if spec_dict["recentered_by"]=="psc" and spec_dict["centered_by"]!="off":
    #     continue
    p_values[spec]={'all_samples':[],'vanilla_data':None}
    # reference distribution: the 'mean' scores followed by the 'vanilla_data' score (last element)
    sample_scores=spec_scores[spec]['mean']+[spec_scores[spec]['vanilla_data']]
    sample_scores_nparray=np.array(sample_scores)
    # iterate through the sample scores
    for i in range(len(sample_scores)):
        # calculate the p-value
        # NOTE(review): this one specification is hard-coded to p=0 instead of being
        # computed; the reason is not visible here — TODO document or remove.
        if spec=='centered_by_psc_recentered_by_re-center_zscored_1_hrf_by_mean_6_runs_intersection':
            p_val=0
        else:
            # fraction of the distribution at or below this score (percentileofscore with its default `kind`)
            p_val_var=stats.percentileofscore(sample_scores_nparray,sample_scores[i])/100
            # two-sided: twice the smaller tail
            p_val=2*(min(p_val_var,1-p_val_var))
        p_values[spec]['all_samples']+=[p_val]
    if spec=='centered_by_psc_recentered_by_re-center_zscored_1_hrf_by_mean_6_runs_intersection':
        p_values[spec]['vanilla_data']=0
        # the special-cased spec is skipped here, so it never reaches org_spec_list
        continue
    percentile_ratio=(stats.percentileofscore(sample_scores_nparray,spec_scores[spec]['vanilla_data'])/100)

    # two-sided p-value of the vanilla (first-sample) score
    p_values[spec]['vanilla_data']=2*(min(percentile_ratio, 1 - percentile_ratio))
    # percentiles[spec]=percentile*100
    # p_values[spec]['vanilla_data']=fun(percentile)
    spec_dict=parse_parameter_string(spec)
    # p_val still holds the value from the last inner-loop iteration, i.e. the one for
    # sample_scores[-1], which is the vanilla_data score — so this equals
    # p_values[spec]['vanilla_data'], but only because vanilla_data is appended last.
    spec_dict['p-value']=p_val
    org_spec_list.append(spec_dict)
#p_values


In [None]:
import pandas as pd

# Display names for the table columns (parsed key -> column label).
column_labels={
    'centered_by': 'centering',
    'recentered_by': 're-centering',
    'zscored': 'zscoring_residuals',
    'hrf_by': 'hrf_method',
    'number_of_runs': 'runs_used',
    'significance': 'significant',
}

# One record per specification: its parsed parameters plus the vanilla-data p-value.
rows=[]
for spec_combs,spec_p_values in p_values.items():
    spec_dict=parse_parameter_string(spec_combs)
    spec_dict["significance"]=spec_p_values["vanilla_data"]
    rows.append(spec_dict)

df=pd.DataFrame(rows).rename(columns=column_labels)

# Turn the p-value column into a Yes/No flag.
# NOTE(review): the stored p-values are already two-sided (2*min tail), yet they are
# compared with 0.025 rather than 0.05 — confirm this threshold is intended.
df["significant"]=np.where(df['significant'] < 0.025, 'Yes', 'No')
df.head()

In [255]:
#vanilla-data p-value of every specification, in the iteration order of p_values
spec_scores_vanilla=[spec_p_values['vanilla_data'] for spec_p_values in p_values.values()]

#position (in that same unsorted order) of the reference specification;
#og_index is only assigned when that specification is present in p_values
for ind,spec in enumerate(p_values):
    if spec=='centered_by_psc_recentered_by_off_zscored_1_hrf_by_mean_6_runs_intersection':
        og_index=ind

#indices that arrange the specifications by ascending p-value
sorted_index=np.argsort(spec_scores_vanilla)


In [None]:
# Run this cell to build the decision-tree dataset (ds)
# rows=[]
# for spec_combs in p_values:
#     spec_dict=parse_parameter_string(spec_combs)
#     spec_dict["significance"]=p_values[spec_combs]["vanilla_data"]
#     rows.append(spec_dict)
# import pandas as pd
# df=pd.DataFrame(rows)
# df.rename(columns={'centered_by': 'centering', 'recentered_by': 're-centering', 'zscored': 'zscoring_residuals', 'hrf_by': 'hrf_method', 'number_of_runs': 'runs_used', 'significance': 'significant'}, inplace=True)
# # df['zscoring_residuals'] = df['zscoring_residuals'].astype(bool)
# df["significant"]=np.where(df['significant'] < 0.025, 'Yes', 'No')

In [None]:
# Count the specifications whose vanilla-data p-value is below 0.025.
(np.array(spec_scores_vanilla)<0.025).sum()

In [None]:
import matplotlib.pyplot as plt

# Specification curve: vanilla-data p-values sorted in ascending order, with the
# 0.025 threshold and a marker for the original specification.
plt.figure(figsize=(6,4))

# Main p-values line with increased linewidth
plt.plot(np.array(spec_scores_vanilla)[sorted_index],
         linewidth=3,  # Increased line thickness
         label='p-values')

# Significance level line with increased linewidth
plt.axhline(y=0.025, color='magenta', linestyle='--',
            linewidth=3,  # Increased line thickness
            label='0.025 significance level')

plt.xlabel('Specifications(sorted by p-values)', fontsize=20)
plt.ylabel('p-values', fontsize=20)

# The x-axis is the *sorted* order, whereas og_index is the position of the
# original specification in the unsorted list. Map it to its rank in
# sorted_index so the marker sits at the right place along the curve.
og_rank=int(np.flatnonzero(sorted_index==og_index)[0])

# Scatter plot with larger marker size.
# NOTE(review): the marker is kept on the x-axis (y=0) as before, not at the
# specification's own p-value — confirm that is the intended placement.
plt.scatter(og_rank, 0,
           c='r', s=150,  # Increased marker size
           marker='o', label='Original Specification')

plt.legend(fontsize=11, loc='upper left', bbox_to_anchor=(0, 0.95))
plt.box(on=True)

# Thicker tick marks and spines
plt.tick_params(axis='both', which='major', labelsize=20, width=3)  # Increased tick width

# Make the plot border (spines) thicker
for spine in plt.gca().spines.values():
    spine.set_linewidth(3)  # Thicker border

plt.tight_layout()
# NOTE(review): absolute, user-specific output path; consider a configurable directory.
plt.savefig('/home/satwick22/Documents/plots/spec_curv_fmri_6_4.pdf',
            dpi=300,
            bbox_inches='tight')