In [52]:
import pandas as pd

def load_and_prepare_behav(subj_num, behav_base_path):
    behav_path = f"{behav_base_path}/sub{subj_num:02d}_behav.tsv"
    behav_df = pd.read_csv(behav_path, sep='\t')
    
    # Create cumulative fmri_index per session (0 to 749)
    behav_df['fmri_index'] = None
    for session in behav_df['SESSION'].unique():
        idxs = behav_df[behav_df['SESSION'] == session].index
        behav_df.loc[idxs, 'fmri_index'] = range(len(idxs))
    behav_df['fmri_index'] = behav_df['fmri_index'].astype(int)
    
    print(f"Subject {subj_num}: Loaded behav with {len(behav_df)} rows")
    return behav_df


In [53]:
def get_shared_trials_by_phase(behav_df):
    """
    Filter for shared1000 images and classify trials into encoding/retrieval phases.
    
    Returns:
        - encoding_df: DataFrame for encoding trials
        - retrieval1_df: DataFrame for first retrieval
        - retrieval2_df: DataFrame for second retrieval (if present)
    """
    # Keep only images that have a repNum entry (i.e., from shared1000 set)
    shared_df = behav_df[behav_df["shared1000_repNum"].notna()].copy()
    shared_df["shared1000_repNum"] = shared_df["shared1000_repNum"].astype(int)

    # Sanity check: each image ID (73KID) should occur at least twice
    image_counts = shared_df["73KID"].value_counts()
    valid_ids = image_counts[image_counts != 0].index
    shared_df = shared_df[shared_df["73KID"].isin(valid_ids)]

    # Label trial phase
    phase_map = {1: "encoding", 2: "retrieval1", 3: "retrieval2"}
    shared_df["PHASE"] = shared_df["shared1000_repNum"].map(phase_map)

    # Now split by phase
    encoding_df = shared_df[shared_df["PHASE"] == "encoding"]
    retrieval1_df = shared_df[shared_df["PHASE"] == "retrieval1"]
    retrieval2_df = shared_df[shared_df["PHASE"] == "retrieval2"]

    return encoding_df, retrieval1_df, retrieval2_df


In [3]:
# def get_phase_maps_for_session(behav_df, session_num):
#     behav_sess = behav_df[behav_df['SESSION'] == session_num]
#     encoding_df, retrieval1_df, retrieval2_df = get_shared_trials_by_phase(behav_sess)

#     enc_kids = set(encoding_df['73KID'])
#     ret1_kids = set(retrieval1_df['73KID'])
#     ret2_kids = set(retrieval2_df['73KID'])
#     common_kids_sess = enc_kids & ret1_kids & ret2_kids

#     enc_map = dict(zip(encoding_df['73KID'], encoding_df['fmri_index']))
#     ret1_map = dict(zip(retrieval1_df['73KID'], retrieval1_df['fmri_index']))
#     ret2_map = dict(zip(retrieval2_df['73KID'], retrieval2_df['fmri_index']))

#     print(f"Session {session_num}: {len(common_kids_sess)} common trials across phases")
#     return common_kids_sess, enc_map, ret1_map, ret2_map


In [54]:
def get_global_phase_maps(behav_df):
    """
    Match encoding and retrieval trials for shared1000 images across all sessions.

    Returns:
        - List of dicts: one per image with phase-to-(session, index, button, presentation_type) mapping
    """
    shared_df = behav_df[behav_df["shared1000_repNum"].notna()].copy()
    shared_df["shared1000_repNum"] = shared_df["shared1000_repNum"].astype(int)

    # # Keep images with all 3 phases
    # counts = shared_df.groupby("73KID")["shared1000_repNum"].nunique()
    # valid_kids = counts[counts == 3].index
    # shared_df = shared_df[shared_df["73KID"].isin(valid_kids)]

    # Keep images that appear more than once (i.e., at least 2 phases)
    counts = shared_df.groupby("73KID")["shared1000_repNum"].nunique()
    valid_kids = counts[counts > 1].index


    # Phase mapping
    phase_map = {1: "enc", 2: "ret1", 3: "ret2"}
    shared_df["PHASE"] = shared_df["shared1000_repNum"].map(phase_map)

    image_trials = []
    for kid in valid_kids:
        rows = shared_df[shared_df["73KID"] == kid]
        phase_dict = {"73KID": kid}
        for _, row in rows.iterrows():
            phase = row["PHASE"]
            phase_dict[f"{phase}_session"] = row["SESSION"]
            phase_dict[f"{phase}_index"] = row["fmri_index"]
            phase_dict[f"{phase}_button"] = row["BUTTON"]
            phase_dict[f"{phase}_presentation_type"] = row["presentation_type"]
        image_trials.append(phase_dict)

    return image_trials


In [55]:
from scipy.stats import pearsonr

def run_subject_pipeline(subj_num, behav_base_path, h5_base_path, roi_map=None):
    print(f"\n--- Running pipeline for Subject {subj_num} ---")
    
    behav_df = load_and_prepare_behav(subj_num, behav_base_path)
    h5_path = f"{h5_base_path}/subj{subj_num:02d}_all_mtl_sessions.h5"
    
    similarity_results = []
    
    with h5py.File(h5_path, 'r') as f:
        # Get global matched trials across all sessions/phases
        global_phase_list = get_global_phase_maps(behav_df)

        # For convenience: list all sessions and rois in the file
        available_sessions = list(f.keys())
        print(f"Subject {subj_num}: Found {len(available_sessions)} sessions in HDF5")
        
        for trial in global_phase_list:
            kid = trial["73KID"]
            
            # For each phase, get session and index
            enc_sess = trial.get("enc_session")
            enc_idx = trial.get("enc_index")
            ret1_sess = trial.get("ret1_session")
            ret1_idx = trial.get("ret1_index")
            ret2_sess = trial.get("ret2_session")
            ret2_idx = trial.get("ret2_index")

            # Skip trials with missing session info
            if None in [enc_sess, ret1_sess, ret2_sess, enc_idx, ret1_idx, ret2_idx]:
                continue

            # Skip if any session not in HDF5 (just to be safe)
            if not all(str(s) in available_sessions for s in [f"session{enc_sess:02d}", f"session{ret1_sess:02d}", f"session{ret2_sess:02d}"]):
                continue
            
            # Iterate ROIs from encoding session (assuming same ROIs in all sessions)
            roi_names = list(f[f"session{enc_sess:02d}"].keys())

            for roi in roi_names:
                roi_name = roi.split("_", 1)[1] if roi.startswith("roi_") else roi

                try:
                    enc_beta = f[f"session{enc_sess:02d}"][roi][enc_idx, :]
                    ret1_beta = f[f"session{ret1_sess:02d}"][roi][ret1_idx, :]
                    ret2_beta = f[f"session{ret2_sess:02d}"][roi][ret2_idx, :]
                except Exception:
                    # If indexing fails, skip
                    continue
                
                # sim_enc_ret1 = cosine_sim(enc_beta, ret1_beta)
                # sim_enc_ret2 = cosine_sim(enc_beta, ret2_beta)
                # sim_ret1_ret2 = cosine_sim(ret1_beta, ret2_beta)

                # Skip if any vector has zero variance (Pearson correlation is undefined)
                if np.std(enc_beta) == 0 or np.std(ret1_beta) == 0 or np.std(ret2_beta) == 0:
                    continue
                
                sim_enc_ret1, _ = pearsonr(enc_beta, ret1_beta)
                sim_enc_ret2, _ = pearsonr(enc_beta, ret2_beta)
                sim_ret1_ret2, _ = pearsonr(ret1_beta, ret2_beta)


                similarity_results.append({
                    "SUBJECT": subj_num,
                    "73KID": kid,
                    "ROI": roi_map.get(roi_name, roi_name) if roi_map else roi_name,
                    "enc_session": enc_sess,
                    "ret1_session": ret1_sess,
                    "ret2_session": ret2_sess,
                    "enc_index": enc_idx,
                    "ret1_index": ret1_idx,
                    "ret2_index": ret2_idx,
                    "enc_ret1": sim_enc_ret1,
                    "enc_ret2": sim_enc_ret2,
                    "ret1_ret2": sim_ret1_ret2,
                    # Optional: add button and presentation_type from trial dict if you want
                    "enc_BUTTON": trial.get("enc_button"),
                    "ret1_BUTTON": trial.get("ret1_button"),
                    "ret2_BUTTON": trial.get("ret2_button"),
                    "enc_presentation_type": trial.get("enc_presentation_type"),
                    "ret1_presentation_type": trial.get("ret1_presentation_type"),
                    "ret2_presentation_type": trial.get("ret2_presentation_type"),
                })
    
    similarity_df = pd.DataFrame(similarity_results)
    print(f"Subject {subj_num}: Computed similarities for {len(similarity_df)} trials")

    # out_csv = f"{h5_base_path}/subj{subj_num:02d}_similarity_results.csv"
    # similarity_df.to_csv(out_csv, index=False)
    # print(f"Subject {subj_num}: Results saved to {out_csv}")
    
    return similarity_df


In [4]:
# import h5py
# from sklearn.metrics.pairwise import cosine_similarity

# def cosine_sim(vec1, vec2):
#     return cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]

# def run_subject_pipeline(subj_num, behav_base_path, h5_base_path):
#     print(f"\n--- Running pipeline for Subject {subj_num} ---")
    
#     behav_df = load_and_prepare_behav(subj_num, behav_base_path)
#     h5_path = f"{h5_base_path}/subj{subj_num:02d}_all_mtl_sessions.h5"
    
#     similarity_results = []
    
#     with h5py.File(h5_path, 'r') as f:
#         sessions = list(f.keys())
#         print(f"Subject {subj_num}: Found {len(sessions)} sessions in HDF5")
        
#         for session in sessions:
#             session_num = int(session[-2:])
            
#             common_kids, enc_map, ret1_map, ret2_map = get_phase_maps_for_session(behav_df, session_num)
#             roi_names = list(f[session].keys())
            
#             for roi in roi_names:
#                 roi_data = f[session][roi]
                
#                 for kid in common_kids:
#                     try:
#                         enc_idx = enc_map[kid]
#                         ret1_idx = ret1_map[kid]
#                         ret2_idx = ret2_map[kid]

#                         enc_beta = roi_data[enc_idx, :]
#                         ret1_beta = roi_data[ret1_idx, :]
#                         ret2_beta = roi_data[ret2_idx, :]

#                         sim_enc_ret1 = cosine_sim(enc_beta, ret1_beta)
#                         sim_enc_ret2 = cosine_sim(enc_beta, ret2_beta)
#                         sim_ret1_ret2 = cosine_sim(ret1_beta, ret2_beta)

#                         similarity_results.append({
#                             'SUBJECT': subj_num,
#                             'SESSION': session_num,
#                             '73KID': kid,
#                             'ROI': roi,
#                             'enc_ret1': sim_enc_ret1,
#                             'enc_ret2': sim_enc_ret2,
#                             'ret1_ret2': sim_ret1_ret2
#                         })
#                     except KeyError:
#                         continue

#     similarity_df = pd.DataFrame(similarity_results)
#     print(f"Subject {subj_num}: Computed similarities for {len(similarity_df)} trials")
    
#     # Save to CSV
#     out_csv = f"{h5_base_path}/subj{subj_num:02d}_similarity_results.csv"
#     similarity_df.to_csv(out_csv, index=False)
#     print(f"Subject {subj_num}: Results saved to {out_csv}")
    
#     return similarity_df


In [40]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(vec1, vec2):
    return cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]


In [57]:
import os
import numpy as np

behav_base_path = "/home/jovyan/cache/memoryNSD"
h5_base_path = "/home/jovyan/cache/memoryNSD"
# os.makedirs(behav_base_path, exist_ok=True)
# os.makedirs(h5_base_path, exist_ok=True)

# for subj in range(4,9):  # add all your subjects here
#     df = run_subject_pipeline(subj, behav_base_path, h5_base_path)
import pandas as pd

all_results = []

for subj in range(1, 9):
    df = run_subject_pipeline(subj, behav_base_path, h5_base_path)
    all_results.append(df)

combined_df = pd.concat(all_results, ignore_index=True)

combined_out_csv = f"{h5_base_path}/all1-3_subjects_similarity_results.csv"
combined_df.to_csv(combined_out_csv, index=False)
print(f"Combined results saved to {combined_out_csv}")



--- Running pipeline for Subject 1 ---
Subject 1: Loaded behav with 30000 rows
Subject 1: Found 40 sessions in HDF5
Subject 1: Computed similarities for 10000 trials

--- Running pipeline for Subject 2 ---
Subject 2: Loaded behav with 30000 rows
Subject 2: Found 40 sessions in HDF5
Subject 2: Computed similarities for 10000 trials

--- Running pipeline for Subject 3 ---
Subject 3: Loaded behav with 24000 rows
Subject 3: Found 32 sessions in HDF5
Subject 3: Computed similarities for 6140 trials

--- Running pipeline for Subject 4 ---
Subject 4: Loaded behav with 22500 rows
Subject 4: Found 30 sessions in HDF5
Subject 4: Computed similarities for 5150 trials

--- Running pipeline for Subject 5 ---
Subject 5: Loaded behav with 30000 rows
Subject 5: Found 40 sessions in HDF5
Subject 5: Computed similarities for 10000 trials

--- Running pipeline for Subject 6 ---
Subject 6: Loaded behav with 24000 rows
Subject 6: Found 32 sessions in HDF5
Subject 6: Computed similarities for 6140 trials



In [58]:
import pandas as pd

def update_roi_labels_in_file(csv_file, roi_map):
    print(f"Processing {csv_file} ...")
    df = pd.read_csv(csv_file)

    # Convert ROI column to string so it matches the mapping keys
    df['ROI'] = df['ROI'].astype(str).map(roi_map).fillna(df['ROI'])

    df.to_csv(csv_file, index=False)
    print(f"Updated ROI labels saved to {csv_file}")

# Your string-keyed ROI map
roi_map = {
    '1': 'ERC',
    '2': '35',
    '3': '36',
    '4': 'SUB',
    '5': 'CA1',
    '6': 'CA2',
    '7': 'CA3',
    '8': 'DG',
    '9': 'PHC',
    '10': 'HT'
}

combined_csv_path = "/home/jovyan/cache/memoryNSD/all1-3_subjects_similarity_results.csv"
update_roi_labels_in_file(combined_csv_path, roi_map)


Processing /home/jovyan/cache/memoryNSD/all1-3_subjects_similarity_results.csv ...
Updated ROI labels saved to /home/jovyan/cache/memoryNSD/all1-3_subjects_similarity_results.csv


In [59]:
import pandas as pd

# Load the CSV
csv_path = "/home/jovyan/cache/memoryNSD/all1-3_subjects_similarity_results.csv"
df = pd.read_csv(csv_path)

# Show the column names and a few rows
print("Column names:", df.columns.tolist())
print("\nSample rows:")
print(df.head())


Column names: ['SUBJECT', '73KID', 'ROI', 'enc_session', 'ret1_session', 'ret2_session', 'enc_index', 'ret1_index', 'ret2_index', 'enc_ret1', 'enc_ret2', 'ret1_ret2', 'enc_BUTTON', 'ret1_BUTTON', 'ret2_BUTTON', 'enc_presentation_type', 'ret1_presentation_type', 'ret2_presentation_type']

Sample rows:
   SUBJECT  73KID  ROI  enc_session  ret1_session  ret2_session  enc_index  \
0        1   2951  ERC            4            13            37        365   
1        1   2951   HT            4            13            37        365   
2        1   2951   35            4            13            37        365   
3        1   2951   36            4            13            37        365   
4        1   2951  SUB            4            13            37        365   

   ret1_index  ret2_index  enc_ret1  enc_ret2  ret1_ret2  enc_BUTTON  \
0         715         565  0.023881  0.008184   0.036706         1.0   
1         715         565 -0.040522 -0.044279  -0.124554         1.0   
2         715

In [60]:
unique_ids_per_subject = df.groupby("SUBJECT")["73KID"].nunique()
print("Unique image IDs per subject:")
print(unique_ids_per_subject)


Unique image IDs per subject:
SUBJECT
1    1000
2    1000
3     614
4     515
5    1000
6     614
7    1000
8     515
Name: 73KID, dtype: int64


In [36]:
import h5py
import os

def check_missing_sessions(h5_base_path, subjects=range(1, 9), expected_sessions=range(1, 41)):
    for subj in subjects:
        h5_path = os.path.join(h5_base_path, f"subj{subj:02d}_all_mtl_sessions.h5")
        if not os.path.exists(h5_path):
            print(f"Subject {subj}: HDF5 file not found.")
            continue

        with h5py.File(h5_path, "r") as f:
            available_sessions = set(f.keys())
            missing = [f"session{s}" for s in expected_sessions if f"session{s}" not in available_sessions]

            if missing:
                print(f"Subject {subj}: Missing {len(missing)} sessions -> {missing}")
            else:
                print(f"Subject {subj}: All sessions present ✅")

# Run the check
check_missing_sessions("/home/jovyan/cache/memoryNSD")


Subject 1: Missing 9 sessions -> ['session1', 'session2', 'session3', 'session4', 'session5', 'session6', 'session7', 'session8', 'session9']
Subject 2: Missing 9 sessions -> ['session1', 'session2', 'session3', 'session4', 'session5', 'session6', 'session7', 'session8', 'session9']
Subject 3: Missing 17 sessions -> ['session1', 'session2', 'session3', 'session4', 'session5', 'session6', 'session7', 'session8', 'session9', 'session33', 'session34', 'session35', 'session36', 'session37', 'session38', 'session39', 'session40']
Subject 4: Missing 19 sessions -> ['session1', 'session2', 'session3', 'session4', 'session5', 'session6', 'session7', 'session8', 'session9', 'session31', 'session32', 'session33', 'session34', 'session35', 'session36', 'session37', 'session38', 'session39', 'session40']
Subject 5: Missing 9 sessions -> ['session1', 'session2', 'session3', 'session4', 'session5', 'session6', 'session7', 'session8', 'session9']
Subject 6: Missing 17 sessions -> ['session1', 'sessio

In [None]:
#sub01.h5 - [sessions][roi1_10][n_trials][mv activation ~ N_Voxels]

In [None]:
#roi 1 = [n_trials] [0.2 0.5 0.9 .... (n_voxels)]

#[n_voxels - 1D] * [n_voxels - 1D]