In [1]:
# Cell 1: Import required libraries
import base64
from io import BytesIO
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import json
import pandas as pd
import os
import seaborn as sns

In [11]:
# Cell 2: Data extraction (updated)
def extract_valid_results(jrzip_path):
    """Load unique participant records from a JRZIP archive.

    Every archive member whose name ends in ``.txt`` is read line by line and
    each non-blank line is parsed as one JSON record. Records are
    de-duplicated on ``metadata.participant_id`` after the ID is stringified,
    stripped and lower-cased; the first record seen for an ID is kept. The
    stored record is not modified, so its ID keeps its original spelling.

    Args:
        jrzip_path: Archive location, passed straight to ``zipfile.ZipFile``.

    Returns:
        list: The parsed records, one per unique participant, in read order.

    Lines that cannot be decoded/parsed, or that lack the participant ID, are
    reported with ``print`` and skipped. Errors opening or reading the archive
    itself propagate to the caller.
    """
    seen_pids = set()
    valid_results = []

    with zipfile.ZipFile(jrzip_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            if not file_info.filename.endswith('.txt'):
                continue

            with zip_ref.open(file_info) as f:
                for line in f:
                    # Blank lines (e.g. a trailing newline) are not records;
                    # skip them instead of reporting them as invalid entries.
                    if not line.strip():
                        continue

                    try:
                        data = json.loads(line.decode('utf-8').strip())
                        pid = str(data['metadata']['participant_id']).strip().lower()  # Normalize IDs
                    except (ValueError, KeyError, TypeError) as e:
                        # ValueError covers both JSONDecodeError and
                        # UnicodeDecodeError; KeyError/TypeError cover records
                        # without a usable metadata.participant_id.
                        print(f"Invalid entry: {str(e)}")
                        continue

                    if pid in seen_pids:
                        print(f"Duplicate ID skipped: {pid} in {file_info.filename}")
                        continue

                    seen_pids.add(pid)
                    valid_results.append(data)

    print(f"Loaded {len(valid_results)} unique participants")
    return valid_results

In [15]:
# Cell 3: DataFrame creation functions (updated)
def create_participant_df(valid_results):
    """Build one metadata row per participant, indexed by ``participant_id``.

    Args:
        valid_results: Iterable of parsed records. Each record is expected to
            carry a ``metadata`` mapping with ``participant_id`` and
            ``timestamp``; filter parameters are read from
            ``metadata.parameters.filter_parameters`` and default to 0 when
            the key is absent.

    Returns:
        pandas.DataFrame: Columns ``timestamp``, ``filter_threshold``,
        ``preservation_factor``, ``noise_reduction``, ``has_vviq``,
        ``has_caps`` and ``has_training``. The frame has these columns even
        when no record is usable (the index is then empty).

    Records with a missing key, a non-numeric filter value or an unparseable
    timestamp are reported with ``print`` and left out.
    """
    columns = [
        'participant_id', 'timestamp', 'filter_threshold',
        'preservation_factor', 'noise_reduction',
        'has_vviq', 'has_caps', 'has_training',
    ]
    participant_info = []

    for result in valid_results:
        try:
            meta = result['metadata']
            params = meta.get('parameters', {}).get('filter_parameters', {})

            entry = {
                'participant_id': meta['participant_id'],
                'timestamp': pd.to_datetime(meta['timestamp'], errors='coerce'),
                'filter_threshold': float(params.get('threshold', 0)),
                'preservation_factor': float(params.get('preservation_factor', 0)),
                'noise_reduction': float(params.get('noise_reduction_factor', 0)),
                'has_vviq': 'vviq_data' in meta,
                'has_caps': 'caps_data' in meta,
                'has_training': 'training_phase' in result
            }

            # pd.isna also catches None, which pd.to_datetime passes through
            # unchanged and an identity check against pd.NaT would miss.
            if pd.isna(entry['timestamp']):
                print(f"Invalid timestamp for {entry['participant_id']}")
                continue

        except KeyError as e:
            print(f"Missing key {e} in participant metadata")
            continue
        except (TypeError, ValueError, AttributeError) as e:
            # e.g. a null/non-numeric filter value or a non-mapping section.
            print(f"Invalid participant metadata: {e}")
            continue

        participant_info.append(entry)

    # Passing the column list keeps set_index working when nothing was kept.
    df = pd.DataFrame(participant_info, columns=columns).set_index('participant_id')
    print(f"Participant DF: {df.shape[0]} entries")
    return df

def create_training_df(valid_results):
    """Summarise the training phase, one row per participant.

    Args:
        valid_results: Iterable of parsed records. The summary is read from
            ``training_phase.summary``; absent values default to 0 (or
            ``False`` for ``completed``), and a trailing ``%`` on
            ``accuracy_percentage`` is stripped before conversion.

    Returns:
        pandas.DataFrame: Indexed by ``participant_id`` with columns
        ``total_trials``, ``correct_trials``, ``accuracy``, ``mean_rt`` and
        ``completed``. When no row could be built, a column-less frame indexed
        by the participant IDs that could be read is returned instead.

    Records that cannot be converted are reported with ``print`` and skipped.
    """
    training_data = []
    known_pids = []

    for result in valid_results:
        # Resolved first so the error message never has to touch the record
        # again (the record itself may be what is malformed).
        pid = 'unknown'
        try:
            pid = result['metadata']['participant_id']
            known_pids.append(pid)

            training = result.get('training_phase', {})
            summary = training.get('summary', {})

            acc = str(summary.get('accuracy_percentage', '0%')).rstrip('%')

            entry = {
                'participant_id': pid,
                'total_trials': int(summary.get('total_trials', 0)),
                'correct_trials': int(summary.get('correct_trials', 0)),
                'accuracy': float(acc),
                'mean_rt': float(summary.get('mean_rt', 0)),
                'completed': bool(training.get('completed', False))
            }
        except (KeyError, TypeError, ValueError, AttributeError) as e:
            print(f"Invalid training data for {pid}: {str(e)}")
            continue

        training_data.append(entry)

    df = pd.DataFrame(training_data)
    if not df.empty:
        df = df.set_index('participant_id')
    else:
        df = pd.DataFrame(index=known_pids)
    print(f"Training DF: {len(df)} entries")
    return df

def create_vviq_df(valid_results):
    """Flatten VVIQ answers into a long-format frame, one row per rated item.

    Args:
        valid_results: Iterable of parsed records. Answers are read from
            ``metadata.vviq_data`` for the ``eyes_open`` and ``eyes_closed``
            conditions; ``metadata.vviq_data.attention_check_passed`` must
            hold one entry per condition.

    Returns:
        pandas.DataFrame: Indexed by ``participant_id`` (repeated once per
        item) with columns ``condition``, ``question_index``, ``item``,
        ``rating``, ``is_attention_check`` and ``attention_passed``. When no
        row could be built, a column-less frame indexed by the participant IDs
        that could be read is returned instead.

    A participant whose ID (stripped, lower-cased) was already seen is
    ignored. A participant whose VVIQ data raises while being read is reported
    with ``print`` and contributes no rows at all.
    """
    vviq_data = []
    seen_pids = set()
    known_pids = []

    for result in valid_results:
        pid = 'unknown'
        try:
            meta = result['metadata']
            pid = meta['participant_id']

            normalized = str(pid).strip().lower()
            if normalized in seen_pids:
                continue
            seen_pids.add(normalized)
            known_pids.append(pid)

            vviq_meta = meta['vviq_data']

            # Rows are staged per participant so a failure half-way through
            # does not leave a partial set of answers in the result.
            participant_rows = []
            for condition in ['eyes_open', 'eyes_closed']:
                responses = vviq_meta.get(condition, [])
                for response in responses:
                    q_idx = response['question_index']
                    items = response.get('responses', {})
                    for item, rating in items.items():
                        participant_rows.append({
                            'participant_id': pid,
                            'condition': condition,
                            'question_index': q_idx,
                            'item': item,
                            'rating': int(rating),
                            'is_attention_check': q_idx == 3 and item == 'item_5',
                            'attention_passed': vviq_meta['attention_check_passed'][condition]
                        })
        except (KeyError, TypeError, ValueError, AttributeError) as e:
            print(f"Skipping VVIQ for {pid}: {str(e)}")
            continue

        vviq_data.extend(participant_rows)

    df = pd.DataFrame(vviq_data)
    if not df.empty:
        # The index intentionally repeats: de-duplicating it would keep a
        # single item per participant and discard the rest of the answers.
        df = df.set_index('participant_id')
    else:
        df = pd.DataFrame(index=pd.Index(known_pids, name='participant_id'))
    print(f"VVIQ DF: {len(df)} items")
    return df


def create_caps_df(valid_results):
    """Flatten CAPS answers into a long-format frame, one row per question.

    Args:
        valid_results: Iterable of parsed records. Answers are read from
            ``metadata.caps_data``; each answer needs ``question_index`` and
            ``subscale`` and may carry a textual ``response``.

    Returns:
        pandas.DataFrame: Indexed by ``participant_id`` (repeated once per
        answer) with columns ``question_index``, ``subscale``, ``response``
        (1 for yes, 0 for no, missing otherwise) and ``raw_response`` (the
        lower-cased answer text). When no row could be built, a column-less
        frame indexed by the participant IDs that could be read is returned.

    A participant whose ID (stripped, lower-cased) was already seen is
    ignored. Answers that raise while being read are reported with ``print``
    and skipped individually.
    """
    caps_data = []
    seen_pids = set()
    known_pids = []

    for result in valid_results:
        try:
            participant_id = result['metadata']['participant_id']
            caps_responses = result['metadata'].get('caps_data', []) or []
        except (KeyError, TypeError, AttributeError) as e:
            print(f"Skipping CAPS for unknown: {str(e)}")
            continue

        pid = str(participant_id).strip().lower()
        if pid in seen_pids:
            continue
        seen_pids.add(pid)
        known_pids.append(participant_id)

        for response in caps_responses:
            try:
                # Convert yes/no to numeric.
                # NOTE(review): this is a substring test, so any answer text
                # containing "no" (e.g. "unknown") maps to 0 - confirm the
                # response vocabulary is limited to yes/no.
                raw_response = response.get('response', '').lower()
                numeric_response = 1 if 'yes' in raw_response else 0 if 'no' in raw_response else None

                caps_data.append({
                    'participant_id': participant_id,
                    'question_index': response['question_index'],
                    'subscale': response['subscale'],
                    'response': numeric_response,
                    'raw_response': raw_response  # Lower-cased text, kept for validation
                })
            except (KeyError, TypeError, AttributeError) as e:
                print(f"Skipping CAPS for {participant_id}: {str(e)}")
                continue

    df = pd.DataFrame(caps_data)
    if df.empty:
        return pd.DataFrame(index=pd.Index(known_pids, name='participant_id'))
    # The index intentionally repeats: de-duplicating it would keep a single
    # answer per participant and discard the rest of the questionnaire.
    return df.set_index('participant_id')



def create_evolution_df(valid_results):
    """Flatten ``evolution_summary`` into one row per recorded session entry.

    Args:
        valid_results: Iterable of parsed records. Each entry of
            ``evolution_summary`` needs ``session``, ``generation`` and
            ``selected_parents``; ``duration_ms`` defaults to 0 and ``type``
            to ``'unknown'``.

    Returns:
        pandas.DataFrame: Indexed by ``participant_id`` (repeated once per
        entry) with columns ``session``, ``generation``, ``n_selections``,
        ``duration`` and ``session_type``. When no row could be built, a
        column-less frame indexed by the participant IDs that could be read is
        returned instead.

    A participant whose data raises while being read is reported with
    ``print`` and contributes no rows at all.
    """
    evolution_data = []
    known_pids = []

    for result in valid_results:
        # Reset per record so the message never shows the previous
        # participant's ID (or fails on an unbound name for the first record).
        pid = 'unknown'
        try:
            pid = result['metadata']['participant_id']
            known_pids.append(pid)

            # Staged per participant so a malformed entry does not leave a
            # partial set of rows behind for someone reported as skipped.
            participant_rows = []
            for session in result['evolution_summary']:
                participant_rows.append({
                    'participant_id': pid,
                    'session': session['session'],
                    'generation': session['generation'],
                    'n_selections': len(session['selected_parents']),
                    'duration': float(session.get('duration_ms', 0)),
                    'session_type': session.get('type', 'unknown')
                })
        except (KeyError, TypeError, ValueError, AttributeError) as e:
            print(f"Skipping evolution data for {pid}: {str(e)}")
            continue

        evolution_data.extend(participant_rows)

    df = pd.DataFrame(evolution_data)
    if not df.empty:
        df = df.set_index('participant_id')
    else:
        df = pd.DataFrame(index=known_pids)
    print(f"Evolution DF: {len(df)} entries")
    return df

In [16]:
# Cell 4: Analysis functions (updated)
def analyze_attention_checks(vviq_df, min_checks=2):
    """Summarise attention-check results per participant.

    Args:
        vviq_df: Long-format VVIQ frame with one row per rated item, grouped
            on ``participant_id`` (index level or column). It must contain the
            boolean-like columns ``is_attention_check`` and
            ``attention_passed``; missing values count as ``False``.
        min_checks: Minimum number of attention-check items a participant
            must have been shown to be considered valid. Defaults to 2.

    Returns:
        pandas.DataFrame: One row per participant with
        ``total_checks`` (rows flagged as attention checks),
        ``passed_checks`` (flagged rows whose ``attention_passed`` is true),
        ``all_passed`` (``attention_passed`` is true on every row, i.e. for
        every condition present) and ``valid_participant``
        (``total_checks >= min_checks`` and ``all_passed``).
        An empty frame is returned when the input is empty or lacks the
        required columns.
    """
    if vviq_df.empty:
        print("⚠️ Empty VVIQ DataFrame")
        return pd.DataFrame()

    required_cols = ['attention_passed', 'is_attention_check']
    if not all(col in vviq_df.columns for col in required_cols):
        missing = set(required_cols) - set(vviq_df.columns)
        print(f"Missing columns: {missing}")
        return pd.DataFrame()

    # .eq(True) yields plain booleans and turns missing values into False;
    # arrays are used so a repeated participant index needs no alignment.
    is_check = vviq_df['is_attention_check'].eq(True).to_numpy()
    passed = vviq_df['attention_passed'].eq(True).to_numpy()
    flags = vviq_df.assign(
        _is_check=is_check,
        _passed=passed,
        _check_passed=is_check & passed,
    )

    analysis = (
        flags
        .groupby('participant_id', observed=True)
        .agg(
            total_checks=('_is_check', 'sum'),
            passed_checks=('_check_passed', 'sum'),
            # 'all' rather than 'first': attention_passed differs between
            # conditions, and 'first' would only look at one of them.
            all_passed=('_passed', 'all')
        )
        .assign(valid_participant=lambda x: (x['total_checks'] >= min_checks) & x['all_passed'])
    )

    print(f"Valid participants: {analysis['valid_participant'].sum()}/{len(analysis)}")
    return analysis

def analyze_training_performance(training_df):
    """Compute descriptive statistics for the training phase.

    Args:
        training_df: Frame with the columns ``accuracy``, ``mean_rt`` and
            ``completed`` (values convertible to float; booleans count as
            0/1).

    Returns:
        pandas.DataFrame: Rows ``accuracy``, ``mean_rt`` and ``completed``;
        columns ``mean``, ``std``, ``min``, ``max``, ``q25`` and ``q75``,
        rounded to two decimals. Quartiles are filled for ``accuracy`` only
        and only the mean is filled for ``completed``; the other cells are
        NaN. An empty frame is returned when the input is empty or lacks a
        required column.
    """
    if training_df.empty:
        print("⚠️ Empty Training DataFrame")
        return pd.DataFrame()

    required_cols = ['accuracy', 'mean_rt', 'completed']
    missing = set(required_cols) - set(training_df.columns)
    if missing:
        print(f"Missing columns: {missing}")
        return pd.DataFrame()

    accuracy = training_df['accuracy'].astype(float)
    mean_rt = training_df['mean_rt'].astype(float)
    completed = training_df['completed'].astype(float)
    nan = float('nan')

    # Built explicitly so the quartile columns carry real labels; aggregating
    # with anonymous lambdas leaves them unnamed and a later rename on the
    # column axis cannot reach them.
    stat_names = ['mean', 'std', 'min', 'max', 'q25', 'q75']
    rows = [
        [accuracy.mean(), accuracy.std(), accuracy.min(), accuracy.max(),
         accuracy.quantile(0.25), accuracy.quantile(0.75)],
        [mean_rt.mean(), mean_rt.std(), mean_rt.min(), mean_rt.max(), nan, nan],
        [completed.mean(), nan, nan, nan, nan, nan],
    ]

    return pd.DataFrame(rows, index=required_cols, columns=stat_names).round(2)

def validate_and_align_dataframes(dfs):
    """Align the per-topic frames to the participant index without losing rows.

    Args:
        dfs: Dict of frames indexed by participant ID. ``'participants'`` is
            required and defines the set of participants; ``'training'``,
            ``'vviq'``, ``'caps'`` and ``'evolution'`` are aligned to it when
            present.

    Returns:
        dict: The same dict object, updated in place. ``'participants'`` has
        duplicate index entries removed (first occurrence kept). Every aligned
        frame keeps all of its rows for known participants (so long-format
        frames may repeat an ID), gains one all-missing row for each
        participant without data, drops rows of unknown participants, and
        receives a boolean ``data_present`` column telling the two kinds of
        rows apart.
    """
    participants = dfs['participants']
    duplicates = participants.index.duplicated()
    if duplicates.any():
        print(f"⚠️ Removing {duplicates.sum()} duplicates from participants")
        participants = participants[~duplicates]
        dfs['participants'] = participants
    base_index = participants.index

    for name in ['training', 'vviq', 'caps', 'evolution']:
        if name not in dfs:
            continue
        original = dfs[name]

        if original.shape[1] == 0:
            aligned = pd.DataFrame(index=base_index)
        else:
            # A left join keeps every row of a repeated ID; reindex() would
            # raise on duplicate labels and de-duplicating first would throw
            # away all but one row per participant.
            aligned = pd.DataFrame(index=base_index).join(original, how='left')

        # Computed against the original index so placeholder rows read False.
        aligned['data_present'] = aligned.index.isin(original.index)
        dfs[name] = aligned

    return dfs

def validate_indices(dfs):
    """Align every frame to the unique participant index without losing rows.

    Args:
        dfs: Dict of frames indexed by participant ID. ``'participants'`` is
            required and defines the set of participants.

    Returns:
        dict: The same dict object, updated in place. ``'participants'`` has
        duplicate index entries removed (first occurrence kept). Every other
        frame keeps all of its rows for known participants (so long-format
        frames may repeat an ID), gains one all-missing row for each
        participant without data, drops rows of unknown participants, and
        receives a boolean ``exists`` column that is ``False`` only on those
        placeholder rows.
    """
    participants = dfs['participants']
    participants = participants[~participants.index.duplicated(keep='first')]
    dfs['participants'] = participants
    base_index = participants.index

    for name in list(dfs):
        if name == 'participants':
            continue
        original = dfs[name]

        if original.shape[1] == 0:
            aligned = pd.DataFrame(index=base_index)
        else:
            # A left join preserves repeated IDs. De-duplicating the index of
            # a long-format frame (one row per item/answer/session) would keep
            # a single row per participant and discard the rest.
            aligned = pd.DataFrame(index=base_index).join(original, how='left')

        # Existence flag for debugging; checked against the original index so
        # it is not trivially True after alignment.
        aligned['exists'] = aligned.index.isin(original.index)
        dfs[name] = aligned

    return dfs

In [17]:
# Cell 5: Final pipeline (updated)
def main_analysis_pipeline(jrzip_path):
    """Run extraction, frame construction, index validation and both reports.

    Args:
        jrzip_path: Archive location handed to ``extract_valid_results``.

    Returns:
        dict | None: ``{'data': <dict of frames>, 'reports': {'attention':
        ..., 'training': ...}}`` on success. If any step raises, the error
        message is printed and ``None`` is returned.
    """
    try:
        # Step 1: load the raw records.
        records = extract_valid_results(jrzip_path)

        # Step 2: one frame per topic, built in a fixed order.
        builders = (
            ('participants', create_participant_df),
            ('training', create_training_df),
            ('vviq', create_vviq_df),
            ('caps', create_caps_df),
            ('evolution', create_evolution_df),
        )
        frames = {}
        for label, build in builders:
            frames[label] = build(records)

        # Step 3: align every frame to the participant index.
        frames = validate_indices(frames)

        # Step 4: derived reports.
        reports = {}
        reports['attention'] = analyze_attention_checks(frames['vviq'])
        reports['training'] = analyze_training_performance(frames['training'])

        return {'data': frames, 'reports': reports}

    except Exception as e:
        print(f"Pipeline failed: {str(e)}")
        return None

In [None]:
# Cell 1: Import required libraries
import base64
from io import BytesIO
import zipfile
import json
import pandas as pd
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# Cell 2: Enhanced data extraction
def extract_valid_results(jrzip_path):
    """Extract schema-valid participant records from a JRZIP archive.

    Every archive member whose name ends in ``.txt`` is read line by line and
    each non-blank line is parsed as one JSON record. A record is kept when it
    is a JSON object that contains every required section: ``metadata``,
    ``parameters`` and ``training_phase`` must be objects holding the listed
    fields, while ``caps_data`` and ``sessions`` must be lists.

    Args:
        jrzip_path: Archive location, passed straight to ``zipfile.ZipFile``.

    Returns:
        list: The records that passed validation, in read order.

    The number of valid records is printed, followed by one line per distinct
    rejection reason so that an unexpectedly empty result can be diagnosed.
    """
    required_fields = {
        'metadata': ['participant_id', 'sona_id', 'timestamp', 'demographics'],
        'caps_data': list,
        'parameters': ['filter_parameters'],
        'training_phase': ['trials'],
        'sessions': list
    }

    def rejection_reason(data):
        """Return None for a valid record, otherwise a short explanation."""
        if not isinstance(data, dict):
            return "record is not a JSON object"
        for section, fields in required_fields.items():
            if section not in data:
                return f"missing section '{section}'"
            value = data[section]
            if isinstance(fields, list):
                # A list spec names the fields the section must contain.
                if not isinstance(value, dict):
                    return f"section '{section}' is not an object"
                for field in fields:
                    if field not in value:
                        return f"missing field '{section}.{field}'"
            elif not isinstance(value, fields):
                # A type spec (e.g. ``list``) constrains the section's type.
                return f"section '{section}' is not a {fields.__name__}"
        return None

    valid_results = []
    rejected = {}
    with zipfile.ZipFile(jrzip_path, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            if not file_name.endswith('.txt'):
                continue
            with zip_ref.open(file_name) as f:
                for line in f:
                    try:
                        decoded_line = line.decode('utf-8').strip()
                        if not decoded_line:
                            continue
                        data = json.loads(decoded_line)
                    except (json.JSONDecodeError, UnicodeDecodeError):
                        reason = "line is not valid UTF-8 JSON"
                    else:
                        reason = rejection_reason(data)

                    if reason is None:
                        valid_results.append(data)
                    else:
                        rejected[reason] = rejected.get(reason, 0) + 1

    print(f"Found {len(valid_results)} valid participant records")
    for reason, count in rejected.items():
        print(f"Rejected {count} record(s): {reason}")
    return valid_results

# Cell 3: Enhanced DataFrame creation
def create_participant_df(valid_results):
    """Create one flat metadata row per participant.

    Args:
        valid_results: Iterable of records that contain ``metadata`` (with
            ``participant_id``, ``sona_id``, ``timestamp``), ``parameters``
            and ``training_phase``.

    Returns:
        pandas.DataFrame: Columns ``participant_id``, ``sona_id``,
        ``timestamp``, ``age``, ``gender``, ``filter_threshold``,
        ``preservation_factor``, ``noise_reduction``, ``total_trials``,
        ``accuracy`` and ``mean_rt``, in record order with a default integer
        index. Demographic, filter and training-summary values that are absent
        from a record are left missing rather than aborting the whole frame.

    Raises:
        KeyError: If ``metadata`` or one of its three identifying fields,
            ``parameters`` or ``training_phase`` is absent from a record.
    """
    participant_data = []

    for result in valid_results:
        meta = result['metadata']
        # Only the presence of these containers is guaranteed by the record
        # validation in this cell; their inner keys are optional, so they are
        # read with .get() instead of indexing.
        demographics = meta.get('demographics') or {}
        filter_params = result['parameters'].get('filter_parameters') or {}
        summary = result['training_phase'].get('summary') or {}

        participant_data.append({
            'participant_id': meta['participant_id'],
            'sona_id': meta['sona_id'],
            'timestamp': meta['timestamp'],
            'age': demographics.get('age'),
            'gender': demographics.get('gender'),
            'filter_threshold': filter_params.get('threshold'),
            'preservation_factor': filter_params.get('preservation_factor'),
            'noise_reduction': filter_params.get('noise_reduction_factor'),
            'total_trials': summary.get('total_trials'),
            'accuracy': summary.get('accuracy_percentage'),
            'mean_rt': summary.get('mean_rt')
        })

    return pd.DataFrame(participant_data)

# Cell 4: Enhanced CAPS data processing
def create_caps_df(valid_results):
    """Flatten every CAPS answer into one row of a DataFrame.

    Each row carries the participant ID, the question index and text, and the
    response. When an answer has a truthy ``subscale`` entry, its
    ``distressing``, ``distracting`` and ``frequency`` ratings are added to
    the row as well.
    """
    def _flatten(participant, answer):
        # Base columns first, in the same order they appear in the frame.
        row = dict(
            participant_id=participant,
            question_index=answer['question_index'],
            question_text=answer['question_text'],
            response=answer['response'],
        )
        ratings = answer['subscale']
        if ratings:
            extra = {
                'distressing': ratings['distressing'],
                'distracting': ratings['distracting'],
                'frequency': ratings['frequency'],
            }
            row.update(extra)
        return row

    rows = []
    for record in valid_results:
        participant = record['metadata']['participant_id']
        rows.extend(_flatten(participant, answer) for answer in record['caps_data'])

    return pd.DataFrame(rows)

# Cell 5: Enhanced trial processing
def create_trial_df(valid_results):
    """Collect training and session trials into one trial-level DataFrame.

    Training trials come from ``training_phase.trials``. Session trials come
    from ``sessions[*].generations[*].trials`` and are included only when they
    carry a ``participant_selection`` entry, from which the reaction time,
    selected stimulus number and correctness are taken.
    """
    def _row(participant, phase, trial, details, selected_key):
        # ``details`` is where the optional response fields live: the trial
        # itself for training, its participant_selection for sessions.
        return {
            'participant_id': participant,
            'phase': phase,
            'trial_number': trial['trial_number'],
            'target_index': trial['target_index'],
            'reaction_time': details.get('reaction_time_ms'),
            'selected_id': details.get(selected_key),
            'correct': details.get('correct'),
        }

    rows = []
    for record in valid_results:
        participant = record['metadata']['participant_id']

        # Training phase: every trial is kept.
        for trial in record['training_phase']['trials']:
            rows.append(_row(participant, 'training', trial, trial, 'selected_id'))

        # Sessions: only trials with a recorded selection are kept.
        for session in record['sessions']:
            for generation in session['generations']:
                for trial in generation['trials']:
                    if 'participant_selection' not in trial:
                        continue
                    selection = trial['participant_selection']
                    rows.append(_row(participant, 'session', trial, selection, 'stimulus_number'))

    return pd.DataFrame(rows)

# Cell 6: Image processing (unchanged but verified)
def process_image_data(valid_results):
    """Decode and show the stimuli of the first trial of the first participant.

    Only the first record is considered, and within it only the first session,
    that session's first generation and that generation's first trial. If that
    trial has ``displayed_stimuli``, each stimulus' ``image_data`` (the part
    after the first comma is base64-decoded) is opened as an image and shown.

    NOTE(review): a first trial without ``displayed_stimuli`` means nothing is
    shown at all; later trials are never inspected. Confirm this is the
    intended demo behaviour rather than "first trial that has stimuli".
    """
    nothing = object()  # sentinel: distinguishes "no element" from any value

    for record in valid_results[:1]:  # Just first participant for demo
        first_session = next(iter(record['sessions']), nothing)
        if first_session is nothing:
            continue

        first_generation = next(iter(first_session['generations']), nothing)
        if first_generation is nothing:
            continue

        first_trial = next(iter(first_generation['trials']), nothing)
        if first_trial is nothing or 'displayed_stimuli' not in first_trial:
            continue

        print(f"Processing trial {first_trial['trial_number']}")
        for stimulus in first_trial['displayed_stimuli']:
            encoded = stimulus['image_data'].split(",")[1]
            picture = Image.open(BytesIO(base64.b64decode(encoded)))
            plt.imshow(picture)
            plt.show()

# Usage example:
# Archive to analyse (path is relative to the notebook's working directory).
jrzip_path = "jatos_results_20250207155153.jrzip"
results = extract_valid_results(jrzip_path)

# One frame per view of the data, all built from the same validated records.
participant_df = create_participant_df(results)
caps_df = create_caps_df(results)
trial_df = create_trial_df(results)

# Preview the first rows of each frame under its heading.
print("\nParticipant Summary:", participant_df.head(), sep="\n")

print("\nCAPS Responses:", caps_df.head(), sep="\n")

print("\nTrial Data:", trial_df.head(), sep="\n")

Found 0 valid participant records

Participant Summary:
Empty DataFrame
Columns: []
Index: []

CAPS Responses:
Empty DataFrame
Columns: []
Index: []

Trial Data:
Empty DataFrame
Columns: []
Index: []
