#### This script extracts the simulated data and fits PGS-based regression models to estimate indirect genetic effects

In [1]:
# set up 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# A function to read the .txt file, select columns based on the focal trait, and return a DataFrame.
# Column names in the data end in '1' or '2' to indicate the trait; that last character is used to filter.
def read_trait_data(file_path, focal_trait='both', combine_pgs=False):
    """
    Read a tab-separated trait file and keep the columns for the focal trait(s).
    
    Parameters:
    file_path (str): Path to the data file (read with sep='\\t')
    focal_trait (str or int): Which trait(s) to extract
                              - 'trait1', '1' or 1: Keep only columns whose name ends with '1'
                              - 'trait2', '2' or 2: Keep only columns whose name ends with '2'
                              - 'both' or 'all': Keep every column in the file
                              String values are matched case-insensitively.
    combine_pgs (bool): Whether to combine haplotypic PGS scores into full PGS scores
                        - True: Add NTp+Tp->PGSp, NTm+Tm->PGSm, Tp+Tm->PGSo for each
                          selected trait whose two source columns are both present
                        - False: Keep original columns only
    
    Returns:
    pandas.DataFrame or None: A copy of the selected columns (plus any combined PGS
                              columns appended at the end). None is returned, after
                              printing a message, when the file cannot be read or
                              focal_trait is not one of the accepted values.
    """
    
    # Read the data file; any failure is reported and signalled with None
    try:
        df = pd.read_csv(file_path, sep='\t')
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Error reading file: {e}")
        return None
    
    all_columns = df.columns.tolist()
    
    # Integer shorthand is mapped onto the string names first
    if focal_trait == 1:
        focal_trait = 'trait1'
    elif focal_trait == 2:
        focal_trait = 'trait2'
    
    # str() keeps unexpected non-string values (e.g. 3 or None) from raising on
    # .lower(); they fall through to the "Invalid focal_trait" branch instead.
    trait_key = str(focal_trait).lower()
    
    # Select columns and remember which trait suffixes are in play
    if trait_key in ('trait1', '1'):
        trait_suffixes = ('1',)
        selected_columns = [col for col in all_columns if col.endswith('1')]
        print(f"Selected trait 1 columns: {selected_columns}")
        
    elif trait_key in ('trait2', '2'):
        trait_suffixes = ('2',)
        selected_columns = [col for col in all_columns if col.endswith('2')]
        print(f"Selected trait 2 columns: {selected_columns}")
        
    elif trait_key in ('both', 'all'):
        # 'both' keeps every column, not only those ending in '1' or '2'
        trait_suffixes = ('1', '2')
        selected_columns = all_columns
        print(f"Selected all columns: {len(selected_columns)} columns")
        
    else:
        print(f"Invalid focal_trait: {focal_trait}. Use 'trait1', 'trait2', or 'both'")
        return None
    
    # Work on a copy so the combined columns never touch the frame read from disk
    result_df = df[selected_columns].copy()
    
    if combine_pgs:
        print("Combining haplotypic PGS scores into full PGS scores...")
        
        # (new column, first addend, second addend), listed in the order the
        # columns are appended: paternal, then maternal, then offspring.
        recipes = [
            (f'PGS{parent}{suffix}', f'NT{parent}{suffix}', f'T{parent}{suffix}')
            for parent in ('p', 'm')
            for suffix in trait_suffixes
        ]
        # Offspring PGS is the sum of the two transmitted parental scores
        recipes += [
            (f'PGSo{suffix}', f'Tp{suffix}', f'Tm{suffix}')
            for suffix in trait_suffixes
        ]
        
        created = []
        for new_col, first_col, second_col in recipes:
            # Skip silently when either source column is absent
            if first_col in result_df.columns and second_col in result_df.columns:
                result_df[new_col] = result_df[first_col] + result_df[second_col]
                created.append(new_col)
                print(f"  Created {new_col} = {first_col} + {second_col}")
        
        if created:
            print(f"Successfully created {len(created)} combined PGS columns")
        else:
            print("No PGS columns were combined (missing required T/NT columns)")
    
    return result_df

# Example usage function to demonstrate how to use read_trait_data
def load_data_example(file_path=None):
    """
    Example function showing how to use read_trait_data with different parameters.
    
    Loads the same file four times (trait 1 only, trait 2 only, both traits, and
    both traits with combined PGS) and prints the shape and first three rows of
    each result that loaded successfully.
    
    Parameters:
    file_path (str): Path to the data file to load. If None, the original
                     example path used by this notebook is used.
    
    Returns:
    tuple: (trait1_data, trait2_data, both_traits_data, combined_pgs_data);
           each element is whatever read_trait_data returned for that call
           (a DataFrame, or None when loading failed).
    """
    # Fall back to the original example path (adjust as needed)
    if file_path is None:
        file_path = "/Users/xuly4739/Library/CloudStorage/OneDrive-UCB-O365/Documents/coding/PyProject/StatRev_IndirectGene/Data/phenoVT_geneticAM/nfam8000/phenoVT_geneticAM_run_001_nfam8000.txt"
    
    print("=== Loading trait 1 only ===")
    trait1_data = read_trait_data(file_path, focal_trait='trait1')
    if trait1_data is not None:
        print(f"Trait 1 data shape: {trait1_data.shape}")
        print(trait1_data.head(3))
    
    print("\n=== Loading trait 2 only ===")
    trait2_data = read_trait_data(file_path, focal_trait='trait2')
    if trait2_data is not None:
        print(f"Trait 2 data shape: {trait2_data.shape}")
        print(trait2_data.head(3))
    
    print("\n=== Loading both traits ===")
    both_traits_data = read_trait_data(file_path, focal_trait='both')
    if both_traits_data is not None:
        print(f"Both traits data shape: {both_traits_data.shape}")
        print(both_traits_data.head(3))
    
    print("\n=== Loading both traits with combined PGS ===")
    combined_pgs_data = read_trait_data(file_path, focal_trait='both', combine_pgs=True)
    if combined_pgs_data is not None:
        print(f"Combined PGS data shape: {combined_pgs_data.shape}")
        # The column list makes the newly appended PGS* columns visible
        print("Available columns:", combined_pgs_data.columns.tolist())
        print(combined_pgs_data.head(3))
    
    return trait1_data, trait2_data, both_traits_data, combined_pgs_data

# a function to loop through the directory and run a specified analysis (e.g. a regression) on each file's df
def run_analysis_on_directory(directory_path, analysis_function, 
                             focal_trait='both', combine_pgs=False, 
                             file_pattern='*.txt', save_results=True, 
                             output_dir=None, **kwargs):
    """
    Generic function to loop through files in a directory and run any analysis on each file.
    
    Files are processed in sorted path order so that repeated runs visit them
    in the same sequence. Each file is loaded with read_trait_data and handed to
    analysis_function; a failure on one file is reported and does not stop the loop.
    
    Parameters:
    directory_path (str): Path to the directory containing data files
    analysis_function (callable): Function to run analysis on each DataFrame
                                 Called as analysis_function(dataframe, filename=filename, **kwargs)
    focal_trait (str): Which trait(s) to extract ('trait1', 'trait2', or 'both')
    combine_pgs (bool): Whether to combine haplotypic PGS scores
    file_pattern (str): Glob pattern to match files (default: '*.txt')
    save_results (bool): Whether to save results to files. Only DataFrame and dict
                         results are written; other types produce a warning.
    output_dir (str): Directory to save results (if None, uses directory_path)
    **kwargs: Additional keyword arguments to pass to the analysis_function
    
    Returns:
    dict: Dictionary with filename (basename) as key and analysis results as value.
          Files that failed to load or raised during analysis are omitted.
          An empty dict is returned when no file matches the pattern.
    """
    import glob
    
    # glob returns matches in arbitrary order; sort for a reproducible run order
    file_pattern_full = os.path.join(directory_path, file_pattern)
    files = sorted(glob.glob(file_pattern_full))
    
    if not files:
        print(f"No files found matching pattern: {file_pattern_full}")
        return {}
    
    print(f"Found {len(files)} files to process")
    
    # Results go next to the inputs unless a separate directory is given
    if output_dir is None:
        output_dir = directory_path
    
    os.makedirs(output_dir, exist_ok=True)
    
    results = {}
    failed_files = []
    
    for i, file_path in enumerate(files, 1):
        filename = os.path.basename(file_path)
        print(f"\nProcessing file {i}/{len(files)}: {filename}")
        
        try:
            # Load data using our read_trait_data function
            data = read_trait_data(file_path, focal_trait=focal_trait, combine_pgs=combine_pgs)
            
            if data is None:
                print(f"  Failed to load data from {filename}")
                failed_files.append(filename)
                continue
            
            # Run analysis
            print(f"  Running analysis on {data.shape[0]} rows, {data.shape[1]} columns")
            analysis_result = analysis_function(data, filename=filename, **kwargs)
            
            # Store results (a None result is still recorded as processed)
            results[filename] = analysis_result
            
            # Save results if requested
            if save_results and analysis_result is not None:
                # Swap whatever extension the input had for .csv, so inputs
                # matched by a non-.txt pattern also get a .csv output name.
                stem = os.path.splitext(filename)[0]
                output_file = os.path.join(output_dir, f"analysis_results_{stem}.csv")
                
                # Handle different types of analysis results
                if isinstance(analysis_result, pd.DataFrame):
                    analysis_result.to_csv(output_file, index=False)
                    print(f"  Saved results to: {output_file}")
                elif isinstance(analysis_result, dict):
                    # A dict becomes a single-row table
                    pd.DataFrame([analysis_result]).to_csv(output_file, index=False)
                    print(f"  Saved results to: {output_file}")
                else:
                    print(f"  Warning: Results type {type(analysis_result)} not supported for saving")
            
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next file
            print(f"  Error processing {filename}: {str(e)}")
            failed_files.append(filename)
    
    # Summary
    print(f"\n=== Processing Summary ===")
    print(f"Total files processed: {len(files)}")
    print(f"Successfully processed: {len(results)}")
    print(f"Failed: {len(failed_files)}")
    
    if failed_files:
        print(f"Failed files: {failed_files}")
    
    return results

# Example analysis functions that can be used with run_analysis_on_directory
def basic_stats_analysis(data, filename=None, **kwargs):
    """
    Example analysis function: summarise a DataFrame with basic descriptive statistics.
    
    Parameters:
    data (pd.DataFrame): The data to analyze
    filename (str): Optional filename, echoed back in the result
    **kwargs: Additional parameters (accepted but not used)
    
    Returns:
    dict: 'filename', 'n_rows', 'n_columns' and 'columns', followed by
          '<col>_mean', '<col>_std', '<col>_min' and '<col>_max' for every
          numeric column, in column order
    """
    summary = {
        'filename': filename,
        'n_rows': len(data),
        'n_columns': len(data.columns),
        'columns': list(data.columns),
    }
    
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        # Nothing numeric to describe: only the shape information is reported
        return summary
    
    # One describe() call supplies all four statistics for every numeric column
    described = data[numeric_cols].describe()
    for col in numeric_cols:
        for stat_name in ('mean', 'std', 'min', 'max'):
            summary[f'{col}_{stat_name}'] = described.loc[stat_name, col]
    
    return summary

def custom_regression_analysis(data, filename=None, predictors=None, outcomes=None, 
                             multiple_regression=True, incremental_r2=False, **kwargs):
    """
    Generic regression analysis function that supports both simple and multiple regression
    
    Rows with a missing value in the outcome or in any predictor used by a model
    are dropped before fitting. Models are ordinary least squares fits
    (sklearn LinearRegression) and R² is computed on the same rows used to fit.
    
    Parameters:
    data (pd.DataFrame): The data to analyze
    filename (str): Optional filename, copied into every result row
    predictors (list or str): Predictor column name(s). If None, every column whose
                              name contains 'PGS' or 'T' is used (a plain substring
                              match, so e.g. 'NT...' columns are included too).
    outcomes (list or str): Outcome column name(s). If None, every column whose
                            name starts with 'Y' is used.
    multiple_regression (bool): If True and len(predictors)>1, run one multiple regression
                               per outcome using all requested predictors present in data.
                               Otherwise run a simple regression for each predictor-outcome pair.
    incremental_r2 (bool): If True (multiple regression only), fit nested models adding
                          predictors one at a time in the given order and report the
                          R² gained at each step.
    **kwargs: Additional parameters (accepted but not used)
    
    Returns:
    pd.DataFrame or None: One row per fitted model, or None when no model was fitted.
                          Outcomes/predictors missing from data are skipped silently,
                          as are models with too few complete rows (multiple: needs more
                          than n_predictors + 5 rows; simple: more than 10 rows).
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score
    import warnings
    
    def _complete_cases(predictor_cols, outcome_col):
        """Return (X, y) arrays restricted to rows without missing values."""
        X = data[predictor_cols].values
        y = data[outcome_col].values
        keep = ~(pd.isna(X).any(axis=1) | pd.isna(y))
        return X[keep], y[keep]
    
    def _fit(X, y):
        """Fit an OLS model and return (model, in-sample R²)."""
        fitted = LinearRegression()
        fitted.fit(X, y)
        return fitted, r2_score(y, fitted.predict(X))
    
    # Default predictors and outcomes if not specified
    if predictors is None:
        predictors = [col for col in data.columns if 'PGS' in col or 'T' in col]
    if outcomes is None:
        outcomes = [col for col in data.columns if col.startswith('Y')]
    
    # Ensure predictors and outcomes are lists
    if isinstance(predictors, str):
        predictors = [predictors]
    if isinstance(outcomes, str):
        outcomes = [outcomes]
    
    results_list = []
    
    # Silence warnings only while fitting; the caller's warning filters are
    # restored on exit instead of being overwritten for the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        if multiple_regression and len(predictors) > 1:
            print(f"  Running multiple regression with {len(predictors)} predictors")
            
            for outcome in outcomes:
                if outcome not in data.columns:
                    continue
                
                # Computed before the try so the error message below can always use it
                available_predictors = [p for p in predictors if p in data.columns]
                if not available_predictors:
                    continue
                
                try:
                    X_clean, y_clean = _complete_cases(available_predictors, outcome)
                    
                    # Need more samples than predictors (with a small margin)
                    if len(X_clean) <= len(available_predictors) + 5:
                        continue
                    
                    if incremental_r2:
                        print(f"    Calculating incremental R² for {len(available_predictors)} predictors")
                        
                        # Collected locally so that a failure part-way through
                        # contributes no partial rows for this outcome
                        incremental_results = []
                        previous_r2 = 0
                        
                        for i in range(1, len(available_predictors) + 1):
                            # Nested model using the first i predictors
                            current_predictors = available_predictors[:i]
                            model_current, current_r2 = _fit(X_clean[:, :i], y_clean)
                            incremental_r2_value = current_r2 - previous_r2
                            
                            incremental_result = {
                                'filename': filename,
                                'predictors_up_to': ', '.join(current_predictors),
                                'added_predictor': current_predictors[-1],
                                'outcome': outcome,
                                'n_samples': len(X_clean),
                                'n_predictors': i,
                                'intercept': model_current.intercept_,
                                'total_r2': current_r2,
                                'incremental_r2': incremental_r2_value,
                                'r2_change': incremental_r2_value,
                                'regression_type': 'incremental_multiple'
                            }
                            
                            # Coefficients of the current nested model
                            for j, pred in enumerate(current_predictors):
                                incremental_result[f'coef_{pred}'] = model_current.coef_[j]
                            
                            incremental_results.append(incremental_result)
                            previous_r2 = current_r2
                        
                        results_list.extend(incremental_results)
                    
                    else:
                        # Standard multiple regression (all predictors at once)
                        model, model_r2 = _fit(X_clean, y_clean)
                        
                        result = {
                            'filename': filename,
                            'predictors': ', '.join(available_predictors),
                            'outcome': outcome,
                            'n_samples': len(X_clean),
                            'n_predictors': len(available_predictors),
                            'intercept': model.intercept_,
                            'r2_score': model_r2,
                            'regression_type': 'multiple'
                        }
                        
                        # One coefficient column per predictor
                        for i, predictor in enumerate(available_predictors):
                            result[f'coef_{predictor}'] = model.coef_[i]
                        
                        results_list.append(result)
                
                except Exception as e:
                    print(f"  Error in multiple regression {available_predictors} -> {outcome}: {str(e)}")
        
        else:
            print(f"  Running simple regression for {len(predictors)} predictors")
            
            # One single-predictor model per predictor-outcome pair
            for predictor in predictors:
                for outcome in outcomes:
                    if predictor not in data.columns or outcome not in data.columns:
                        continue
                    
                    try:
                        X_clean, y_clean = _complete_cases([predictor], outcome)
                        
                        # Minimum sample size
                        if len(X_clean) <= 10:
                            continue
                        
                        model, model_r2 = _fit(X_clean, y_clean)
                        
                        results_list.append({
                            'filename': filename,
                            'predictors': predictor,
                            'outcome': outcome,
                            'n_samples': len(X_clean),
                            'n_predictors': 1,
                            'coefficient': model.coef_[0],
                            'intercept': model.intercept_,
                            'r2_score': model_r2,
                            'regression_type': 'simple'
                        })
                    
                    except Exception as e:
                        print(f"  Error in simple regression {predictor} -> {outcome}: {str(e)}")
    
    return pd.DataFrame(results_list) if results_list else None

def correlation_analysis(data, filename=None, method='pearson', **kwargs):
    """
    Pairwise correlations between all numeric variables, in long format.
    
    Parameters:
    data (pd.DataFrame): The data to analyze
    filename (str): Optional filename, copied into every output row
    method (str): Correlation method passed to DataFrame.corr
                  ('pearson', 'spearman', 'kendall')
    **kwargs: Additional parameters (accepted but not used)
    
    Returns:
    pd.DataFrame: One row per unordered pair of numeric columns with the fields
                  'filename', 'variable1', 'variable2', 'correlation', 'method'.
                  If the numeric part of the data is empty, a single-row frame
                  with 'filename' and 'error' columns is returned instead.
    """
    
    numeric_data = data.select_dtypes(include=[np.number])
    
    if numeric_data.empty:
        return pd.DataFrame({'filename': [filename], 'error': ['No numeric columns found']})
    
    corr_matrix = numeric_data.corr(method=method)
    variables = list(corr_matrix.columns)
    
    # Pair each variable only with those that come after it, which walks the
    # upper triangle of the matrix and so lists every pair exactly once.
    pair_rows = [
        {
            'filename': filename,
            'variable1': first,
            'variable2': second,
            'correlation': corr_matrix.loc[first, second],
            'method': method,
        }
        for position, first in enumerate(variables)
        for second in variables[position + 1:]
    ]
    
    return pd.DataFrame(pair_rows)

In [2]:
# Example usage of the directory analysis function
def run_analysis_example(data_directory=None, output_directory=None):
    """
    Example showing how to run various analyses on all files in a directory
    
    Runs basic statistics and then the incremental R² regression on every
    '*.txt' file in data_directory via run_analysis_on_directory. The plain
    multiple-regression and correlation runs are kept below but commented out.
    
    Parameters:
    data_directory (str): Directory containing the data files. If None, the
                          original example directory of this notebook is used.
    output_directory (str): Directory for the saved results. If None, the
                            original example results directory is used.
    
    Returns:
    tuple: (stats_results, multiple_regression_results, incremental_r2_results).
           multiple_regression_results is None while that run is commented out.
    """
    
    # Specify the directory containing your data files
    if data_directory is None:
        data_directory = "/Users/xuly4739/Library/CloudStorage/OneDrive-UCB-O365/Documents/coding/PyProject/StatRev_IndirectGene/Data/phenoVT_geneticAM/nfam8000"
    
    # Output directory for results
    if output_directory is None:
        output_directory = "/Users/xuly4739/Library/CloudStorage/OneDrive-UCB-O365/Documents/coding/PyProject/StatRev_IndirectGene/Analysis/PGS-Regression/results/phenoVT_geneticAM/nfam8000"
    
    # NOTE(review): every run below saves into the same output directory, and
    # run_analysis_on_directory names its output files from the input filename
    # only, so a later run appears to overwrite the files saved by an earlier
    # one - confirm whether separate output directories are wanted.
    
    print("=== Running Basic Statistics on All Files ===")
    # Run basic statistics
    stats_results = run_analysis_on_directory(
        directory_path=data_directory,
        analysis_function=basic_stats_analysis,
        focal_trait='both',
        combine_pgs=True,
        file_pattern='*.txt',
        save_results=True,
        output_dir=output_directory
    )
    
    print(f"\nBasic statistics completed on {len(stats_results)} files")
    
    # Placeholder so the return statement stays valid while the
    # multiple-regression run below is commented out.
    multiple_regression_results = None
    
    # print("\n=== Running Custom Regression Analysis (Multiple Regression) ===")
    # # Run multiple regression with specified predictors and outcomes
    # multiple_regression_results = run_analysis_on_directory(
    #     directory_path=data_directory,
    #     analysis_function=custom_regression_analysis,
    #     focal_trait='both',
    #     combine_pgs=True,
    #     file_pattern='*.txt',
    #     save_results=True,
    #     output_dir=output_directory,
    #     # Additional parameters for the regression function
    #     predictors=['PGSo1', 'PGSp1', 'PGSm1'],  # Multiple predictors for multiple regression
    #     outcomes=['Yo1'],  # Single outcome
    #     multiple_regression=True  # Enable multiple regression
    # )
    
    # print(f"\nMultiple regression analysis completed on {len(multiple_regression_results)} files")
    
    print("\n=== Running Incremental R² Analysis ===")
    # Run incremental R² analysis to see contribution of each predictor
    incremental_r2_results = run_analysis_on_directory(
        directory_path=data_directory,
        analysis_function=custom_regression_analysis,
        focal_trait='both',
        combine_pgs=True,
        file_pattern='*.txt',
        save_results=True,
        output_dir=output_directory,
        # Additional parameters for the regression function
        predictors=['PGSo1', 'PGSp1', 'PGSm1'],  # Multiple predictors (order matters!)
        outcomes=['Yo1'],  # Single outcome
        multiple_regression=True,  # Enable multiple regression
        incremental_r2=True  # Enable incremental R² analysis
    )
    
    print(f"\nIncremental R² analysis completed on {len(incremental_r2_results)} files")
    
    # print("\n=== Running Correlation Analysis ===")
    # # Run correlation analysis
    # corr_results = run_analysis_on_directory(
    #     directory_path=data_directory,
    #     analysis_function=correlation_analysis,
    #     focal_trait='both',
    #     combine_pgs=False,  # Keep original columns for correlation
    #     file_pattern='*.txt',
    #     save_results=True,
    #     output_dir=output_directory,
    #     method='pearson'  # correlation method
    # )
    
    # print(f"\nCorrelation analysis completed on {len(corr_results)} files")
    
    return stats_results, multiple_regression_results, incremental_r2_results




In [3]:
def extract_key_results(analysis_results, key_columns=None, filename_pattern=None, 
                       pivot_columns=None, aggregate_func='first', 
                       include_metadata=True, sort_by=None):
    """
    Extract specified key results from run_analysis_on_directory output and consolidate into a DataFrame.
    
    Parameters:
    analysis_results (dict): Output from run_analysis_on_directory function
                           Dictionary with filename as key and analysis results as value
    key_columns (list): List of column names to extract from each analysis result
                       If None, will extract all numeric columns
                       Examples: ['r2_score', 'incremental_r2', 'total_r2']
    filename_pattern (str): Optional regex pattern to filter filenames
                           Example: r'run_(\d+)' to extract run numbers
    pivot_columns (list): Optional list of columns to pivot on
                         Creates separate columns for each unique value
                         Example: ['added_predictor'] creates columns for each predictor
    aggregate_func (str or callable): How to aggregate multiple rows per file
                                     Options: 'first', 'last', 'mean', 'max', 'min', 'sum'
    include_metadata (bool): Whether to include metadata columns like filename, n_samples
    sort_by (str or list): Column(s) to sort the final DataFrame by
    
    Returns:
    pd.DataFrame: Consolidated DataFrame with rows as data files and columns as extracted metrics
    """
    import re
    
    if not analysis_results:
        print("No analysis results provided")
        return pd.DataFrame()
    
    consolidated_data = []
    
    for filename, result in analysis_results.items():
        try:
            # Handle different types of results
            if result is None:
                continue
                
            # Convert to DataFrame if it's not already
            if isinstance(result, dict):
                result_df = pd.DataFrame([result])
            elif isinstance(result, pd.DataFrame):
                result_df = result.copy()
            else:
                print(f"Warning: Unsupported result type {type(result)} for {filename}")
                continue
            
            if result_df.empty:
                continue
            
            # Extract filename information
            base_info = {'filename': filename}
            
            # Extract run number or other info from filename using pattern
            if filename_pattern:
                match = re.search(filename_pattern, filename)
                if match:
                    if match.groups():
                        base_info['run_number'] = match.group(1)
                    else:
                        base_info['match'] = match.group(0)
            
            # Determine which columns to extract
            if key_columns is None:
                # Extract all numeric columns
                numeric_cols = result_df.select_dtypes(include=[np.number]).columns.tolist()
                extract_cols = numeric_cols
            else:
                # Use specified columns that exist in the result
                extract_cols = [col for col in key_columns if col in result_df.columns]
            
            # Include metadata columns if requested
            metadata_cols = []
            if include_metadata:
                possible_metadata = ['n_samples', 'n_predictors', 'outcome', 'predictors', 
                                   'regression_type', 'method', 'added_predictor', 
                                   'variable1', 'variable2']
                metadata_cols = [col for col in possible_metadata if col in result_df.columns]
            
            all_extract_cols = extract_cols + metadata_cols
            
            if not all_extract_cols:
                print(f"Warning: No columns to extract from {filename}")
                continue
            
            # Handle pivoting if requested
            if pivot_columns and any(col in result_df.columns for col in pivot_columns):
                print(f"  Pivoting data for {filename}")
                
                # Determine which pivot columns exist
                existing_pivot_cols = [col for col in pivot_columns if col in result_df.columns]
                
                # Create a pivot table for each combination of pivot columns
                if len(existing_pivot_cols) == 1:
                    pivot_col = existing_pivot_cols[0]
                    
                    # For each numeric column, create separate columns for each pivot value
                    pivoted_data = base_info.copy()
                    
                    for extract_col in extract_cols:
                        if extract_col in result_df.columns:
                            for pivot_value in result_df[pivot_col].unique():
                                mask = result_df[pivot_col] == pivot_value
                                subset = result_df[mask]
                                
                                if not subset.empty:
                                    # Aggregate if multiple rows
                                    if len(subset) > 1:
                                        if aggregate_func == 'first':
                                            value = subset[extract_col].iloc[0]
                                        elif aggregate_func == 'last':
                                            value = subset[extract_col].iloc[-1]
                                        elif aggregate_func == 'mean':
                                            value = subset[extract_col].mean()
                                        elif aggregate_func == 'max':
                                            value = subset[extract_col].max()
                                        elif aggregate_func == 'min':
                                            value = subset[extract_col].min()
                                        elif aggregate_func == 'sum':
                                            value = subset[extract_col].sum()
                                        elif callable(aggregate_func):
                                            value = aggregate_func(subset[extract_col])
                                        else:
                                            value = subset[extract_col].iloc[0]
                                    else:
                                        value = subset[extract_col].iloc[0]
                                    
                                    # Create column name
                                    col_name = f"{extract_col}_{pivot_value}"
                                    pivoted_data[col_name] = value
                    
                    # Add metadata (take first occurrence)
                    for meta_col in metadata_cols:
                        if meta_col not in pivot_columns and meta_col in result_df.columns:
                            pivoted_data[meta_col] = result_df[meta_col].iloc[0]
                    
                    consolidated_data.append(pivoted_data)
                
                else:
                    print(f"Warning: Multiple pivot columns not yet supported for {filename}")
                    continue
            
            else:
                # No pivoting - handle multiple rows by aggregating
                if len(result_df) > 1:
                    print(f"  Aggregating {len(result_df)} rows for {filename}")
                    
                    row_data = base_info.copy()
                    
                    # Aggregate numeric columns
                    for col in extract_cols:
                        if col in result_df.columns:
                            if aggregate_func == 'first':
                                row_data[col] = result_df[col].iloc[0]
                            elif aggregate_func == 'last':
                                row_data[col] = result_df[col].iloc[-1]
                            elif aggregate_func == 'mean':
                                row_data[col] = result_df[col].mean()
                            elif aggregate_func == 'max':
                                row_data[col] = result_df[col].max()
                            elif aggregate_func == 'min':
                                row_data[col] = result_df[col].min()
                            elif aggregate_func == 'sum':
                                row_data[col] = result_df[col].sum()
                            elif callable(aggregate_func):
                                row_data[col] = aggregate_func(result_df[col])
                            else:
                                row_data[col] = result_df[col].iloc[0]
                    
                    # Handle metadata columns
                    for col in metadata_cols:
                        if col in result_df.columns:
                            # For metadata, usually take first value or most common
                            if result_df[col].dtype == 'object':
                                row_data[col] = result_df[col].iloc[0]  # Take first for text
                            else:
                                row_data[col] = result_df[col].iloc[0]  # Take first for numbers
                    
                    consolidated_data.append(row_data)
                
                else:
                    # Single row - just extract the values
                    row_data = base_info.copy()
                    
                    for col in all_extract_cols:
                        if col in result_df.columns:
                            row_data[col] = result_df[col].iloc[0]
                    
                    consolidated_data.append(row_data)
        
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue
    
    # Create final DataFrame
    if not consolidated_data:
        print("No data was successfully extracted")
        return pd.DataFrame()
    
    final_df = pd.DataFrame(consolidated_data)
    
    # Sort if requested
    if sort_by and sort_by in final_df.columns:
        final_df = final_df.sort_values(sort_by).reset_index(drop=True)
    elif isinstance(sort_by, list):
        available_sort_cols = [col for col in sort_by if col in final_df.columns]
        if available_sort_cols:
            final_df = final_df.sort_values(available_sort_cols).reset_index(drop=True)
    
    print(f"Successfully extracted data from {len(consolidated_data)} files")
    print(f"Final DataFrame shape: {final_df.shape}")
    print(f"Columns: {list(final_df.columns)}")
    
    return final_df


def extract_r2_results(analysis_results, r2_types=None, filename_pattern=r'run_(\d+)', 
                      include_predictors=True, sort_by_run=True):
    """
    Specialized wrapper around extract_key_results for R² values.

    Parameters:
    analysis_results (dict): Output from run_analysis_on_directory function.
                    Only dict and pandas.DataFrame values are inspected during
                    auto-detection; anything else (including None) is ignored there.
    r2_types (list, tuple or str): R² column names to extract. A single name may be
                    given as a plain string.
                    If None, every string column label containing 'r2' or
                    'r_squared' (case-insensitive) is detected automatically.
                    Examples: ['r2_score', 'total_r2', 'incremental_r2']
    filename_pattern (str): Regex pattern to extract run numbers from filenames
    include_predictors (bool): Whether to also request the 'predictors',
                    'added_predictor' and 'outcome' columns
    sort_by_run (bool): Whether to sort by run number

    Returns:
    pd.DataFrame: Whatever extract_key_results returns for the requested columns
                  (runs as rows and R² values as columns)
    """

    if r2_types is None:
        # Auto-detect: collect R²-like column labels from every usable result.
        detected = set()
        for result in analysis_results.values():
            if isinstance(result, dict):
                # The dict keys are exactly the columns a one-row frame would get,
                # so there is no need to build a DataFrame just to read them.
                labels = list(result)
            elif isinstance(result, pd.DataFrame):
                labels = list(result.columns)
            else:
                continue

            # Non-string labels (e.g. integer column names) cannot name an R²
            # column; skipping them avoids an AttributeError on .lower().
            detected.update(
                label for label in labels
                if isinstance(label, str)
                and ('r2' in label.lower() or 'r_squared' in label.lower())
            )

        r2_types = sorted(detected)
        print(f"Auto-detected R² columns: {r2_types}")
    elif isinstance(r2_types, str):
        # A bare string would otherwise be iterated character by character.
        r2_types = [r2_types]
    else:
        # Accept any iterable (list, tuple, ...) and never mutate the caller's object.
        r2_types = list(r2_types)

    # Columns handed to the generic extractor: the R² columns, optionally followed
    # by the columns describing the regression itself.
    key_columns = list(r2_types)
    if include_predictors:
        key_columns.extend(['predictors', 'added_predictor', 'outcome'])

    # Incremental R² results are pivoted on the predictor that was added.
    needs_pivot = any('incremental' in str(col).lower() for col in r2_types)
    pivot_columns = ['added_predictor'] if needs_pivot else None

    return extract_key_results(
        analysis_results=analysis_results,
        key_columns=key_columns,
        filename_pattern=filename_pattern,
        pivot_columns=pivot_columns,
        aggregate_func='first',  # Take first occurrence
        include_metadata=True,
        sort_by='run_number' if sort_by_run else None
    )


# Example usage functions
def demo_extract_key_results(condition = None):
    """
    Run the trait-1 incremental R² regression for one simulation condition.

    Every ``*.txt`` file in the condition's ``nfam8000`` data folder is read with
    read_trait_data (both traits, combined PGS), regressed with
    custom_regression_analysis (Yo1 on PGSo1, PGSp1, PGSm1, incremental R² enabled)
    and the per-file results are condensed with extract_r2_results.

    Parameters:
    condition (str): Name of the condition sub-folder under the Data directory

    Returns:
    tuple: (dict mapping filename -> regression result, R² summary from extract_r2_results)
    """
    import glob

    print("=== Demo: Extract Key Results Function ===")

    # Folder holding the simulated data files of this condition
    input_dir = f"/Users/xuly4739/Library/CloudStorage/OneDrive-UCB-O365/Documents/coding/PyProject/StatRev_IndirectGene/Data/{condition}/nfam8000"

    print("1. Running incremental R² analysis on a few files...")

    # All matching files are processed (there is no cap on the number of files).
    results_by_file = {}
    for txt_path in glob.glob(os.path.join(input_dir, "*.txt")):
        filename = os.path.basename(txt_path)
        print(f"  Processing {filename}")

        try:
            trait_table = read_trait_data(txt_path, focal_trait='both', combine_pgs=True)
            if trait_table is None:
                # File could not be read - nothing to analyse
                continue
            results_by_file[filename] = custom_regression_analysis(
                trait_table,
                filename=filename,
                predictors=['PGSo1', 'PGSp1', 'PGSm1'],
                outcomes=['Yo1'],
                multiple_regression=True,
                incremental_r2=True
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next file
            print(f"    Error: {e}")

    print(f"\n2. Got results from {len(results_by_file)} files")

    # Condense the per-file regression output into one row per run
    print("\n3. Extracting R² values using specialized function...")
    r2_table = extract_r2_results(
        results_by_file,
        r2_types=['total_r2', 'incremental_r2'],
        filename_pattern=r'run_(\d+)',
        include_predictors=True,
        sort_by_run=True
    )

    print("\nR² Summary DataFrame:")
    print(r2_table)

    return results_by_file, r2_table

  """


In [4]:
# Trait-1 regressions: for each condition, analyse every data file with
# demo_extract_key_results and save the R² summary (element [1] of the returned
# tuple) as <condition>_trait1_r2_summary.csv in the results folder.
#
# Four earlier conditions (phenoVT_geneticAM, phenoVT_socialAM, phenoVT_phenoAM,
# socialVT_phenoAM) were handled the same way as test, test2, test3 and test4;
# they are currently disabled.
_trait1_results_dir = '/Users/xuly4739/Library/CloudStorage/OneDrive-UCB-O365/Documents/coding/PyProject/StatRev_IndirectGene/Analysis/PGS-Regression/results'

_trait1_conditions = (
    't1pheVT_t2socVT_uniphenoAM',
    # the conditions for univariate analysis
    '01_t1pheVTnoAM_t2socVTnoAM',
    '02_t1noVTpheAM_t2noVTnoAM',
)

_trait1_runs = []
for _cond in _trait1_conditions:
    _run = demo_extract_key_results(condition=_cond)
    # save the results to a csv file
    _run[1].to_csv(f'{_trait1_results_dir}/{_cond}_trait1_r2_summary.csv', index=False)
    _trait1_runs.append(_run)

# Keep the (raw results, summary) tuples under their original names
test5, test6, test7 = _trait1_runs

=== Demo: Extract Key Results Function ===
1. Running incremental R² analysis on a few files...
  Processing t1pheVT_t2socVT_uniphenoAM_run_834_nfam8000.txt
Selected all columns: 14 columns
Combining haplotypic PGS scores into full PGS scores...
  Created PGSp1 = NTp1 + Tp1
  Created PGSp2 = NTp2 + Tp2
  Created PGSm1 = NTm1 + Tm1
  Created PGSm2 = NTm2 + Tm2
  Created PGSo1 = Tp1 + Tm1
  Created PGSo2 = Tp2 + Tm2
Successfully created 6 combined PGS columns
  Running multiple regression with 3 predictors
    Calculating incremental R² for 3 predictors
  Processing t1pheVT_t2socVT_uniphenoAM_run_495_nfam8000.txt
Selected all columns: 14 columns
Combining haplotypic PGS scores into full PGS scores...
  Created PGSp1 = NTp1 + Tp1
  Created PGSp2 = NTp2 + Tp2
  Created PGSm1 = NTm1 + Tm1
  Created PGSm2 = NTm2 + Tm2
  Created PGSo1 = Tp1 + Tm1
  Created PGSo2 = Tp2 + Tm2
Successfully created 6 combined PGS columns
  Running multiple regression with 3 predictors
    Calculating incremental 

In [5]:
# Trait-1 results: read the saved per-condition R² summaries back from disk.
# The four earlier conditions (phenoVT_geneticAM, phenoVT_socialAM, phenoVT_phenoAM,
# socialVT_phenoAM) are disabled and therefore not part of this summary.
_trait1_results_dir = '/Users/xuly4739/Library/CloudStorage/OneDrive-UCB-O365/Documents/coding/PyProject/StatRev_IndirectGene/Analysis/PGS-Regression/results'

df_t1pheVT_t2socVT_uniphenoAM = pd.read_csv(f'{_trait1_results_dir}/t1pheVT_t2socVT_uniphenoAM_trait1_r2_summary.csv')
df_01_t1pheVTnoAM_t2socVTnoAM = pd.read_csv(f'{_trait1_results_dir}/01_t1pheVTnoAM_t2socVTnoAM_trait1_r2_summary.csv')
df_02_t1noVTpheAM_t2noVTnoAM = pd.read_csv(f'{_trait1_results_dir}/02_t1noVTpheAM_t2noVTnoAM_trait1_r2_summary.csv')

# Tag every table with its condition (the new column is appended at the end)
for _frame, _label in (
    (df_t1pheVT_t2socVT_uniphenoAM, 't1pheVT_t2socVT_uniphenoAM'),
    (df_01_t1pheVTnoAM_t2socVTnoAM, '01_t1pheVTnoAM_t2socVTnoAM'),
    (df_02_t1noVTpheAM_t2noVTnoAM, '02_t1noVTpheAM_t2noVTnoAM'),
):
    _frame['condition'] = _label

# Stack the three conditions into one table
df_combined = pd.concat(
    [df_t1pheVT_t2socVT_uniphenoAM, df_01_t1pheVTnoAM_t2socVTnoAM, df_02_t1noVTpheAM_t2noVTnoAM],
    ignore_index=True,
)

# R² columns to summarise per condition
# (count, mean, std, min, max, median and median absolute deviation)
target_columns = [
    'total_r2_PGSo1', 'total_r2_PGSp1', 'total_r2_PGSm1',
    'incremental_r2_PGSo1', 'incremental_r2_PGSp1', 'incremental_r2_PGSm1',
]

# Long-format summary: one row per (condition, variable) pair that has data
summary_list = []
for condition in df_combined['condition'].unique():
    condition_data = df_combined.loc[df_combined['condition'] == condition]
    for col in target_columns:
        values = condition_data[col].dropna()
        if values.empty:
            continue
        _center = values.median()
        summary_list.append(dict(
            condition=condition,
            variable=col,
            count=len(values),
            mean=values.mean(),
            std=values.std(),
            min=values.min(),
            max=values.max(),
            median=_center,
            mad=np.median(np.abs(values - _center)),
        ))

summary_stats = pd.DataFrame(summary_list).round(4)

# Write the summary statistics to a tab-separated file (the row index is written too)
summary_stats.to_csv(f'{_trait1_results_dir}/regression_trait1_r2_summary_stats_unionly.tsv', sep='\t')


In [6]:
# Example usage functions
def demo_extract_key_results2(condition = None):
    """
    Run the trait-2 incremental R² regression for one simulation condition.

    Every ``*.txt`` file in the condition's ``nfam8000`` data folder is read with
    read_trait_data (both traits, combined PGS), regressed with
    custom_regression_analysis (Yo2 on PGSo2, PGSp2, PGSm2, incremental R² enabled)
    and the per-file results are condensed with extract_r2_results.

    Parameters:
    condition (str): Name of the condition sub-folder under the Data directory

    Returns:
    tuple: (dict mapping filename -> regression result, R² summary from extract_r2_results)
    """
    import glob

    print("=== Demo: Extract Key Results Function ===")

    # Folder holding the simulated data files of this condition
    input_dir = f"/Users/xuly4739/Library/CloudStorage/OneDrive-UCB-O365/Documents/coding/PyProject/StatRev_IndirectGene/Data/{condition}/nfam8000"

    print("1. Running incremental R² analysis on a few files...")

    # All matching files are processed (there is no cap on the number of files).
    results_by_file = {}
    for txt_path in glob.glob(os.path.join(input_dir, "*.txt")):
        filename = os.path.basename(txt_path)
        print(f"  Processing {filename}")

        try:
            trait_table = read_trait_data(txt_path, focal_trait='both', combine_pgs=True)
            if trait_table is None:
                # File could not be read - nothing to analyse
                continue
            results_by_file[filename] = custom_regression_analysis(
                trait_table,
                filename=filename,
                predictors=['PGSo2', 'PGSp2', 'PGSm2'],
                outcomes=['Yo2'],
                multiple_regression=True,
                incremental_r2=True
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next file
            print(f"    Error: {e}")

    print(f"\n2. Got results from {len(results_by_file)} files")

    # Condense the per-file regression output into one row per run
    print("\n3. Extracting R² values using specialized function...")
    r2_table = extract_r2_results(
        results_by_file,
        r2_types=['total_r2', 'incremental_r2'],
        filename_pattern=r'run_(\d+)',
        include_predictors=True,
        sort_by_run=True
    )

    print("\nR² Summary DataFrame:")
    print(r2_table)

    return results_by_file, r2_table

# Trait-2 regressions: for each condition, analyse every data file with
# demo_extract_key_results2 and save the R² summary (element [1] of the returned
# tuple) as <condition>_trait2_r2_summary.csv in the results folder.
_trait2_results_dir = '/Users/xuly4739/Library/CloudStorage/OneDrive-UCB-O365/Documents/coding/PyProject/StatRev_IndirectGene/Analysis/PGS-Regression/results'

_trait2_conditions = (
    't1pheVT_t2socVT_uniphenoAM',
    # the conditions for univariate analysis
    '01_t1pheVTnoAM_t2socVTnoAM',
    '02_t1noVTpheAM_t2noVTnoAM',
)

_trait2_runs = []
for _cond in _trait2_conditions:
    _run = demo_extract_key_results2(condition=_cond)
    # save the results to a csv file
    _run[1].to_csv(f'{_trait2_results_dir}/{_cond}_trait2_r2_summary.csv', index=False)
    _trait2_runs.append(_run)

# Keep the (raw results, summary) tuples under their original names
test5_2, test6_2, test7_2 = _trait2_runs

=== Demo: Extract Key Results Function ===
1. Running incremental R² analysis on a few files...
  Processing t1pheVT_t2socVT_uniphenoAM_run_834_nfam8000.txt
Selected all columns: 14 columns
Combining haplotypic PGS scores into full PGS scores...
  Created PGSp1 = NTp1 + Tp1
  Created PGSp2 = NTp2 + Tp2
  Created PGSm1 = NTm1 + Tm1
  Created PGSm2 = NTm2 + Tm2
  Created PGSo1 = Tp1 + Tm1
  Created PGSo2 = Tp2 + Tm2
Successfully created 6 combined PGS columns
  Running multiple regression with 3 predictors
    Calculating incremental R² for 3 predictors
  Processing t1pheVT_t2socVT_uniphenoAM_run_495_nfam8000.txt
Selected all columns: 14 columns
Combining haplotypic PGS scores into full PGS scores...
  Created PGSp1 = NTp1 + Tp1
  Created PGSp2 = NTp2 + Tp2
  Created PGSm1 = NTm1 + Tm1
  Created PGSm2 = NTm2 + Tm2
  Created PGSo1 = Tp1 + Tm1
  Created PGSo2 = Tp2 + Tm2
Successfully created 6 combined PGS columns
  Running multiple regression with 3 predictors
    Calculating incremental 

In [7]:
# Trait-2 results: read back the saved summaries for the same (univariate)
# conditions that were used for trait 1.
_trait2_results_dir = '/Users/xuly4739/Library/CloudStorage/OneDrive-UCB-O365/Documents/coding/PyProject/StatRev_IndirectGene/Analysis/PGS-Regression/results'

df_t1pheVT_t2socVT_uniphenoAM_trait2 = pd.read_csv(f'{_trait2_results_dir}/t1pheVT_t2socVT_uniphenoAM_trait2_r2_summary.csv')
df_01_t1pheVTnoAM_t2socVTnoAM_trait2 = pd.read_csv(f'{_trait2_results_dir}/01_t1pheVTnoAM_t2socVTnoAM_trait2_r2_summary.csv')
df_02_t1noVTpheAM_t2noVTnoAM_trait2 = pd.read_csv(f'{_trait2_results_dir}/02_t1noVTpheAM_t2noVTnoAM_trait2_r2_summary.csv')

# Tag every table with its condition (the new column is appended at the end)
for _frame, _label in (
    (df_t1pheVT_t2socVT_uniphenoAM_trait2, 't1pheVT_t2socVT_uniphenoAM'),
    (df_01_t1pheVTnoAM_t2socVTnoAM_trait2, '01_t1pheVTnoAM_t2socVTnoAM'),
    (df_02_t1noVTpheAM_t2noVTnoAM_trait2, '02_t1noVTpheAM_t2noVTnoAM'),
):
    _frame['condition'] = _label

# Stack the three conditions into one table
df_combined_trait2 = pd.concat(
    [
        df_t1pheVT_t2socVT_uniphenoAM_trait2,
        df_01_t1pheVTnoAM_t2socVTnoAM_trait2,
        df_02_t1noVTpheAM_t2noVTnoAM_trait2,
    ],
    ignore_index=True,
)

# Trait-2 R² columns to summarise per condition
target_columns_trait2 = [
    'total_r2_PGSo2', 'total_r2_PGSp2', 'total_r2_PGSm2',
    'incremental_r2_PGSo2', 'incremental_r2_PGSp2', 'incremental_r2_PGSm2',
]

# Long-format summary: one row per (condition, variable) pair that has data
summary_list_trait2 = []
for condition in df_combined_trait2['condition'].unique():
    condition_data = df_combined_trait2.loc[df_combined_trait2['condition'] == condition]
    for col in target_columns_trait2:
        values = condition_data[col].dropna()
        if values.empty:
            continue
        _center = values.median()
        summary_list_trait2.append(dict(
            condition=condition,
            variable=col,
            count=len(values),
            mean=values.mean(),
            std=values.std(),
            min=values.min(),
            max=values.max(),
            median=_center,
            mad=np.median(np.abs(values - _center)),
        ))

summary_stats_trait2 = pd.DataFrame(summary_list_trait2).round(4)

# Write the summary statistics to a tab-separated file (the row index is written too)
summary_stats_trait2.to_csv(f'{_trait2_results_dir}/regression_trait2_r2_summary_stats_unionly.tsv', sep='\t')


#### Get estimates using Kong's approach

In [8]:
def run_pgs_analysis(condition, trait, analysis_type='full_pgs', output_suffix='', 
                    incremental_r2=True, save_results=True):
    """
    Unified function to run PGS analysis for both full PGS and Kong's haplotypic approaches.

    Every ``*.txt`` file in the condition's ``nfam8000`` data folder is read with
    read_trait_data, regressed with custom_regression_analysis and the per-file
    results are condensed with extract_r2_results.

    Parameters:
    condition (str): Data condition (e.g., 't1pheVT_t2socVT_uniphenoAM')
    trait (str or int): Trait to analyze: 'trait1' / '1' / 1 or 'trait2' / '2' / 2
                        (case-insensitive, matching read_trait_data's conventions)
    analysis_type (str): Type of analysis
                        - 'full_pgs': Full PGS approach (PGSo, PGSp, PGSm)
                        - 'kong': Kong's approach (NTm, NTp)
                        - 'kong_maternal': Kong's approach with maternal only (NTm)
                        - 'kong_paternal': Kong's approach with paternal only (NTp)
    output_suffix (str): Suffix to add to output filenames
    incremental_r2 (bool): Whether to calculate incremental R²
    save_results (bool): Whether to save results to CSV

    Returns:
    tuple: (raw_results_dict, summary_dataframe)

    Raises:
    ValueError: If trait or analysis_type is not one of the supported values
    """
    import glob

    print(f"=== Running {analysis_type} analysis for {trait} on {condition} ===")

    # Resolve the trait explicitly. Previously anything other than 'trait1'
    # (e.g. 1, 'Trait1' or a typo) silently fell through to trait 2.
    trait_key = str(trait).strip().lower()
    if trait_key in ('trait1', '1'):
        trait_num = '1'
    elif trait_key in ('trait2', '2'):
        trait_num = '2'
    else:
        raise ValueError(f"Unknown trait: {trait!r} (expected 'trait1' or 'trait2')")
    trait_label = f'trait{trait_num}'  # canonical name used in the output filename
    outcome = f'Yo{trait_num}'

    # analysis_type -> (predictor name stems, combine haplotypic PGS?, log label)
    predictor_specs = {
        'full_pgs': (('PGSo', 'PGSp', 'PGSm'), True, "full PGS predictors"),
        'kong': (('NTm', 'NTp'), False, "Kong's approach predictors"),
        'kong_maternal': (('NTm',), False, "Kong's maternal predictor"),
        'kong_paternal': (('NTp',), False, "Kong's paternal predictor"),
    }
    if not isinstance(analysis_type, str) or analysis_type not in predictor_specs:
        raise ValueError(f"Unknown analysis_type: {analysis_type}")
    stems, combine_pgs, label = predictor_specs[analysis_type]
    predictors = [f'{stem}{trait_num}' for stem in stems]
    print(f"  Using {label}: {predictors}")

    # Set up data directory
    data_directory = f"/Users/xuly4739/Library/CloudStorage/OneDrive-UCB-O365/Documents/coding/PyProject/StatRev_IndirectGene/Data/{condition}/nfam8000"

    # Run analysis on all files
    files = glob.glob(os.path.join(data_directory, "*.txt"))
    print(f"  Processing {len(files)} files...")

    demo_results = {}
    for file_path in files:
        filename = os.path.basename(file_path)

        try:
            data = read_trait_data(file_path, focal_trait='both', combine_pgs=combine_pgs)
            if data is None:
                # File could not be read - nothing to analyse
                continue
            demo_results[filename] = custom_regression_analysis(
                data,
                filename=filename,
                predictors=predictors,
                outcomes=[outcome],
                multiple_regression=True,
                incremental_r2=incremental_r2
            )
        except Exception as e:
            # Best effort: one bad file must not abort the whole batch
            print(f"    Error processing {filename}: {str(e)}")

    print(f"  Successfully processed {len(demo_results)} files")

    # Extract R² results
    print("  Extracting R² values...")
    r2_summary = extract_r2_results(
        demo_results,
        r2_types=['r2_score', 'total_r2', 'incremental_r2'],
        filename_pattern=r'run_(\d+)',
        include_predictors=True,
        sort_by_run=True
    )

    # Save results if requested
    if save_results:
        output_dir = '/Users/xuly4739/Library/CloudStorage/OneDrive-UCB-O365/Documents/coding/PyProject/StatRev_IndirectGene/Analysis/PGS-Regression/results'
        # Make sure the target folder exists so the finished batch is not lost
        # to a missing directory at the very last step.
        os.makedirs(output_dir, exist_ok=True)
        output_filename = f"{condition}_{analysis_type}_{trait_label}_r2_summary{output_suffix}.csv"
        output_path = os.path.join(output_dir, output_filename)
        r2_summary.to_csv(output_path, index=False)
        print(f"  Saved results to: {output_filename}")

    return demo_results, r2_summary


def create_analysis_summary(results_dict, condition_list, analysis_type, trait,
                            output_filename, target_r2_columns=None, output_dir=None):
    """
    Create summary statistics from multiple analysis results.

    For every condition and every selected R² column, the non-missing values
    are reduced to count / mean / std / min / max / median / MAD, the table is
    rounded to 4 decimals, written as a tab-separated file and returned.

    Parameters:
    results_dict (dict): Dictionary with condition as key and summary_df as value.
                         Conditions that are missing or map to None are skipped.
    condition_list (list): List of condition names (defines which entries are used)
    analysis_type (str): Type of analysis for labeling
    trait (str): Trait analyzed
    output_filename (str): Name of output TSV file
    target_r2_columns (list): Specific R² columns to summarize (if None, auto-detect
                              columns whose name contains 'r2_score', 'total_r2'
                              or 'incremental_r2')
    output_dir (str): Directory the TSV is written to. If None, the project's
                      default results directory is used. The directory is
                      created when it does not exist yet.

    Returns:
    pd.DataFrame: Summary statistics (empty DataFrame, with nothing written,
                  when no condition has results)
    """
    r2_keywords = ('r2_score', 'total_r2', 'incremental_r2')
    default_output_dir = '/Users/xuly4739/Library/CloudStorage/OneDrive-UCB-O365/Documents/coding/PyProject/StatRev_IndirectGene/Analysis/PGS-Regression/results'

    # Combine all results, tagging each row with the condition it came from
    combined_dfs = []
    for condition in condition_list:
        condition_df = results_dict.get(condition)
        if condition_df is None:
            continue
        tagged = condition_df.copy()  # copy so the caller's DataFrame is not mutated
        tagged['condition'] = condition
        combined_dfs.append(tagged)

    if not combined_dfs:
        print("No results to summarize")
        return pd.DataFrame()

    df_combined = pd.concat(combined_dfs, ignore_index=True)

    # Auto-detect R² columns if not specified (str() guards non-string labels)
    if target_r2_columns is None:
        target_r2_columns = [col for col in df_combined.columns
                             if any(keyword in str(col).lower() for keyword in r2_keywords)]

    # Find existing columns
    existing_columns = [col for col in target_r2_columns if col in df_combined.columns]
    print(f"Creating summary for columns: {existing_columns}")

    has_predictor_column = 'predictors' in df_combined.columns

    # Create summary statistics; sort=False keeps conditions in order of appearance
    summary_list = []
    for condition, condition_data in df_combined.groupby('condition', sort=False):
        # The recorded predictor label does not depend on the column, so look it up once
        recorded_predictors = (condition_data['predictors'].iloc[0]
                               if has_predictor_column else None)

        for col in existing_columns:
            values = condition_data[col].dropna()
            if len(values) == 0:
                continue

            # Determine predictor info from data, else from the column name
            if has_predictor_column:
                predictor_info = recorded_predictors
            elif any(pred in col for pred in ['PGS', 'NT']):
                predictor_info = col.split('_')[-1] if '_' in col else 'multiple'
            else:
                predictor_info = f"{analysis_type}_predictors"

            median_value = values.median()
            summary_list.append({
                'condition': condition,
                'analysis_type': analysis_type,
                'trait': trait,
                'variable': col,
                'predictor': predictor_info,
                'count': len(values),
                'mean': values.mean(),
                'std': values.std(),
                'min': values.min(),
                'max': values.max(),
                'median': median_value,
                # median absolute deviation around the median
                'mad': (values - median_value).abs().median()
            })

    summary_stats = pd.DataFrame(summary_list).round(4)

    # Save summary statistics
    if output_dir is None:
        output_dir = default_output_dir
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, output_filename)
    summary_stats.to_csv(output_path, sep='\t', index=False)
    print(f"Saved summary statistics to: {output_filename}")

    return summary_stats

In [9]:
# Drive the unified analysis function over every trait / model batch, then
# condense each batch into one per-condition R² summary table saved as TSV.

conditions = ['t1pheVT_t2socVT_uniphenoAM', '01_t1pheVTnoAM_t2socVTnoAM', '02_t1noVTpheAM_t2noVTnoAM']

# --- Batch 1: full PGS model, trait 1 ---
print("Running Full PGS analyses for trait 1...")
full_pgs_trait1_results = {}
for condition in conditions:
    # run_pgs_analysis returns (raw results, R² summary); only the summary is kept
    summary_df = run_pgs_analysis(condition, 'trait1', 'full_pgs')[1]
    full_pgs_trait1_results[condition] = summary_df

full_pgs_trait1_summary = create_analysis_summary(
    results_dict=full_pgs_trait1_results,
    condition_list=conditions,
    analysis_type='full_pgs',
    trait='trait1',
    output_filename='regression_trait1_full_pgs_r2_summary_stats.tsv',
)

# --- Batch 2: full PGS model, trait 2 ---
print("\nRunning Full PGS analyses for trait 2...")
full_pgs_trait2_results = {}
for condition in conditions:
    summary_df = run_pgs_analysis(condition, 'trait2', 'full_pgs')[1]
    full_pgs_trait2_results[condition] = summary_df

full_pgs_trait2_summary = create_analysis_summary(
    results_dict=full_pgs_trait2_results,
    condition_list=conditions,
    analysis_type='full_pgs',
    trait='trait2',
    output_filename='regression_trait2_full_pgs_r2_summary_stats.tsv',
)

# --- Batch 3: Kong's haplotypic model, trait 1 ---
print("\nRunning Kong's haplotypic analyses for trait 1...")
kong_trait1_results = {}
for condition in conditions:
    summary_df = run_pgs_analysis(condition, 'trait1', 'kong')[1]
    kong_trait1_results[condition] = summary_df

kong_trait1_summary = create_analysis_summary(
    results_dict=kong_trait1_results,
    condition_list=conditions,
    analysis_type='kong',
    trait='trait1',
    output_filename='regression_trait1_kong_r2_summary_stats.tsv',
)

# --- Batch 4: Kong's haplotypic model, trait 2 ---
print("\nRunning Kong's haplotypic analyses for trait 2...")
kong_trait2_results = {}
for condition in conditions:
    summary_df = run_pgs_analysis(condition, 'trait2', 'kong')[1]
    kong_trait2_results[condition] = summary_df

kong_trait2_summary = create_analysis_summary(
    results_dict=kong_trait2_results,
    condition_list=conditions,
    analysis_type='kong',
    trait='trait2',
    output_filename='regression_trait2_kong_r2_summary_stats.tsv',
)

# Closing report: list the TSV files requested above, in the order they were written
print("\n=== Analysis Complete ===")
print("Generated summary files:")
for written_file in (
    'regression_trait1_full_pgs_r2_summary_stats.tsv',
    'regression_trait2_full_pgs_r2_summary_stats.tsv',
    'regression_trait1_kong_r2_summary_stats.tsv',
    'regression_trait2_kong_r2_summary_stats.tsv',
):
    print(f"- {written_file}")

Running Full PGS analyses for trait 1...
=== Running full_pgs analysis for trait1 on t1pheVT_t2socVT_uniphenoAM ===
  Using full PGS predictors: ['PGSo1', 'PGSp1', 'PGSm1']
  Processing 999 files...
Selected all columns: 14 columns
Combining haplotypic PGS scores into full PGS scores...
  Created PGSp1 = NTp1 + Tp1
  Created PGSp2 = NTp2 + Tp2
  Created PGSm1 = NTm1 + Tm1
  Created PGSm2 = NTm2 + Tm2
  Created PGSo1 = Tp1 + Tm1
  Created PGSo2 = Tp2 + Tm2
Successfully created 6 combined PGS columns
  Running multiple regression with 3 predictors
    Calculating incremental R² for 3 predictors
Selected all columns: 14 columns
Combining haplotypic PGS scores into full PGS scores...
  Created PGSp1 = NTp1 + Tp1
  Created PGSp2 = NTp2 + Tp2
  Created PGSm1 = NTm1 + Tm1
  Created PGSm2 = NTm2 + Tm2
  Created PGSo1 = Tp1 + Tm1
  Created PGSo2 = Tp2 + Tm2
Successfully created 6 combined PGS columns
  Running multiple regression with 3 predictors
    Calculating incremental R² for 3 predictors

In [10]:
# Print each summary table beneath its banner, trait 1 first and then trait 2
for banner, summary_table in (
    ("=== Full PGS Analysis Results (Trait 1) ===", full_pgs_trait1_summary),
    ("\n=== Kong's Haplotypic Analysis Results (Trait 1) ===", kong_trait1_summary),
    ("\n=== Full PGS Analysis Results (Trait 2) ===", full_pgs_trait2_summary),
    ("\n=== Kong's Haplotypic Analysis Results (Trait 2) ===", kong_trait2_summary),
):
    print(banner)
    print(summary_table)

=== Full PGS Analysis Results (Trait 1) ===
                     condition analysis_type   trait              variable  \
0   t1pheVT_t2socVT_uniphenoAM      full_pgs  trait1        total_r2_PGSo1   
1   t1pheVT_t2socVT_uniphenoAM      full_pgs  trait1        total_r2_PGSp1   
2   t1pheVT_t2socVT_uniphenoAM      full_pgs  trait1        total_r2_PGSm1   
3   t1pheVT_t2socVT_uniphenoAM      full_pgs  trait1  incremental_r2_PGSo1   
4   t1pheVT_t2socVT_uniphenoAM      full_pgs  trait1  incremental_r2_PGSp1   
5   t1pheVT_t2socVT_uniphenoAM      full_pgs  trait1  incremental_r2_PGSm1   
6   01_t1pheVTnoAM_t2socVTnoAM      full_pgs  trait1        total_r2_PGSo1   
7   01_t1pheVTnoAM_t2socVTnoAM      full_pgs  trait1        total_r2_PGSp1   
8   01_t1pheVTnoAM_t2socVTnoAM      full_pgs  trait1        total_r2_PGSm1   
9   01_t1pheVTnoAM_t2socVTnoAM      full_pgs  trait1  incremental_r2_PGSo1   
10  01_t1pheVTnoAM_t2socVTnoAM      full_pgs  trait1  incremental_r2_PGSp1   
11  01_t1pheVTnoAM_t

In [11]:
# Example: single-parent variants of Kong's analysis, run on one condition only

example_conditions = ['t1pheVT_t2socVT_uniphenoAM']  # Example with one condition

# Maternal non-transmitted alleles only
print("=== Example: Kong's Maternal-only Analysis ===")
maternal_results = {}
for condition in example_conditions:
    # Second element of the returned pair is the R² summary DataFrame
    summary_df = run_pgs_analysis(
        condition, 'trait1', 'kong_maternal', output_suffix='_maternal_only'
    )[1]
    maternal_results[condition] = summary_df
    print(f"Maternal-only analysis for {condition}:")
    print(summary_df.head())

# Paternal non-transmitted alleles only
print("\n=== Example: Kong's Paternal-only Analysis ===")
paternal_results = {}
for condition in example_conditions:
    summary_df = run_pgs_analysis(
        condition, 'trait1', 'kong_paternal', output_suffix='_paternal_only'
    )[1]
    paternal_results[condition] = summary_df
    print(f"Paternal-only analysis for {condition}:")
    print(summary_df.head())

=== Example: Kong's Maternal-only Analysis ===
=== Running kong_maternal analysis for trait1 on t1pheVT_t2socVT_uniphenoAM ===
  Using Kong's maternal predictor: ['NTm1']
  Processing 999 files...
Selected all columns: 14 columns
  Running simple regression for 1 predictors
Selected all columns: 14 columns
  Running simple regression for 1 predictors
Selected all columns: 14 columns
  Running simple regression for 1 predictors
Selected all columns: 14 columns
  Running simple regression for 1 predictors
Selected all columns: 14 columns
  Running simple regression for 1 predictors
Selected all columns: 14 columns
  Running simple regression for 1 predictors
Selected all columns: 14 columns
  Running simple regression for 1 predictors
Selected all columns: 14 columns
  Running simple regression for 1 predictors
Selected all columns: 14 columns
  Running simple regression for 1 predictors
Selected all columns: 14 columns
  Running simple regression for 1 predictors
Selected all columns: 1

In [12]:
# Usage guide for the unified analysis function, followed by a small comparison

usage_guide = """
=== Usage Guide ===

The unified function `run_pgs_analysis()` can handle all your analysis scenarios:

1. Full PGS Analysis:
   run_pgs_analysis(condition, trait, 'full_pgs')
   - Uses PGSo, PGSp, PGSm as predictors

2. Kong's Complete Haplotypic Analysis:
   run_pgs_analysis(condition, trait, 'kong') 
   - Uses NTm, NTp as predictors

3. Kong's Maternal-only Analysis:
   run_pgs_analysis(condition, trait, 'kong_maternal')
   - Uses only NTm as predictor

4. Kong's Paternal-only Analysis:
   run_pgs_analysis(condition, trait, 'kong_paternal')
   - Uses only NTp as predictor

Parameters:
- condition: e.g., 't1pheVT_t2socVT_uniphenoAM'
- trait: 'trait1' or 'trait2'
- analysis_type: 'full_pgs', 'kong', 'kong_maternal', 'kong_paternal'
- incremental_r2: True/False (default: True)
- save_results: True/False (default: True)

The function automatically:
- Handles single or multiple predictors
- Calculates incremental R² when appropriate
- Saves results with descriptive filenames
- Returns both raw results and summary DataFrames

Use `create_analysis_summary()` to combine results across conditions.
"""
print(usage_guide)

# Side-by-side look at the two modelling approaches for trait 1
print("=== Quick Comparison Example ===")
condition = 't1pheVT_t2socVT_uniphenoAM'

# Nothing is written to disk here (save_results=False); keep only the R² summaries
full_pgs_result = run_pgs_analysis(condition, 'trait1', 'full_pgs', save_results=False)[1]
kong_result = run_pgs_analysis(condition, 'trait1', 'kong', save_results=False)[1]

# Mean across runs of the offspring-PGS and maternal-PGS R² columns
pgs_offspring_mean = full_pgs_result['total_r2_PGSo1'].mean()
pgs_maternal_mean = full_pgs_result['total_r2_PGSm1'].mean()
print(f"\nFull PGS R² range: {pgs_offspring_mean:.4f} to {pgs_maternal_mean:.4f}")

# Mean across runs of the maternal and paternal non-transmitted R² columns
kong_maternal_mean = kong_result['total_r2_NTm1'].mean()
kong_paternal_mean = kong_result['total_r2_NTp1'].mean()
print(f"Kong's R² range: {kong_maternal_mean:.4f} to {kong_paternal_mean:.4f}")


=== Usage Guide ===

The unified function `run_pgs_analysis()` can handle all your analysis scenarios:

1. Full PGS Analysis:
   run_pgs_analysis(condition, trait, 'full_pgs')
   - Uses PGSo, PGSp, PGSm as predictors

2. Kong's Complete Haplotypic Analysis:
   run_pgs_analysis(condition, trait, 'kong') 
   - Uses NTm, NTp as predictors

3. Kong's Maternal-only Analysis:
   run_pgs_analysis(condition, trait, 'kong_maternal')
   - Uses only NTm as predictor

4. Kong's Paternal-only Analysis:
   run_pgs_analysis(condition, trait, 'kong_paternal')
   - Uses only NTp as predictor

Parameters:
- condition: e.g., 't1pheVT_t2socVT_uniphenoAM'
- trait: 'trait1' or 'trait2'
- analysis_type: 'full_pgs', 'kong', 'kong_maternal', 'kong_paternal'
- incremental_r2: True/False (default: True)
- save_results: True/False (default: True)

The function automatically:
- Handles single or multiple predictors
- Calculates incremental R² when appropriate
- Saves results with descriptive filenames
- Returns b