# Purpose
This notebook summarizes the confidence output files produced by AlphaFold 3 (AlphaFold Server: https://alphafoldserver.com) and collects them into a single .csv file for downstream statistical analysis of all output models.

## Checklist before running
1) Install the required module (`pandas`) and run the import cell
2) Set the `parent_directory` variable to your input directory before running
3) Set the `output_csv_path` variable to your output file path before running

In [2]:
####Importing
import os
import json
import pandas as pd

In [None]:
# Root directory; it and every directory below it are searched for result files
parent_directory = '/Path/To/Your/Directory'  # Change this to your actual directory path. Note: Should be *DIRECTORY*, not file

# One dict per matching JSON file; each dict becomes one row of the summary table
summary_data = []

# Top-level JSON keys copied into every row (a missing key is stored as None)
json_fields = (
    'fraction_disordered',
    'has_clash',
    'num_recycles',
    'ptm',
    'ranking_score',
    'chain_iptm',
    'chain_pair_iptm',
    'chain_pair_pae_min',
    'chain_ptm',
)

# Walk the parent directory recursively
for subdir, _, files in os.walk(parent_directory):
    for file in files:
        # Skip everything that does not match the "*confidences_*.json" pattern
        if not file.endswith('.json') or 'confidences_' not in file:
            continue

        file_path = os.path.join(subdir, file)

        # Parse the JSON document; the file is closed as soon as it is loaded
        with open(file_path, 'r') as f:
            data = json.load(f)

        # Record the source location first, then the extracted values
        summary = {'subdir': subdir, 'file_name': file}
        summary.update((key, data.get(key)) for key in json_fields)
        summary_data.append(summary)

# Create a DataFrame from the collected summary data (one row per JSON file)
summary_df = pd.DataFrame(summary_data)

# Fail with a clear message when nothing was collected; otherwise the column
# accesses below would raise an opaque KeyError on the empty frame.
if summary_df.empty:
    raise FileNotFoundError(
        f"No '*confidences_*.json' files were found under {parent_directory!r}"
    )

# Ensure the output directory exists
output_csv_path = '/Output/File/Path/summary_confidences_A.csv'  # Change this to your desired output path
output_directory = os.path.dirname(output_csv_path)
# dirname() returns '' for a bare file name; os.makedirs('') would raise, and
# the current working directory does not need to be created.
if output_directory:
    os.makedirs(output_directory, exist_ok=True)

# 1. Clean the 'file_name' column to remove the "fold_" prefix and ".json" suffix.
#    regex=True is passed explicitly to both calls: since pandas 2.0 the default
#    is regex=False, under which '^fold_' is treated as literal text and the
#    prefix would never be removed.
summary_df['file_name'] = (
    summary_df['file_name']
    .str.replace(r'^fold_', '', regex=True)
    .str.replace(r'\.json$', '', regex=True)
)

# 2. Extract protein name from the 'file_name' column: everything before
#    "_summary_confidences_" (NaN when the name does not contain that marker)
summary_df['protein_name'] = summary_df['file_name'].str.extract(r'^(.*?)_summary_confidences_', expand=False)

# 3. Remove brackets from numerical columns (handles both single and double brackets)
def remove_brackets(value):
    """Collapse a (possibly nested) list to its first innermost element as a float.

    Non-list values are returned unchanged. For a list, the first element is
    followed repeatedly until a non-list value or an empty list is reached;
    that value is converted with ``float``. ``None`` is returned when the
    conversion fails (for example for an empty list or a non-numeric string).
    """
    # Scalars (numbers, strings, None, ...) pass straight through
    if not isinstance(value, list):
        return value

    # Descend along the first element: [[x, ...], ...] -> [x, ...] -> x
    innermost = value
    while isinstance(innermost, list) and innermost:
        innermost = innermost[0]

    try:
        return float(innermost)
    except (ValueError, TypeError):
        return None

# Columns that are passed through remove_brackets before export
columns_to_clean = [
    'fraction_disordered',
    'has_clash',
    'num_recycles',
    'ptm',
    'ranking_score',
    'chain_iptm',
    'chain_pair_iptm',
    'chain_pair_pae_min',
    'chain_ptm',
]

# Replace each listed column with its bracket-free version; existing columns
# keep their position in the frame
summary_df = summary_df.assign(
    **{name: summary_df[name].apply(remove_brackets) for name in columns_to_clean}
)

# Write the cleaned table to disk without the row index
summary_df.to_csv(output_csv_path, index=False)

# Preview the first rows of the cleaned table for verification
summary_df.head()