# Objective:

#### This notebook averages the pLDDT scores in every "_full_data_#.json" file found under a parent directory of AF3 server output. The summary also includes the name of each file (and therefore of the protein) alongside its average pLDDT.

## Checklist before running:
#### 1) Provide a parent directory whose subdirectories contain the "_full_data_#.json" files output by the AlphaFold3 server
#### 2) Make sure that pandas is installed; if it is not, run "pip install pandas openpyxl" from bash

## Note:
#### You can comment out the code that creates the .csv/.xlsx output if you do not need it
#### This only works for protein sequences *without* post-translational modifications (PTMs) such as glycosylation, citrullination, phosphorylation, SUMOylation, etc.

In [None]:
#Importing modules
import os
import json
import pandas as pd

In [None]:
# Function to process a single full data file
def process_full_data_file(file_path):
    """Summarise a single "_full_data_#.json" file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A dict with the keys "file_name" (base name of ``file_path``),
        "protein_length" (largest value in "token_res_ids") and "avg_plddt"
        (arithmetic mean of "atom_plddts"). ``None`` is returned, after
        printing a message, when the file cannot be read or parsed, is not a
        JSON object, or lacks either list.
    """
    try:
        # JSON is UTF-8 by specification, so do not rely on the platform's
        # default text encoding (which is not UTF-8 everywhere).
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except (OSError, ValueError, RecursionError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error processing file {file_path}: {e}")
        return None

    if not isinstance(data, dict):
        print(
            f"Error processing file {file_path}: "
            f"expected a JSON object, got {type(data).__name__}"
        )
        return None

    try:
        # Extract atom_plddts and compute average
        atom_plddts = data.get("atom_plddts", [])
        if not atom_plddts:
            print(f"No atom_plddts found in {file_path}")
            return None

        avg_plddt = sum(atom_plddts) / len(atom_plddts)

        # Extract token_res_ids and compute protein length
        token_res_ids = data.get("token_res_ids", [])
        if not token_res_ids:
            print(f"No token_res_ids found in {file_path}")
            return None

        # NOTE(review): the highest residue id is used as the length; this
        # presumably assumes a single chain numbered from 1 -- confirm for
        # multi-chain predictions.
        protein_length = max(token_res_ids)
    except (TypeError, ValueError, ArithmeticError) as e:
        # Malformed values (e.g. non-numeric entries) must not abort a batch
        # run; report the file and skip it.
        print(f"Error processing file {file_path}: {e}")
        return None

    return {
        "file_name": os.path.basename(file_path),
        "protein_length": protein_length,
        "avg_plddt": avg_plddt
    }

# Function to traverse directories and process all full data files
def process_all_full_data_files(parent_directory, output_csv_file):
    """Summarise every "_full_data_" JSON file under a directory tree.

    Walks ``parent_directory`` recursively, passes each file whose name ends
    in ".json" and contains "_full_data_" to ``process_full_data_file``, and
    writes the collected summaries to ``output_csv_file`` as CSV (one row per
    file, no index column). If no file yields a summary, nothing is written
    and a message is printed instead.

    Args:
        parent_directory: Root directory to search.
        output_csv_file: Destination passed to ``DataFrame.to_csv``. When it
            is a filesystem path, missing parent directories are created.
    """
    results = []

    for root, dirs, files in os.walk(parent_directory):
        # os.walk yields entries in arbitrary order; sorting `dirs` in place
        # (honoured by the default top-down walk) and the file names makes
        # the CSV row order reproducible between runs and machines.
        dirs.sort()
        for file in sorted(files):
            if file.endswith(".json") and "_full_data_" in file:
                file_path = os.path.join(root, file)
                result = process_full_data_file(file_path)
                if result is not None:
                    results.append(result)

    if not results:
        print("No valid full data files were found.")
        return

    # to_csv does not create missing directories; make them up front so a
    # long run is not lost at the final write. Non-path targets (e.g. open
    # buffers) are passed through untouched.
    if isinstance(output_csv_file, (str, os.PathLike)):
        output_dir = os.path.dirname(output_csv_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Convert results to DataFrame and save to CSV
    df = pd.DataFrame(results)
    df.to_csv(output_csv_file, index=False)
    print(f"Results saved to {output_csv_file}")

# Example usage -- edit both paths below before running this cell.
# Top-level directory that holds the AF3 server output directories.
parent_directory = "Parent/Directory/File/Path"
# Where the CSV summary should be written.
output_csv_file = "Output/CSV/File/Path/average_plddt_AF3.csv"
process_all_full_data_files(
    parent_directory=parent_directory,
    output_csv_file=output_csv_file,
)