# create PeptoneDB-Integrative metadata
The integrative dataset is manually curated. This notebook simply gathers the available information into a single, convenient CSV file and computes the G-scores.

In [None]:
import json
import os.path
from glob import glob

import numpy as np
import pandas as pd
from tqdm import tqdm

from peptonebench import nmrcs
from peptonebench.trizod_gscores import compute_gscores

# Absolute path of the PeptoneDB-Integrative dataset directory. The relative
# path is resolved against the current working directory, so the notebook is
# expected to be run from its own folder.
DATA_PATH = os.path.abspath(os.path.join("..", "datasets", "PeptoneDB-Integrative"))

In [None]:
def _entry_labels(filename):
    """Return the sorted names of the entry directories in DATA_PATH that contain `filename`."""
    return sorted(os.path.basename(os.path.dirname(f)) for f in glob(f"{DATA_PATH}/*/{filename}"))


def _read_fasta_sequence(path, label):
    """Read the single sequence stored in the FASTA file at `path`.

    Sequence lines are concatenated, so a sequence wrapped over several lines
    is read in full, and blank lines are ignored. Any header line must be
    equal to `label`.

    Raises:
        AssertionError: if a header does not match `label`, if a second record
            starts after sequence lines were read, or if no sequence is found.
    """
    chunks = []
    with open(path) as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # a blank (e.g. trailing) line must not overwrite the sequence
                continue
            if line.startswith(">"):
                assert line[1:].strip() == label, "fasta header should match label"
                assert not chunks, f"{path} should contain a single fasta record"
            else:
                chunks.append(line)
    assert chunks, f"{path} does not contain any sequence"
    return "".join(chunks)


# Every sub-directory of DATA_PATH is one entry, named after its label.
labels = _entry_labels("info.csv")
assert labels == _entry_labels("CS.dat"), (
    "we assume all entries have a info.csv and a CS.dat files"
)

gscores = {}  # label -> G-scores returned by compute_gscores
sequences = {}  # label -> sequence read from seq.fasta
for label in tqdm(labels):
    sequences[label] = _read_fasta_sequence(f"{DATA_PATH}/{label}/seq.fasta", label)
    # info.csv is indexed by its "Experiment" column; the "CS" row holds the
    # conditions passed to compute_gscores together with the chemical shifts
    info_df = pd.read_csv(f"{DATA_PATH}/{label}/info.csv", index_col="Experiment")
    cs = nmrcs.experimental_cs_from_label(label)
    gscores[label] = compute_gscores(
        cs=cs,
        sequence=sequences[label],
        temperature=info_df.loc["CS", "Temp(K)"],
        pH=info_df.loc["CS", "pH"],
        ionic_strength=info_df.loc["CS", "Ionic(M)"],
    )

In [None]:
def _csv_row(label):
    """Format the CSV line of one entry: label, sequence, length, mean G-score, G-scores."""
    sequence = sequences[label]
    scores = gscores[label]
    # the per-residue scores are stored as a JSON list inside a quoted field
    scores_json = json.dumps(list(scores))
    return f'{label},{sequence},{len(sequence)},{np.nanmean(scores)},"{scores_json}"\n'


# Write one line per entry, in the order of `labels`, after the header line.
with open(f"{DATA_PATH}/PeptoneDB-Integrative.csv", "w") as f:
    f.write("label,sequence,length,mean_gscore,gscores\n")
    f.writelines(_csv_row(label) for label in labels)