In [149]:
import pandas as pd
import numpy as np
import os

sdrf_basis = pd.read_csv('PXD017710-tmt.sdrf.tsv', sep='\t')
sdrf_basis.head()

Unnamed: 0,source name,characteristics[organism],characteristics[organism part],characteristics[sex],characteristics[age],characteristics[developmental stage],characteristics[ancestry category],characteristics[cell type],characteristics[disease],characteristics[cell line],...,comment[modification parameters ],comment[modification parameters ].1,comment[modification parameters ].2,comment[cleavage agent details],comment[cleavage agent details].1,comment[fragment mass tolerance],comment[precursor mass tolerance],comment[data file],factor value[infect],factor value[time]
0,Sample 1,Homo sapiens,colon,male,72,adult,caucasian,not available,colon cancer,CaCo-2,...,NT=Carbamidomethyl;TA=C;AC=UNIMOD:4;MT=fixed,NT=Oxidation;TA=M;AC=UNIMOD:35;MT=variable,NT=13C6-15N4;TA=R;AC=UNIMOD:267;MT=variable,NT=Trypsin,NT=Lys-C,not available,not available,20200219_KKL_SARS_CoV2_pool1_F1.raw,bridge mixed pool,none
1,Sample 2,Homo sapiens,colon,male,72,adult,caucasian,not available,colon cancer,CaCo-2,...,NT=Carbamidomethyl;TA=C;AC=UNIMOD:4;MT=fixed,NT=Oxidation;TA=M;AC=UNIMOD:35;MT=variable,NT=13C6-15N4;TA=R;AC=UNIMOD:267;MT=variable,NT=Trypsin,NT=Lys-C,not available,not available,20200219_KKL_SARS_CoV2_pool1_F1.raw,none,2 hour
2,Sample 3,Homo sapiens,colon,male,72,adult,caucasian,not available,colon cancer,CaCo-2,...,NT=Carbamidomethyl;TA=C;AC=UNIMOD:4;MT=fixed,NT=Oxidation;TA=M;AC=UNIMOD:35;MT=variable,NT=13C6-15N4;TA=R;AC=UNIMOD:267;MT=variable,NT=Trypsin,NT=Lys-C,not available,not available,20200219_KKL_SARS_CoV2_pool1_F1.raw,none,6 hour
3,Sample 4,Homo sapiens,colon,male,72,adult,caucasian,not available,colon cancer,CaCo-2,...,NT=Carbamidomethyl;TA=C;AC=UNIMOD:4;MT=fixed,NT=Oxidation;TA=M;AC=UNIMOD:35;MT=variable,NT=13C6-15N4;TA=R;AC=UNIMOD:267;MT=variable,NT=Trypsin,NT=Lys-C,not available,not available,20200219_KKL_SARS_CoV2_pool1_F1.raw,none,10 hour
4,Sample 5,Homo sapiens,colon,male,72,adult,caucasian,not available,colon cancer,CaCo-2,...,NT=Carbamidomethyl;TA=C;AC=UNIMOD:4;MT=fixed,NT=Oxidation;TA=M;AC=UNIMOD:35;MT=variable,NT=13C6-15N4;TA=R;AC=UNIMOD:267;MT=variable,NT=Trypsin,NT=Lys-C,not available,not available,20200219_KKL_SARS_CoV2_pool1_F1.raw,none,24 hour


Preprocess step: merge columns with the same base name

In [150]:
sdrf = sdrf_basis.copy()
base_columns = [i.split(']')[0] for i in sdrf_basis.columns]
duplicate_base_columns = list(set([i for i in base_columns if base_columns.count(i) > 1]))
for c in duplicate_base_columns:
    #get columns with the same base name
    columns = [i for i in sdrf_basis.columns if c in i]
    print(columns)

    #Merge columns and drop original column
    sdrf[f'new_{c}'] = sdrf[columns].apply(lambda x: ', '.join(x.dropna().astype(str)), axis=1)
    sdrf = sdrf.drop(columns, axis=1)
    #rename
    sdrf = sdrf.rename(columns={f'new_{c}': str(c)+']'})


['comment[cleavage agent details]', 'comment[cleavage agent details].1']
['comment[modification parameters ]', 'comment[modification parameters ].1', 'comment[modification parameters ].2']


Samples, runs and factor values

In [151]:

# Step 1: Extract samples and biological replicate information
samples = {}
sample_names = sdrf['source name'].unique()
for idx, sample_name in enumerate(sample_names):
    sample_data = sdrf[sdrf['source name'] == sample_name].iloc[0]
    samples[f"sample {idx + 1}"] = {
        "name": sample_name,
        "biological_replicate": sample_data.get('characteristics[biological replicate]', '1')
    }

# Step 2: Extract runs and assays from the data, considering labels from comment[label] column
runs = {}
run_index = 1

for idx, row in sdrf.iterrows():
    data_file = row['comment[data file]']

    # Check if the data file already exists in the runs dictionary
    run_name = None
    for existing_run_name, run_data in runs.items():
        if run_data['data file'] == data_file:
            run_name = existing_run_name
            break
    if not run_name:
        #new run
        run_name = f"run {run_index}"
        runs[run_name] = {
            "assays": [],
            "file uri": f"ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2018/08/{row['comment[data file]']}",
            "data file": row['comment[data file]']
        }
        run_index += 1
    
    sample_name = row['source name']
    fraction = row.get('comment[fraction]', '1')
    technical_replicate = row.get('comment[technical replicate]', '1')


    # Construct the factor values (e.g., for enrichment method, time, etc.)
    factor_values = {}
    for column in row.index:
        if 'factor value' in column:
            factor_name = column.split('[', 1)[1].split(']', 1)[0]
            factor_values[factor_name] = row[column]

    # Extract the label from the comment column (assuming multiple labels exist in the comment[label] column)
    label = row.get('comment[label]', None)

    # If the label exists, create an assay for this label
    if label:
        #create a new assay for each label
        assay = {
            "sample": sample_name,
            "label": label,  # The label is taken directly from the comment[label] column
            "fraction": fraction,
            "technical replicate": technical_replicate,
            "factor values": factor_values
        }
        runs[run_name]["assays"].append(assay)

# Step 3: Extract and store factor values for different labelling channels
factor_columns = [column for column in sdrf.columns if 'factor value' in column]
factor_values = {}

for column in factor_columns:
    name = column.split('[', 1)[1].split(']', 1)[0]
    factor_values[name] = sdrf[column].unique()


Sample characteristics

{'infect': array(['bridge mixed pool', 'none',
        'Severe acute respiratory syndrome coronavirus 2'], dtype=object),
 'time': array(['none', ' 2 hour', ' 6 hour', ' 10 hour', ' 24 hour'], dtype=object)}

In [169]:
characteristic_columns = [col for col in sdrf.columns if 'characteristics' in col]
#drop technical replicate, biological replicate, file uri and data file from characteristic_columns
leave_out = ['biological replicate']
for column in sdrf.columns[sdrf.columns.str.contains('factor value')]:
    name = column.split('[', 1)[1].split(']', 1)[0]
    leave_out.append(name)

characteristic_columns = [characteristic for characteristic in characteristic_columns if not any(leave_out in characteristic for leave_out in leave_out)]
sample_characteristics = {}
common_group = {"apply": "ALL"}
shared_cols = []
# Loop through all characteristic columns to create groupings
for col in characteristic_columns:
    # Get the unique values in the current column
    unique_values = sdrf[col].unique()
    # If there is only one unique value across all samples, create a "common" group
    if len(unique_values) == 1:
        char_name = col.split('[')[1].split(']')[0]
        shared_cols.append(col)
        if char_name not in common_group:
            common_group[char_name] = unique_values[0]

sample_characteristics["comomn"] = common_group

notshared_characteristic_columns = [col for col in characteristic_columns if col not in shared_cols]
grouped_samples = {} 
if len(notshared_characteristic_columns) > 0:
    for idx, row in sdrf.iterrows():
        sample_values = tuple(row[col] for col in notshared_characteristic_columns)

        if sample_values not in grouped_samples:
            grouped_samples[sample_values] = []
        
        grouped_samples[sample_values].append(row['source name'])

    group_counter = 1 
    for sample_values, samples_in_group in grouped_samples.items():
        group_key = f"sample_chara_{group_counter}"
        group_counter += 1
        group = {
            "apply": samples_in_group
        }
        for col, value in zip(notshared_characteristic_columns, sample_values):
            characteristic_name = col.split('[')[1].split(']')[0]
            group[characteristic_name] = value
            

        sample_characteristics[group_key] = group


Assay characteristics

In [170]:
comment_columns = [col for col in sdrf.columns if 'comment' in col]
#drop technical replicate, biological replicate, file uri and data file from comment_columns
leave_out = ['technical replicate', 'biological replicate', 'file uri', 'data file', 'fraction', 'label']
for column in sdrf.columns[sdrf.columns.str.contains('factor value')]:
    name = column.split('[', 1)[1].split(']', 1)[0]
    leave_out.append(name)
comment_columns = [comment for comment in comment_columns if not any(leave_out in comment for leave_out in leave_out)]
assay_characteristics = {}
common_group = {"apply": "ALL"}
shared_cols = []


# Loop through all characteristic columns to create groupings
for col in comment_columns:
    # Get the unique values in the current column
    unique_values = sdrf[col].unique()
    # If there is only one unique value across all samples, create a "common" group
    if len(unique_values) == 1:
        char_name = col.split('[')[1].split(']')[0]
        shared_cols.append(col)
        if char_name not in common_group:
            common_group[char_name] = unique_values[0]

assay_characteristics["comomn"] = common_group

notshared_comment_columns = [col for col in comment_columns if col not in shared_cols]
grouped_samples = {} 
if len(notshared_comment_columns) > 0:
    for idx, row in sdrf.iterrows():
        sample_values = tuple(row[col] for col in notshared_comment_columns)

        if sample_values not in grouped_samples:
            grouped_samples[sample_values] = []
        
        grouped_samples[sample_values].append(row['assay name'])

    group_counter = 1 
    for sample_values, samples_in_group in grouped_samples.items():
        group_key = f"assay_chara_{group_counter}"
        group_counter += 1
        group = {
            "apply": samples_in_group
        }
        for col, value in zip(notshared_comment_columns, sample_values):
            characteristic_name = col.split('[')[1].split(']')[0]
            group[characteristic_name] = value
            

        assay_characteristics[group_key] = group


In [171]:
import json
import numpy as np



json_data ={
    "samples": samples,
    "runs": runs,
    "factor values": factor_values,
    "sample characteristics": sample_characteristics,
    "assay characteristics": assay_characteristics}

# Convert int64 values in the JSON data
def convert_int64_values(data):
    if isinstance(data, dict):
        return {key: convert_int64_values(value) for key, value in data.items()}
    if isinstance(data, list):
        return [convert_int64_values(item) for item in data]
    return convert_int64(data)

# Combine all into one json
# Convert numpy arrays to lists
def convert_numpy_arrays(data):
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, dict):
        return {key: convert_numpy_arrays(value) for key, value in data.items()}
    if isinstance(data, list):
        return [convert_numpy_arrays(item) for item in data]
    return data

# Convert int64 values in the JSON data and numpy arrays to lists
def convert_data(data):
    data = convert_int64_values(data)
    data = convert_numpy_arrays(data)
    return data

# Convert the data and serialize to JSON
json_data = convert_data(json_data)
json_string = json.dumps(json_data)
json_string


'{"samples": {"sample 1": {"name": "Sample 1", "biological_replicate": 1}, "sample 2": {"name": "Sample 2", "biological_replicate": 1}, "sample 3": {"name": "Sample 3", "biological_replicate": 1}, "sample 4": {"name": "Sample 4", "biological_replicate": 1}, "sample 5": {"name": "Sample 5", "biological_replicate": 1}, "sample 6": {"name": "Sample 6", "biological_replicate": 1}, "sample 7": {"name": "Sample 7", "biological_replicate": 1}, "sample 8": {"name": "Sample 8", "biological_replicate": 1}, "sample 9": {"name": "Sample 9", "biological_replicate": 1}}, "runs": {"run 1": {"assays": [{"sample": "Sample 1", "label": "TMT131", "fraction": "1", "technical replicate": 1, "factor values": {"infect": "bridge mixed pool", "time": "none"}}, {"sample": "Sample 2", "label": "TMT127N", "fraction": "1", "technical replicate": 1, "factor values": {"infect": "none", "time": " 2 hour"}}, {"sample": "Sample 3", "label": "TMT127C", "fraction": "1", "technical replicate": 1, "factor values": {"infect