# Installation of required packages
```bash
conda create -n sadie python=3.10.5 pip
conda activate sadie
pip install sadie-antibody
```

# Imports

In [1]:
# Built-in Python
from pathlib import Path

# Third-Party Libraries
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import pandas as pd
from sadie.airr import Airr

# Generic Functions

In [None]:
def rename_duplicate_column_names(df):
    """Make repeated column labels of ``df`` distinguishable.

    The first occurrence of a repeated label is left untouched; the k-th
    repeat (k = 1, 2, ...) is renamed to ``<label>_<k>``. Labels that occur
    only once are not changed.

    The columns of ``df`` are replaced in place and the same object is
    returned.
    """
    labels = pd.Series(df.columns)
    repeated = labels[labels.duplicated()].unique()
    for label in repeated:
        # Positions are looked up per label, after any renames made for
        # earlier labels.
        positions = labels.index[labels == label]
        for occurrence, position in enumerate(positions):
            if occurrence:
                labels.loc[position] = label + "_" + str(occurrence)
    df.columns = labels
    return df

Unnamed: 0,sequence_id_heavy,sequence_id_light,sequence_heavy,sequence_light
0,00000b62-d08f-4917-adf4-905941f380cc,00000b62-d08f-4917-adf4-905941f380cc,TACGTTGCGAACACCTACTACAATCCGTCCCTCAAGAGTCGAGTCT...,ACCGGCAGAAACCTGGCCAGGCTCCCAGGCTCCTCATTTATGGTGC...
1,00005592-a265-4e78-b1de-d105668dadce,00005592-a265-4e78-b1de-d105668dadce,GCAGTAGTGGCACTACGAAGTTCTACTCAGAATCTCTGAGGGGCCG...,GGTGCATCCATTCTACACAGTGGAGTCCCATCAAGGTTCAGTGGCA...
2,00008ba7-06e0-417a-b918-d42dbc6fc90c,00008ba7-06e0-417a-b918-d42dbc6fc90c,CAGTTATGTCGTCTGATGGCAGTGAGACATACTTTGCAGACTCCGT...,CGCACGCTCATCTACGCCACAAGTGCTCGCTTTTCTGGGGTCCCTG...
3,00010dc3-7723-4b3d-9453-7e1406e84c70,00010dc3-7723-4b3d-9453-7e1406e84c70,ACTGGGCAAGGGTTTGAGTGGATGGGATGGATGAACCCTAACACTG...,TCCCCTGTGTTGGTCATCTATCAAGATGCCAAGCGGCCCTCAGGGA...
4,0001417a-1b6c-4a43-acfb-c2c0283f73ce,0001417a-1b6c-4a43-acfb-c2c0283f73ce,CCAGAGACAATTCCAAGAACACGCTGCATTTGCAAATGACCAGCCT...,GGTACCAGCACAAACCTGGCCAGGCTCCCAGGCTCCTCGTCTATGC...
...,...,...,...,...
67988,fff17e16-46e2-49b5-966c-f41ffdd7509e,fff17e16-46e2-49b5-966c-f41ffdd7509e,CTCACCATCTCCAAGGACACCTCCAAAACCCAGGTGGTCCTTACAA...,ACTGTGATCTATGAGGATGATCACAGACCCTCTGGGGTCCCTGATC...
67989,fff4650e-83c7-4320-bdf5-48f8e5c6dd4b,fff4650e-83c7-4320-bdf5-48f8e5c6dd4b,AGAAGTTCCAGGGCAGAGTCACCATGACCGAGGACATATCTACAGA...,GTCATCTATTATGATAGCGACCGACCCTCAGGGATCCCTGAGCGAT...
67990,fff4dd7d-f140-4618-9006-131149021a9a,fff4dd7d-f140-4618-9006-131149021a9a,GACAGAGTCACCATGGCCAGGGACACGTCCACGAGCACAGCCTACA...,AGAAACCAGGGAAACCCCCTAAGCTCCTGATCTACGATGCATCCAA...
67991,fffac9d8-b7e0-4b68-a6ab-07e532ab1219,fffac9d8-b7e0-4b68-a6ab-07e532ab1219,AAGTGATAAATACTATGCAGACTCCGTGAAGGGCCGATTCACCATC...,ACCGGCAGAAACCGGGACAGCCTCCTAAGCTGCTCATTTACTGGGC...


# Globals

In [None]:
# Shared Airr instance built with reference_name="human"; the loader cells
# below call its run_fasta() on the FASTA files they write.
airr_api = Airr(reference_name="human")
# Folder searched recursively for *.csv.gz files by the "STD load" cell.
oas_folder = "OAS/data/OAS_paired"
# Input CSV of the "Leuko" cell (that cell assigns the same path again).
leuko_file = "OAS/data/D326651_Leuko_human_naive.csv.gz"
# Folder searched recursively for *.csv.gz files by the "Dekosky load" cell.
dekosky = "OAS/data/DeKosky_paired"

# OAS Manifest as static metadata

In [None]:
import json
import gzip


def spy_for_manifest(path):
    """Tell whether the first line of a CSV file starts with ``"{``.

    The loader cells use the result to decide whether the real CSV header
    sits on the second line (``header=1``) or on the first one.

    Args:
        path: ``str`` or ``Path`` of the file. Names ending in ``.gz`` are
            read through ``gzip``; anything else is opened as plain text.

    Returns:
        ``True`` if the first line starts with ``"{``, ``False`` otherwise.
        An empty file yields ``False``.
    """
    opener = gzip.open if str(path).endswith(".gz") else open
    with opener(path, "rt") as handle:
        # Only the first line is needed; readline() avoids loading the whole
        # file and returns "" for an empty file.
        first_line = handle.readline()
    return first_line.startswith('"{')


# spy_for_manifest("OAS/data/DeKosky_paired/ERR4082227_paired.csv.gz")

In [None]:
manifest = pd.read_csv("OAS/data/oas_manifest.csv.gz", compression="gzip")

# Keep only the manifest columns that are copied onto every paired row.
manifest = manifest[
    [
        "data_link",
        "run",
        "link",
        "author",
        "species",
        "bsource",
        "btype",
        "longitudinal",
        "age",
        "disease",
        "subject",
        "vaccine",
        "file_name",
    ]
]

# Human runs only, one row per run, then a lookup of run -> {column: value}.
manifest = manifest.query('species == "human"').drop_duplicates(["run"])
run2manifest = manifest.set_index("run", drop=False).to_dict(orient="index")

# Sanity check shown as the cell output: a known run must be present.
"ERR4082303" in run2manifest

True

# STD load

In [None]:
%%time
Path('parquet-paired').mkdir(exist_ok=True)

manifest = pd.read_csv("OAS/data/oas_manifest.csv.gz", compression='gzip')

paths = list(Path(oas_folder).glob('**/*.csv.gz')) 
# paths = [Path('OAS/data/DeKosky_paired/SRR10358525_paired.csv.gz')]

# Run OAS misc Sequences through SADIE
for path in paths:

    # filename = path.stem.split('_paired')[0] 
    filename = path.stem.split('_')[0] 
    
    if not filename.startswith('SRR'):
        continue
    
    print(filename)
    # Do not overwrite
    # if Path(f'parquet-paired/{filename}.parquet').exists():
    #     print('already exists', path)
    #     continue
    
    if spy_for_manifest(path):
        df = pd.read_csv(path, header=1, compression='gzip')
    else:
        df = pd.read_csv(path, compression='gzip')

    df['sequence_id_heavy'] = df['sequence_id_heavy'].astype(str)
    
    # Heavy
    sequences = [
        SeqRecord(id=sequenec_id, name=sequenec_id, seq=Seq(sequence))
        for sequenec_id, sequence in zip(df['sequence_id_heavy'], df['sequence_heavy'])
    ]
    with gzip.open(f"OAS/data/fasta-heavy/{filename}.fasta.gz", "wt") as output_handle:
        SeqIO.write(sequences, output_handle, "fasta")
    
    # Light
    sequences = [
        SeqRecord(id=sequenec_id, name=sequenec_id, seq=Seq(sequence))
        for sequenec_id, sequence in zip(df['sequence_id_light'], df['sequence_light'])
    ]
    with gzip.open(f"OAS/data/fasta-light/{filename}.fasta.gz", "wt") as output_handle:
        SeqIO.write(sequences, output_handle, "fasta")
        
    # Paired
    heavy_df = airr_api.run_fasta(f"OAS/data/fasta-heavy/{filename}.fasta.gz")
    light_df = airr_api.run_fasta(f"OAS/data/fasta-light/{filename}.fasta.gz")
    
    heavy_df['tmp_id'] = heavy_df['sequence_id'].apply(lambda x: x.rsplit('_', 1)[0])
    light_df['tmp_id'] = light_df['sequence_id'].apply(lambda x: x.rsplit('_', 1)[0])
    
    paired_df = pd.merge(heavy_df, light_df, how='outer', on='tmp_id', suffixes=('_heavy', '_light'))
    paired_df = paired_df.drop(['tmp_id'], axis=1)
    
    # open manifest
    run = filename.split('_')[0]
    for k, v in run2manifest[run].items():
        paired_df[k] = v

    # merge with shared fields
    # save new parquet
    paired_df.to_parquet(f'parquet-paired/{filename}.parquet')
    
    del heavy_df
    del light_df
    del paired_df

SRR10358525
SRR10358523
SRR10358524
CPU times: user 556 ms, sys: 112 ms, total: 668 ms
Wall time: 20.9 s


# Dekosky load

In [None]:
%%time
Path('parquet-paired').mkdir(exist_ok=True)

manifest = pd.read_csv("OAS/data/oas_manifest.csv.gz", compression='gzip')

paths = list(Path(dekosky).glob('**/*.csv.gz'))
# paths = [Path('OAS/data/DeKosky_paired/SRR10358525_paired.csv.gz')]

# Run OAS misc Sequences through SADIE
for path in paths:

    # filename = path.stem.split('_paired')[0] 
    filename = path.stem.split('_')[0] 
    print(filename)
    # Do not overwrite
    if Path(f'parquet-paired/{filename}.parquet').exists():
        print('already exists', path)
        continue
    
    if spy_for_manifest(path):
        df = pd.read_csv(path, header=1, compression='gzip')
    else:
        df = pd.read_csv(path, compression='gzip')

    df = rename_duplicate_column_names(df)
    df["sequence_id_heavy"] = df["Sequence ID"].astype(str)
    df["sequence_id_light"] = df["Sequence ID"].astype(str)
    df["sequence_heavy"] = df["VDJ NT seq"].astype(str)
    df["sequence_light"] = df["VDJ NT seq.1"].astype(str)

    df['sequence_id_heavy'] = df['sequence_id_heavy'].astype(str)
    
    # Heavy
    sequences = [
        SeqRecord(id=sequenec_id, name=sequenec_id, seq=Seq(sequence))
        for sequenec_id, sequence in zip(df['sequence_id_heavy'], df['sequence_heavy'])
    ]
    with gzip.open(f"OAS/data/fasta-heavy/{filename}.fasta.gz", "wt") as output_handle:
        SeqIO.write(sequences, output_handle, "fasta")
    
    # Light
    sequences = [
        SeqRecord(id=sequenec_id, name=sequenec_id, seq=Seq(sequence))
        for sequenec_id, sequence in zip(df['sequence_id_light'], df['sequence_light'])
    ]
    with gzip.open(f"OAS/data/fasta-light/{filename}.fasta.gz", "wt") as output_handle:
        SeqIO.write(sequences, output_handle, "fasta")
        
    # Paired
    heavy_df = airr_api.run_fasta(f"OAS/data/fasta-heavy/{filename}.fasta.gz")
    light_df = airr_api.run_fasta(f"OAS/data/fasta-light/{filename}.fasta.gz")
    
    heavy_df['tmp_id'] = heavy_df['sequence_id'].apply(lambda x: x.rsplit('_', 1)[0])
    light_df['tmp_id'] = light_df['sequence_id'].apply(lambda x: x.rsplit('_', 1)[0])
    
    paired_df = pd.merge(heavy_df, light_df, how='outer', on='tmp_id', suffixes=('_heavy', '_light'))
    paired_df = paired_df.drop(['tmp_id'], axis=1)
    
    # open manifest
    # run = filename.split('_')[0]
    # for k, v in run2manifest[run].items():
    #     paired_df[k] = v
    paired_df["run"] = filename
    paired_df["species"] = "human"
    paired_df["bsource"] = "PBMC"
    paired_df["author"] = "DeKosky"
    paired_df["btype"] = "Naive-B-Cells"
    paired_df["disease"] = None
    paired_df["file_name"] = filename
    
    # merge with shared fields
    # save new parquet
    paired_df.to_parquet(f'parquet-paired/{filename}.parquet')
    
    del heavy_df
    del light_df
    del paired_df

SRR1585265
already exists data/DeKosky_paired/SRR1585265_joined_NoAlleles.csv
SRR1585275
already exists data/DeKosky_paired/SRR1585275_joined_NoAlleles.csv
SRR1585248
already exists data/DeKosky_paired/SRR1585248_joined_NoAlleles.csv
SRR1585267


  df = pd.read_csv(tmpfile.name, sep="\t", dtype=IGBLAST_AIRR)  # type: ignore


SRR1585274
SRR1585249
CPU times: user 7.47 s, sys: 771 ms, total: 8.24 s
Wall time: 4min 32s


# Leuko

In [None]:
# Convert the Leuko CSV into one paired parquet file: write heavy/light FASTA
# files (barcode as record description), run both through `airr_api`, merge
# the results and add fixed metadata columns.
leuko_file = "OAS/data/D326651_Leuko_human_naive.csv.gz"

# Create every output folder here so the cell does not depend on an earlier
# loader cell having created them (gzip.open/to_parquet do not create folders).
Path("parquet-paired").mkdir(exist_ok=True)
Path("OAS/data/fasta-heavy").mkdir(parents=True, exist_ok=True)
Path("OAS/data/fasta-light").mkdir(parents=True, exist_ok=True)

for path in [Path(leuko_file)]:
    # Everything before the first underscore of the file name ("D326651").
    filename = path.stem.split("_")[0]
    print(filename)
    df = pd.read_csv(path, compression="gzip")

    # Heavy
    sequences = [
        SeqRecord(
            id=str(sequence_id),
            name=str(sequence_id),
            description=barcode,
            seq=Seq(sequence),
        )
        for barcode, sequence_id, sequence in zip(df["barcode"], df["id_heavy"], df["sequence_heavy"])
    ]
    with gzip.open(f"OAS/data/fasta-heavy/{filename}.fasta.gz", "wt") as output_handle:
        SeqIO.write(sequences, output_handle, "fasta")

    # Light
    sequences = [
        SeqRecord(
            id=str(sequence_id),
            name=str(sequence_id),
            description=barcode,
            seq=Seq(sequence),
        )
        for barcode, sequence_id, sequence in zip(df["barcode"], df["id_light"], df["sequence_light"])
    ]
    with gzip.open(f"OAS/data/fasta-light/{filename}.fasta.gz", "wt") as output_handle:
        SeqIO.write(sequences, output_handle, "fasta")

    # Paired
    heavy_df = airr_api.run_fasta(f"OAS/data/fasta-heavy/{filename}.fasta.gz")
    light_df = airr_api.run_fasta(f"OAS/data/fasta-light/{filename}.fasta.gz")

    # Join key: the sequence id with anything after its last "_" removed
    # (ids without "_" are used unchanged).
    heavy_df["tmp_id"] = heavy_df["sequence_id"].apply(lambda x: x.rsplit("_", 1)[0])
    light_df["tmp_id"] = light_df["sequence_id"].apply(lambda x: x.rsplit("_", 1)[0])

    # Outer merge keeps rows that are present for only one of the two chains.
    paired_df = pd.merge(
        heavy_df,
        light_df,
        how="outer",
        on="tmp_id",
        suffixes=("_heavy", "_light"),
    )
    paired_df = paired_df.drop(["tmp_id"], axis=1)
    # add manifest columns manually # TODO
    paired_df["run"] = "D326651"
    paired_df["species"] = "human"
    paired_df["bsource"] = "PBMC"
    paired_df["author"] = "Jonathan Hurtado"
    paired_df["btype"] = "Naive-B-Cells"
    paired_df["disease"] = None
    paired_df["file_name"] = "D326651"
    paired_df.to_parquet(f"parquet-paired/{filename}.parquet")

    # Release the frames once the parquet file is written.
    del heavy_df
    del light_df
    del paired_df

D326651


# Concat all parquets

In [None]:
# def run_airr(chain):
#     airr_api = Airr(reference_name='human')
#     dfs = []
#     for path in Path(f'OAS/data/fasta-{chain}').glob('*.fasta.gz'):
#         df = airr_api.run_fasta(path)
#         df.to_parquet(f'{path.stem}.parquet')
#     return pd.concat(dfs)

# heavy_df = run_airr('heavy')
# light_df = run_airr('light')

In [5]:
# paired_df = pd.merge(heavy_df, light_df, how='outer', on='sequence_id', suffixes=('_heavy', '_light'))
# paired_df.to_parquet('all.parquet')

# Make tailored manifest

In [None]:
import pandas as pd
import glob
from pathlib import Path

# Build a reduced manifest holding only the human runs for which a paired
# parquet file exists, and write it next to the full manifest.
manifest = pd.read_csv("OAS/data/oas_manifest.csv.gz", compression="gzip")
dfs = []
for filename in glob.glob("parquet-paired/*.parquet"):
    # Derive the run id before printing it: printing first raised NameError
    # on a fresh kernel and otherwise showed the previous iteration's id.
    runid = Path(filename).stem.split("_")[0]
    print(runid)
    # Runs absent from the manifest simply contribute an empty frame.
    auth_subdf = manifest.query(f'species=="human" & run == "{runid.upper()}"').drop_duplicates("author")
    dfs.append(auth_subdf)
# NOTE(review): reset_index() without drop=True writes the old row index into
# the CSV as an "index" column - confirm that this is wanted.
df = pd.concat(dfs).reset_index().sort_values("run")
df.to_csv("OAS/data/oas_manifest_human_paired.csv.gz", index=False, compression="gzip")
df.author.unique()

SRR10358524
ERR4082235
1287148
1287158
1287151
1279073
1287150
1287159
1287149
1287152
1279068
ERR4082303
SRR10358523
ERR4082227
1287153
ERR4082263
1279075
1279065
1287156
1287146
ERR4082259
ERR4082251
1279074
ERR4082267
1287147
1287157
ERR4082283
ERR4082243
SRR10358525
D326651
1287155
1287145
1279076
1279066
ERR4082275
ERR4082291
1287144
1287154
1279067
ERR4082299


array(['Jaffe et al., 2022', 'King et al., 2020', 'Eccles et al., 2020'],
      dtype=object)