In [None]:
import re
import pandas as pd

In [None]:
# Location of the Thera-SAbDab sequence/structure spreadsheet (downloaded on every run).
url = 'https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/static/downloads/TheraSAbDab_SeqStruc_OnlineDownload.xlsx'

# Fetch and parse the workbook into a DataFrame; this needs network access.
df = pd.read_excel(url)

# List the column names so the exact (and rather unusual) headers used by the
# filters below can be copied verbatim. A bare `df.head()` placed before this
# line was removed: only the final expression of a cell is displayed, so it
# produced no output.
print(df.columns)

Index(['Therapeutic', 'Format', 'CH1 Isotype', 'VD LC',
       'Highest_Clin_Trial (Feb '25)', 'Est. Status', 'HeavySequence',
       'LightSequence', 'HeavySequence(ifbispec)', 'LightSequence(ifbispec)',
       '100% SI Structure', '99% SI Structure', '95-98% SI Structure',
       'Year Proposed', 'Year Recommended', 'Target', 'Companies',
       'Conditions Approved', 'Conditions Active', 'Conditions Discontinued',
       'Development Tech', 'Notes',
       'Genetics (Bispecifics delimited with semicolon)',
       'Alternative Therapeutic Names'],
      dtype='object')


In [None]:
# Normalize text: strip surrounding whitespace and lowercase every string cell
# so the equality filters below are insensitive to case and padding.
def _normalize_cell(value):
    """Return a stripped, lowercased copy of a string; leave any other value untouched."""
    return value.strip().lower() if isinstance(value, str) else value

# Work cell by cell instead of through the `.str` accessor: `.str` methods yield
# NaN for non-string entries, which silently wiped numbers stored in mixed-type
# object columns.
df = df.apply(lambda x: x.map(_normalize_cell) if x.dtype == "object" else x)

# Apply filters: whole antibodies that are approved or in phase III and whose
# estimated status is active.
filtered_df = df[
    (df["Format"] == "whole mab") &
    (df["Highest_Clin_Trial (Feb '25)"].isin(["approved", "phase-iii"])) &
    (df["Est. Status"] == "active")
].copy()

print(f"Number of therapeutics after filtering: {len(filtered_df)}")
filtered_df.head()

Number of therapeutics after filtering: 230


Unnamed: 0,Therapeutic,Format,CH1 Isotype,VD LC,Highest_Clin_Trial (Feb '25),Est. Status,HeavySequence,LightSequence,HeavySequence(ifbispec),LightSequence(ifbispec),...,Year Recommended,Target,Companies,Conditions Approved,Conditions Active,Conditions Discontinued,Development Tech,Notes,Genetics (Bispecifics delimited with semicolon),Alternative Therapeutic Names
3,abelacimab,whole mab,g1,lambda,phase-iii,active,qvqllesggglvqpggslrlscaasgftfstaamswvrqapgkgle...,qsvltqppsasgtpgqrvtiscsgsssnigsndvswyqqlpgtapk...,na,na,...,,f11,novartis;anthos therapeutics;labcorp drug deve...,na,venous thromboembolism; stroke;thrombosis,na,,feb '22: added in missing residues 97 and 98. ...,genetically human,maa-868
14,adalimumab,whole mab,g1,kappa,approved,active,evqlvesggglvqpgrslrlscaasgftfddyamhwvrqapgkgle...,diqmtqspsslsasvgdrvtitcrasqgirnylawyqqkpgkapkl...,na,na,...,,tnf/tnfa,abbvie;medimmune;180 life sciences,ankylosing spondylitis;behcet's syndrome;crohn...,dupuytren's contracture;cognition disorders;fr...,interstitial cystitis,cat phage display,many biosimilars and related drugs.,genetically human,d2e7;humira
15,adebrelimab,whole mab,g4,kappa,approved,active,qvqlvqsgaevkkpgasvkvsckasgytftsywmhwvrqapgqgle...,divltqspaslavspgqratitcrasesvsihgthlmhwyqqkpgq...,na,na,...,,pdl1/cd274,jiangsu hengrui medicine;atridia,small cell lung cancer,non-small cell lung cancer;oesophageal cancer;...,haematological malignancies,,genetics: musmus/homosap,chimeric and/or humanised,hti-1088;hti1088;hti-1316;hti1316;shr 1316;shr...
22,afimkibart,whole mab,g1,kappa,phase-iii,active,qvqlvqsgaevkkpgasvkvsckasgydftyygiswvrqapgqgle...,eivltqspatlslspgeratlscrasqsvssylawyqqkpgqaprl...,na,na,...,na,tnfsf15/tl1a/vegi,pfizer;roche,na,ulcerative colitis;crohn's disease;inflammator...,na,,,genetically human,pf 6480605;pf-06480605;pf06480605;rg-6631;rg 6...
26,alemtuzumab,whole mab,g1,kappa,approved,active,qvqlqesgpglvrpsqtlsltctvsgftftdfymnwvrqppgrgle...,diqmtqspsslsasvgdrvtitckasqnidkylnwyqqkpgkapkl...,na,na,...,,cd52,bayer healthcare pharmaceuticals inc.;dana-far...,chronic lymphocytic leukaemia;multiple sclerosis,lymphoma;t-cell prolymphocytic leukaemia,rheumatoid arthritis;diffuse large b cell lymp...,,,humanised,campath;campath-1h;ldp-03;ldp03;lemtrada;remniq


In [None]:
# Drop entries with missing VH or VL.
# Missing values in this table can appear as the literal text "na" rather than
# as a real NaN, so `dropna` alone does not catch them; such placeholder
# strings are removed as well (they would otherwise look like a two-residue
# sequence made of N and A).
seq_cols = ["HeavySequence", "LightSequence"]
is_placeholder = filtered_df[seq_cols].isin(["na", "n/a", ""]).any(axis=1)
filtered_df = filtered_df[~is_placeholder].dropna(subset=seq_cols)

print(f" after removing non available sequences: {len(filtered_df)}")

# Combine VH + VL to identify duplicates: two entries count as the same
# antibody only when both chains are identical.
filtered_df["vh_vl_pair"] = filtered_df["HeavySequence"] + "_" + filtered_df["LightSequence"]
filtered_df = filtered_df.drop_duplicates(subset=["vh_vl_pair"])

print(f"Final unique therapeutics: {len(filtered_df)}")

 after removing non available sequences: 230
Final unique therapeutics: 227


In [None]:
# Print the column list with non-null counts and dtypes for the de-duplicated frame.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 227 entries, 3 to 1129
Data columns (total 25 columns):
 #   Column                                           Non-Null Count  Dtype 
---  ------                                           --------------  ----- 
 0   Therapeutic                                      227 non-null    object
 1   Format                                           227 non-null    object
 2   CH1 Isotype                                      227 non-null    object
 3   VD LC                                            227 non-null    object
 4   Highest_Clin_Trial (Feb '25)                     227 non-null    object
 5   Est. Status                                      227 non-null    object
 6   HeavySequence                                    227 non-null    object
 7   LightSequence                                    227 non-null    object
 8   HeavySequence(ifbispec)                          227 non-null    object
 9   LightSequence(ifbispec)                        

In [None]:
# Show the distinct values of the genetics column; the next cell filters on two of them.
filtered_df["Genetics (Bispecifics delimited with semicolon)"].unique()

array(['genetically human', 'chimeric and/or humanised', 'humanised',
       'chimeric', 'murine', 'chimeric and humanised',
       'humanised;chimeric'], dtype=object)

In [None]:
# Keep only therapeutics annotated as fully human or humanised; chimeric,
# murine and mixed annotations are excluded.
genetics_col = "Genetics (Bispecifics delimited with semicolon)"
is_human_like = filtered_df[genetics_col].isin(["genetically human", "humanised"])
filtered_df2 = filtered_df.loc[is_human_like].copy()

print("Number of therapeutics after filtering:", len(filtered_df2))

Number of therapeutics after filtering: 183


In [None]:
# Row count of the genetics-filtered frame.
len(filtered_df2)

183

In [None]:
# Column holding the heavy-chain sequence, and a pattern that accepts only the
# 20 canonical amino-acid letters.
vh_col = "HeavySequence"
aa_re = re.compile(r"^[ACDEFGHIKLMNPQRSTVWY]+$")

# Sequences are compared in upper case; `mask` is True where the whole string
# consists of canonical residues.
vh_upper = filtered_df2[vh_col].str.upper()
mask = vh_upper.str.fullmatch(aa_re)
print("Canonical AA only:", mask.mean())  # fraction
thera_pos = filtered_df2.loc[mask].copy()

Canonical AA only: 1.0


In [None]:
# Store the heavy-chain sequences in upper case.
thera_pos = thera_pos.assign(**{vh_col: thera_pos[vh_col].str.upper()})

# Drop rows that repeat an already-seen VH sequence (first occurrence wins)
# and report how many were removed.
before = thera_pos.shape[0]
thera_pos = thera_pos.drop_duplicates(subset=vh_col)
after = thera_pos.shape[0]
print(f"Removed {before - after} duplicate VH-only sequences")

Removed 1 duplicate VH-only sequences


In [None]:
# Print the column list with non-null counts and dtypes after VH de-duplication.
thera_pos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182 entries, 3 to 1127
Data columns (total 25 columns):
 #   Column                                           Non-Null Count  Dtype 
---  ------                                           --------------  ----- 
 0   Therapeutic                                      182 non-null    object
 1   Format                                           182 non-null    object
 2   CH1 Isotype                                      182 non-null    object
 3   VD LC                                            182 non-null    object
 4   Highest_Clin_Trial (Feb '25)                     182 non-null    object
 5   Est. Status                                      182 non-null    object
 6   HeavySequence                                    182 non-null    object
 7   LightSequence                                    182 non-null    object
 8   HeavySequence(ifbispec)                          182 non-null    object
 9   LightSequence(ifbispec)                        

In [None]:
# Every row in this frame is a positive example, so it gets label 1.
thera_pos = thera_pos.assign(label=1)

# Minimal three-column view: name, heavy-chain sequence and label.
pos_for_model = thera_pos.loc[:, ["Therapeutic", "HeavySequence", "label"]].copy()

In [None]:
# Refresh the apt package index, then install the HMMER package; `-y` answers
# prompts automatically so the cell runs unattended.
# NOTE(review): presumably HMMER is needed by ANARCI, which is installed further down — confirm.
!apt-get update -y
!apt-get install -y hmmer

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://cli.github.com/packages stable InRelease
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,153 kB]
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,486 kB]
Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-security/unive

In [None]:
# Download the ANARCI source tree from GitHub into ./ANARCI.
# NOTE(review): no tag or commit is pinned, and git refuses to clone into an
# existing non-empty directory, so re-running this cell in the same session fails.
!git clone https://github.com/oxpig/ANARCI.git

Cloning into 'ANARCI'...
remote: Enumerating objects: 793, done.[K
remote: Counting objects: 100% (293/293), done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 793 (delta 249), reused 210 (delta 210), pack-reused 500 (from 2)[K
Receiving objects: 100% (793/793), 6.52 MiB | 16.91 MiB/s, done.
Resolving deltas: 100% (454/454), done.


In [None]:
# Enter the cloned repository so that setup.py is run from its own directory.
%cd ANARCI

# Install Biopython, then install ANARCI itself through its setup.py.
# NOTE(review): `!pip` / `!python` use whatever executables are first on PATH;
# confirm they belong to the same environment as the running kernel.
!pip install biopython
!python setup.py install

# Go back up to the directory the notebook was working in before.
%cd ..

/content/ANARCI
Collecting biopython
[0m  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86
!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()
INFO: ANARCI lives in:  /usr/local/lib/python3.12/dist-packages/ana

In [None]:
from anarci import anarci

def annotate_vh(seq):
    """
    Annotate a single VH amino-acid sequence with ANARCI (IMGT scheme).

    Parameters
    ----------
    seq : str
        Amino-acid sequence of one heavy-chain variable domain.

    Returns
    -------
    dict
        Keys, in this order: ``v_call``, ``j_call``, ``fwr1_aa``, ``cdr1_aa``,
        ``fwr2_aa``, ``cdr2_aa``, ``fwr3_aa``, ``cdr3_aa``, ``fwr4_aa``.
        The two germline calls are gene names (e.g. "IGHV3-23*01") or ``None``;
        the region entries are strings built from the numbered residues of the
        first domain, with gap characters removed.

    The defaults (``None`` calls, empty region strings) are returned unchanged
    when ``seq`` is not a non-empty string, when ANARCI raises, or when ANARCI
    reports no domain for the sequence. If only the germline assignment is
    missing, the regions are still filled in and the affected call stays ``None``.
    """
    region_names = (
        "fwr1_aa", "cdr1_aa", "fwr2_aa", "cdr2_aa",
        "fwr3_aa", "cdr3_aa", "fwr4_aa",
    )

    # Default values (in case ANARCI fails for some sequence)
    out = {"v_call": None, "j_call": None}
    out.update((name, "") for name in region_names)

    if not isinstance(seq, str) or len(seq) == 0:
        return out

    try:
        numbered, hit_info, _ = anarci([("antibody", seq)], scheme="imgt", assign_germline=True)
    except Exception:
        # Deliberate best-effort: a sequence ANARCI cannot process yields the
        # defaults instead of aborting the whole annotation run.
        return out

    # Extract germlines (v_call, j_call).
    # The per-sequence entry can be None (no domain found), and a germline slot
    # can be empty ([None, None]) when no assignment was made, so each level is
    # checked before indexing into it.
    hits = hit_info[0] if hit_info else None
    if hits:
        germlines = hits[0].get("germlines") or {}
        for gene_key, out_key in (("v_gene", "v_call"), ("j_gene", "j_call")):
            assignment = germlines.get(gene_key)
            if assignment and assignment[0]:
                out[out_key] = assignment[0][1]  # e.g. "IGHV3-23*01" / "IGHJ4*02"

    # Extract FR/CDR regions from the first numbered domain, if there is one.
    domains = numbered[0] if numbered else None
    if not domains:
        return out

    positions_list = domains[0][0]

    # Inclusive IMGT position ranges used to bin residues into regions.
    imgt_regions = (
        ("fwr1_aa", 1, 26),
        ("cdr1_aa", 27, 38),
        ("fwr2_aa", 39, 55),
        ("cdr2_aa", 56, 65),
        ("fwr3_aa", 66, 104),
        ("cdr3_aa", 105, 117),
        ("fwr4_aa", 118, 129),
    )

    pieces = {name: [] for name in region_names}
    for (pos, _insertion), aa in positions_list:
        # Gap symbols mark positions with no residue in this sequence.
        if aa in ("-", "."):
            continue
        # Insertion codes share the number of the position they follow, so
        # binning on the number alone keeps them in the right region.
        for name, first, last in imgt_regions:
            if first <= pos <= last:
                pieces[name].append(aa)
                break

    for name in region_names:
        out[name] = "".join(pieces[name])

    return out

In [None]:
# Renumber the rows 0..n-1 so they align with the annotation table built
# below, which gets a default integer index.
thera_pos = thera_pos.reset_index(drop=True)

# Annotate every heavy-chain sequence; each call yields one dict of fields.
anno_series = thera_pos["HeavySequence"].map(annotate_vh)

# One column per annotation field, one row per sequence.
anno_df = pd.DataFrame(anno_series.tolist())

# Attach the annotation columns to the right of the existing ones.
thera_pos = pd.concat((thera_pos, anno_df), axis="columns")

In [None]:
def get_v_family_from_v_call(v_call):
    """
    Reduce a V-gene call to its IGHV family, e.g. "IGHV3-23*01" -> "IGHV3".

    When several calls are packed into one value (separated by "," or "|"),
    only the first is considered. Returns None for missing input or when the
    first call contains no "IGHV<digits>" prefix.
    """
    if pd.isna(v_call):
        return None

    # Keep the text before the first comma, then before the first pipe.
    first_call = str(v_call).partition(",")[0].partition("|")[0].strip()

    found = re.search(r"(IGHV\d+)", first_call)
    if found is None:
        return None
    return found.group(1)

# Derive the IGHV family for every row from its V-gene call.
thera_pos["v_family"] = thera_pos["v_call"].map(get_v_family_from_v_call)

In [None]:
# Show the distinct IGHV families present in the set.
thera_pos["v_family"].unique()

array(['IGHV3', 'IGHV1', 'IGHV4', 'IGHV5', 'IGHV2', 'IGHV7', 'IGHV8',
       'IGHV6'], dtype=object)

In [None]:
# Number of therapeutics per IGHV family, most frequent first.
family_counts = thera_pos.loc[:, "v_family"].value_counts()
print(family_counts)

v_family
IGHV3    83
IGHV1    58
IGHV4    20
IGHV5     8
IGHV2     7
IGHV7     3
IGHV6     2
IGHV8     1
Name: count, dtype: int64


In [None]:
# Compute percentage distribution: share of each IGHV family among the rows
# that have a family assigned, expressed in percent.
# (A stray trailing "." on the assignment line made this cell fail with a SyntaxError.)
family_percent = thera_pos["v_family"].value_counts(normalize=True) * 100
print(family_percent.round(2))

SyntaxError: invalid syntax (ipython-input-1644147569.py, line 2)

In [None]:
# Save the columns needed for modelling: name, heavy-chain sequence, label,
# CDR3 sequence and IGHV family.
# The label column is called "label"; the previous spelling "labesl" does not
# exist in the frame and raised a KeyError.
positive_for_model = thera_pos[["Therapeutic", "HeavySequence", "label", "cdr3_aa", "v_family"]].copy()

# Write without the index so the CSV holds only the selected columns.
positive_for_model.to_csv("df_pos.csv", index=False)