# Interpretability strategies for machine learning language models for antibody sequences
## Data Tokenization
Octave Malamoud CID 02504015

# Import

In [1]:
from transformers import (
    RobertaConfig,
    RobertaTokenizer,
    RobertaForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from torch.utils.data import DataLoader

import numpy as np
import torch
from datasets import load_dataset,Dataset, DatasetDict
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import matplotlib.patches as mpatches
import umap
from scipy.spatial.distance import cosine
import blosum as bl
from datasets import load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the raw antibody table into `data`.
# skiprows=1 discards the first line of the file, so the second line is read as the
# header (presumably the file starts with a metadata line — TODO confirm against the CSV).
# NOTE(review): absolute, machine-specific path; the notebook cannot be re-run elsewhere as-is.
data = pd.read_csv("C:/Users/omala/Downloads/Mixthuman2.csv",delimiter=",",skiprows=1)
# Last expression of the cell: echoes the available column names.
data.columns

Index(['sequence', 'locus', 'stop_codon', 'vj_in_frame', 'v_frameshift',
       'productive', 'rev_comp', 'complete_vdj', 'v_call', 'd_call', 'j_call',
       'sequence_alignment', 'germline_alignment', 'sequence_alignment_aa',
       'germline_alignment_aa', 'v_alignment_start', 'v_alignment_end',
       'd_alignment_start', 'd_alignment_end', 'j_alignment_start',
       'j_alignment_end', 'v_sequence_alignment', 'v_sequence_alignment_aa',
       'v_germline_alignment', 'v_germline_alignment_aa',
       'd_sequence_alignment', 'd_sequence_alignment_aa',
       'd_germline_alignment', 'd_germline_alignment_aa',
       'j_sequence_alignment', 'j_sequence_alignment_aa',
       'j_germline_alignment', 'j_germline_alignment_aa', 'fwr1', 'fwr1_aa',
       'cdr1', 'cdr1_aa', 'fwr2', 'fwr2_aa', 'cdr2', 'cdr2_aa', 'fwr3',
       'fwr3_aa', 'fwr4', 'fwr4_aa', 'cdr3', 'cdr3_aa', 'junction',
       'junction_length', 'junction_aa', 'junction_aa_length', 'v_score',
       'd_score', 'j_score', 'v_

# Data selection

In [26]:
# Selection thresholds, expressed as numbers of characters in the region strings.
MIN_RESIDUES_BEFORE_CDR1 = 20  # minimum length of 'fwr1'
MIN_RESIDUES_AFTER_CDR3 = 10   # minimum length of 'fwr4'
# Inclusive (min, max) length bounds for each CDR.
CDR1_RANGE = (5, 12)
CDR2_RANGE = (1, 10)
CDR3_RANGE = (5, 38)


def verify_sequence_criteria(row):
    """Tell whether a row's regions satisfy the length-based selection criteria.

    Parameters
    ----------
    row : mapping-like
        Object indexable by the keys 'fwr1', 'cdr1', 'fwr2', 'cdr2', 'fwr3',
        'cdr3' and 'fwr4' (for example a row handed over by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    bool
        True when 'fwr1' has at least MIN_RESIDUES_BEFORE_CDR1 characters,
        'fwr4' has at least MIN_RESIDUES_AFTER_CDR3 characters, and each CDR
        length lies inside its inclusive range. False otherwise, including
        when any of the seven regions has no length (NaN, None, ...).

    Raises
    ------
    KeyError
        If one of the seven region keys is absent from ``row``.
    """
    region_names = ("fwr1", "cdr1", "fwr2", "cdr2", "fwr3", "cdr3", "fwr4")
    try:
        lengths = {name: len(row[name]) for name in region_names}
    except TypeError:
        # A missing region (float NaN / None) has no len(); reject the row
        # instead of crashing so the function is safe without pre-filtering.
        return False

    # Enough residues before CDR1 and after CDR3.
    if lengths["fwr1"] < MIN_RESIDUES_BEFORE_CDR1:
        return False
    if lengths["fwr4"] < MIN_RESIDUES_AFTER_CDR3:
        return False

    # Every CDR must fall inside its inclusive (min, max) bounds.
    cdr_bounds = (
        ("cdr1", CDR1_RANGE),
        ("cdr2", CDR2_RANGE),
        ("cdr3", CDR3_RANGE),
    )
    return all(low <= lengths[name] <= high for name, (low, high) in cdr_bounds)
    
 
# Build the working frame: the V-gene call plus the amino-acid string of every
# region, copied from `data` under shorter column names.
column_sources = {
    "v_call": "v_call",
    "fwr1": "fwr1_aa",
    "cdr1": "cdr1_aa",
    "fwr2": "fwr2_aa",
    "cdr2": "cdr2_aa",
    "fwr3": "fwr3_aa",
    "cdr3": "cdr3_aa",
    "fwr4": "fwr4_aa",
}
df = pd.DataFrame()
for target_column, source_column in column_sources.items():
    df[target_column] = data[source_column]

# Full sequence = the seven regions concatenated in order.
full_sequence = df["fwr1"]
for region_name in ("cdr1", "fwr2", "cdr2", "fwr3", "cdr3", "fwr4"):
    full_sequence = full_sequence + df[region_name]
df["sequence"] = full_sequence

# Concatenating with a missing region yields a float (NaN) instead of a string;
# remove those rows and renumber.
is_float = df["sequence"].apply(lambda value: isinstance(value, float))
float_indices = df[is_float].index
df = df.drop(index=float_indices).reset_index(drop=True)

# Flag each row, report the counts, keep only passing rows, report again.
df["meets_criteria"] = df.apply(verify_sequence_criteria, axis=1)
print(df["meets_criteria"].value_counts())
df = df.loc[df["meets_criteria"]].copy()
print(df["meets_criteria"].value_counts())
df = df.reset_index(drop=True)
df

meets_criteria
False    983685
True       6138
Name: count, dtype: int64
meets_criteria
True    6138
Name: count, dtype: int64


Unnamed: 0,v_call,fwr1,cdr1,fwr2,cdr2,fwr3,cdr3,fwr4,sequence,meets_criteria
0,IGHV2-5*02,ESGPTLVNPTQTLTLTCTFS,GFSLSTGGVS,VAWIRQPPGKALEWIAS,INWGDDK,RYSPSLKSRLTLTKDNSKNQVVLTLTNMNPVDTAPYYC,SQRVGSRGSFDY,WGQGTLVTVSS,ESGPTLVNPTQTLTLTCTFSGFSLSTGGVSVAWIRQPPGKALEWIA...,True
1,IGHV2-5*01,ASGPTLVNPTQTLTLTCTFS,GFSLSTSGVG,VGWIRQPPGKALEWLAL,IYWNDDK,RYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYC,AHSSAYYDFWSGYQYYFDY,WGQGTLVTVSS,ASGPTLVNPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLA...,True
2,IGHV2-5*01,ESGPTLVNPTQTLTLTCTFS,GFSLSTSQVG,VGWIRQPPGKALEWLAH,VYWNDAK,YYSLSLKTRLTITKDTSKNQVVLTMTNMDPVDTATYFC,AHLNTRGYYFDY,WGQGALVTVSS,ESGPTLVNPTQTLTLTCTFSGFSLSTSQVGVGWIRQPPGKALEWLA...,True
3,IGHV2-5*01,QSGPTLVNPTQTLTLTCNFS,GFSLTTRGVG,VAWIRQPPGKALEWLAL,IYWNDDK,RCSPSLKSRLTITKDTSKNEVVLTMTNMDPADTATYYC,AQGGGGMDV,WGQGTTVTVSS,QSGPTLVNPTQTLTLTCNFSGFSLTTRGVGVAWIRQPPGKALEWLA...,True
4,IGHV2-5*02,ESGPTLVNPTQTLTLTCTVS,GVSLTTSGVS,VGWIRQPPGKALEWLAL,IFWDDDK,RYSPALKSRLTVTKDTSKNQVVLTLTNVDPVDTATYYC,AQGTPRTSMLAY,WGQGIRVTVSS,ESGPTLVNPTQTLTLTCTVSGVSLTTSGVSVGWIRQPPGKALEWLA...,True
...,...,...,...,...,...,...,...,...,...,...
6133,IGHV2-5*01,ASGPTLVNPTPTLTLTCTVS,GFSLSASGEG,VGWIRQPPGKALEWLGV,LYWHDNT,RYTRYSPSLKNRLAITEDTSKNQVVLTLTNMDPVDTATYFC,AHRYDFWTGYYFLSYFDY,WGQGMLVTVSS,ASGPTLVNPTPTLTLTCTVSGFSLSASGEGVGWIRQPPGKALEWLG...,True
6134,IGHV2-5*02,ESGPTLVNPTQTLTLTCTFS,GFSLSTSGVG,VGWIRQPPGKALERLAL,IYWDDDK,HYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYC,AHLYGGNSVDY,WGQGTLVTVSS,ESGPTLVNPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALERLA...,True
6135,IGHV2-5*02,ESGPTLVNPTQTLTLTCTFS,GFSLSTSEAG,VGWIRQPPGKALEWLAL,IYWDDDK,RYSPSLKSRLTITKDTSKNQVVLTMTNLDPVDTATYYC,AHSSGIDAFDI,WGQGTMVTVSS,ESGPTLVNPTQTLTLTCTFSGFSLSTSEAGVGWIRQPPGKALEWLA...,True
6136,IGHV2-5*01,QSGPTLVNPTQTLTLTCTFS,GFSLSTSGVG,VGWIRQPPGKALEWLAL,IYWNDAK,LYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYC,AHRRPAYYDSRVQVWYFDY,WGQGTLVTVSS,QSGPTLVNPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLA...,True


In [57]:
# Stack data1 and data2 row-wise into a single frame with a fresh 0..n-1 index
# (ignore_index=True replaces the separate reset_index(drop=True) call).
# NOTE(review): data1 and data2 are not defined in the cells visible here —
# presumably they were created by cells that are no longer in the notebook;
# confirm and restore them so the notebook survives Restart & Run All.
df = pd.concat([data1, data2], ignore_index=True)

In [59]:
# Persist the combined table as CSV. No `index` argument is given, so pandas'
# default applies and the row index is written as a leading unnamed column.
# NOTE(review): absolute, machine-specific output path.
df.to_csv("C:/Users/omala/Downloads/dataset_12.csv")


# Tokenization

In [63]:
# Initialise the tokeniser from a tokenizer directory saved on the local disk.
# NOTE(review): absolute, machine-specific path.
tokenizer = RobertaTokenizer.from_pretrained(
    "C:/Users/omala/OneDrive/Bureau/Imperial/Project/code/antibody-tokenizer/",
)

# Initialise the data collator, which is necessary for batching.
# It is configured for masked-language modelling (mlm=True) with
# mlm_probability=0.15.
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
# Seed torch's global random generator. Being the last expression of the cell,
# the call's return value (the Generator object) is echoed as the cell output.
torch.manual_seed(42)

<torch._C.Generator at 0x1f3bc470430>

In [64]:

# Positional 80 / 10 / 10 split of the "sequence" column into train / eval / test.
# Boundaries are computed from the number of rows (len), not Series.count():
# count() ignores null values, so with any null present the positional slices
# would be misaligned and the last rows silently left out of the test split.
# NOTE(review): rows are taken in their current order, with no shuffling — if
# `df` is ordered (e.g. one source table after the other) the three splits will
# not share the same distribution; confirm this is intended.
sequences = df[["sequence"]]
n_rows = len(sequences)
train_len = int(n_rows * 0.8)  # exclusive end position of the train split
eval_len = int(n_rows * 0.9)   # exclusive end position of the eval split (not its size)

# Convert the three slices to Hugging Face Datasets
train_dataset = Dataset.from_pandas(sequences.iloc[:train_len])
eval_dataset = Dataset.from_pandas(sequences.iloc[train_len:eval_len])
test_dataset = Dataset.from_pandas(sequences.iloc[eval_len:])
dataset_dict = DatasetDict({
    "train": train_dataset,
    "eval": eval_dataset,
    "test": test_dataset,
})
# Tokenize the "sequence" field of a batch
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over ``examples["sequence"]``.

    The tokenizer is asked for fixed-length output (padding="max_length",
    truncation=True, max_length=150) and for the special-tokens mask.

    Parameters
    ----------
    examples : mapping
        Batch exposing the sequences under the key "sequence".

    Returns
    -------
    Whatever the tokenizer call returns, unchanged.
    """
    sequences = examples["sequence"]
    encoding = tokenizer(
        sequences,
        max_length=150,
        truncation=True,
        padding="max_length",
        return_special_tokens_mask=True,
    )
    return encoding

# Tokenize every split of dataset_dict: tokenize_function is applied in batched
# mode (batched=True) and the raw "sequence" column is removed from the result.
tokenized_dataset = dataset_dict.map(tokenize_function, batched=True, remove_columns=["sequence"])

# Display the tokenized dataset (splits, feature names and row counts)
print(tokenized_dataset)


Map: 100%|██████████| 1598902/1598902 [03:53<00:00, 6861.41 examples/s]
Map: 100%|██████████| 199863/199863 [00:28<00:00, 6921.59 examples/s]
Map: 100%|██████████| 199863/199863 [00:28<00:00, 6907.60 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 1598902
    })
    eval: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 199863
    })
    test: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 199863
    })
})





In [10]:
# Write the tokenized splits to a local directory for later reuse.
# NOTE(review): absolute, machine-specific output path.
tokenized_dataset.save_to_disk("C:/Users/omala/OneDrive/Bureau/Imperial/Project/code/antibody-tokenized-dataset/")