In [1]:
# Python script for the analyses of the "Antibody affinity birth through somatic hypermutation" publication.
# This pipeline is divided into several sections. At the beginning of each section, a comment indicates which figures of the publication are generated by that section.

# The input sequences for these analyses are provided in the data folder. After a successful run, the results of each section are saved in the output folder.
# Emit a start marker as soon as the cell begins executing (before the imports below).
print('Running...')
import re
import operator

import os
#import sys
import pandas as pd
import numpy as np

import time
import itertools
import matplotlib.pyplot as plt
import glob
#import logomaker #https://logomaker.readthedocs.io

# Functions
def display_big():
    """Configure pandas display limits for wide tables.

    Sets at most 10 rows, up to 500 columns and a console width of 1000
    characters. Returns None; the only effect is on pandas' global options.
    """
    display_settings = (
        ('display.max_rows', 10),
        ('display.max_columns', 500),
        ('display.width', 1000),
    )
    for option_name, option_value in display_settings:
        pd.set_option(option_name, option_value)


# Apply the display settings once for the whole notebook session.
display_big()

Running...


In [2]:
# Root of the data tree, relative to the notebook's working directory.
data_folder = '../data'

# Input/output locations; each can be overridden through an environment
# variable and otherwise falls back to a sub-folder of data_folder.
input_folder = os.environ.get('VAR_IN_FOLDER', data_folder + '/input')
output_folder = os.environ.get('VAR_OUT_FOLDER', data_folder + '/output')

In [3]:
def set_output_folder(section_output):
    """Return the output directory of one pipeline section, creating it if needed.

    Parameters
    ----------
    section_output : str
        Name of the section sub-folder (e.g. '1_prep').

    Returns
    -------
    str
        data_folder + '/output/' + section_output.

    Notes
    -----
    The path is always built from ``data_folder``.
    NOTE(review): the module-level ``output_folder`` (which honours the
    VAR_OUT_FOLDER environment variable) is not consulted here -- confirm
    whether that is intended.
    """
    # A distinct local name avoids shadowing the module-level `output_folder`.
    section_dir = data_folder + '/output/' + section_output

    # exist_ok=True makes creation idempotent and avoids the race between a
    # separate "does it exist?" check and the actual makedirs call.
    os.makedirs(section_dir, exist_ok=True)
    return section_dir

In [4]:
# Section1: preparation

# Resolve (and create when missing) the output directory of each section
# used in this notebook, in the same order as the section numbering.
output_folder_prep, output_folder_num_miss, output_folder_freq_pos = [
    set_output_folder(section_name)
    for section_name in ('1_prep', '2_num_miss', '3_freq_per_position')
]

In [5]:
def _read_prep_table(table_name):
    """Read one tab-separated table from the '1_prep' output folder.

    The path is resolved relative to ``output_folder_prep``; the first line of
    the file is used as the header and the returned frame has a fresh 0..n-1
    index.
    """
    table_path = '{}/../1_prep/{}.tsv'.format(output_folder_prep, table_name)
    table = pd.read_csv(table_path, sep='\t', header=0, low_memory=False)
    return table.reset_index(drop=True)


# Nucleotide-level tables ("included" / "excluded" variants).
dfs_expanded_nts_included = _read_prep_table('dfs_expanded_nts_included')
dfs_expanded_nts_excluded = _read_prep_table('dfs_expanded_nts_excluded')

# Amino-acid-level tables ("included" / "excluded" variants).
dfs_expanded_aas_included = _read_prep_table('dfs_expanded_aas_included')
dfs_expanded_aas_excluded = _read_prep_table('dfs_expanded_aas_excluded')

In [6]:
def num_nt_miss(df):
    """Build, per chain, a table of sorted 'nt_miss' values with one column per group.

    Rows of ``df`` are split by chain ('VH', then 'VL') and grouped by
    status, mouse, dataset, chain and sub_dataset. For every group the
    'nt_miss' values are sorted in ascending order and stored in a column
    named after the group keys joined with '_'. Columns are placed side by
    side; shorter columns are padded with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'status', 'mouse', 'dataset', 'chain',
        'sub_dataset' and 'nt_miss'.

    Returns
    -------
    list of pandas.DataFrame
        Two frames, [VH table, VL table]. A chain without any rows yields an
        empty DataFrame.

    Notes
    -----
    NOTE(review): groupby drops rows whose key columns are NaN (pandas
    default). Confirm that labels such as 'NA' in 'sub_dataset' are not
    turned into NaN when the TSV is read back.
    """
    group_columns = ['status', 'mouse', 'dataset', 'chain', 'sub_dataset']
    nt_miss_list = []

    for chain in ['VH', 'VL']:
        df_per_chain = df[df['chain'] == chain]

        # Collect one sorted Series per group and concatenate once at the end,
        # instead of re-copying a growing DataFrame on every iteration.
        group_series = []
        for grouped, df_chain in df_per_chain.groupby(by=group_columns):
            # str() keeps the ID buildable when a key column was parsed as a
            # number (a plain '_'.join would raise TypeError).
            column_id = '_'.join(map(str, grouped))
            group_series.append(
                df_chain['nt_miss']
                .sort_values()
                .reset_index(drop=True)
                .rename(column_id)
            )

        # pd.concat refuses an empty list, so keep the empty-frame result
        # for chains that have no rows.
        if group_series:
            dfs_chain = pd.concat(group_series, axis=1)
        else:
            dfs_chain = pd.DataFrame()
        nt_miss_list.append(dfs_chain)
    return nt_miss_list

In [7]:
def num_aa_miss(df):
    """Build, per chain, a table of sorted 'aa_miss' values with one column per group.

    Rows of ``df`` are split by chain ('VH', then 'VL') and grouped by
    status, mouse, dataset, chain and sub_dataset. For every group the
    'aa_miss' values are sorted in ascending order and stored in a column
    named after the group keys joined with '_'. Columns are placed side by
    side; shorter columns are padded with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'status', 'mouse', 'dataset', 'chain',
        'sub_dataset' and 'aa_miss'.

    Returns
    -------
    list of pandas.DataFrame
        Two frames, [VH table, VL table]. A chain without any rows yields an
        empty DataFrame.

    Notes
    -----
    NOTE(review): groupby drops rows whose key columns are NaN (pandas
    default). Confirm that labels such as 'NA' in 'sub_dataset' are not
    turned into NaN when the TSV is read back.
    """
    group_columns = ['status', 'mouse', 'dataset', 'chain', 'sub_dataset']
    aa_miss_list = []

    for chain in ['VH', 'VL']:
        df_per_chain = df[df['chain'] == chain]

        # Collect one sorted Series per group and concatenate once at the end,
        # instead of re-copying a growing DataFrame on every iteration.
        group_series = []
        for grouped, df_chain in df_per_chain.groupby(by=group_columns):
            # str() keeps the ID buildable when a key column was parsed as a
            # number (a plain '_'.join would raise TypeError).
            column_id = '_'.join(map(str, grouped))
            group_series.append(
                df_chain['aa_miss']
                .sort_values()
                .reset_index(drop=True)
                .rename(column_id)
            )

        # pd.concat refuses an empty list, so keep the empty-frame result
        # for chains that have no rows.
        if group_series:
            dfs_chain = pd.concat(group_series, axis=1)
        else:
            dfs_chain = pd.DataFrame()
        aa_miss_list.append(dfs_chain)
    return aa_miss_list

In [8]:
# [VH table, VL table] of sorted nt_miss values for the "included" nucleotide data.
num_nt_miss_list_included=num_nt_miss(dfs_expanded_nts_included)

In [9]:
# [VH table, VL table] of sorted nt_miss values for the "excluded" nucleotide data.
num_nt_miss_list_excluded=num_nt_miss(dfs_expanded_nts_excluded)

In [10]:
# [VH table, VL table] of sorted aa_miss values for the "included" amino-acid data.
num_aa_miss_list_included=num_aa_miss(dfs_expanded_aas_included)

In [11]:
# [VH table, VL table] of sorted aa_miss values for the "excluded" amino-acid data.
num_aa_miss_list_excluded=num_aa_miss(dfs_expanded_aas_excluded)

# Sorting

In [12]:
# Desired ordering of the values in each of the five '_'-separated parts of a
# column ID (the parts are joined as status, mouse, dataset, chain, sub_dataset).
block1_order = ['Unimmunized', 'EarlyGC', 'LateGC', 'Published']
block2_order = ['B18-383', 'HA-uMT', 'HA-WT', 'HRO']
block3_order = ['OVA', 'APC', 'CGG', 'OVA-CTLA4', 'OVA-Isotype', 'CGG-CTLA4', 'CGG-Isotype', 'BM', 'SPL', 'PP', 'MLN']
block4_order = ['VH', 'VL']
block5_order = ['NA', '0-1', '1-1', '1-100', '1-1000', 'rep1', 'rep2']

# Value -> rank lookup tables derived from the orderings above.
block1_dict = dict(zip(block1_order, range(len(block1_order))))
block2_dict = dict(zip(block2_order, range(len(block2_order))))
block3_dict = dict(zip(block3_order, range(len(block3_order))))
block4_dict = dict(zip(block4_order, range(len(block4_order))))
block5_dict = dict(zip(block5_order, range(len(block5_order))))


def sort_key(x):
    """Return the 5-tuple of ranks used to order a column ID.

    ``x`` is split on '_' and its first five parts are looked up in
    block1_dict ... block5_dict respectively. A part that is missing from its
    lookup table is ranked ``float('inf')`` so that it sorts after all known
    values. An ID with fewer than five parts raises IndexError.
    """
    parts = x.split('_')
    unknown_rank = float('inf')
    # Resolved at call time so that later edits to the lookup tables are honoured.
    rank_tables = (block1_dict, block2_dict, block3_dict, block4_dict, block5_dict)
    return tuple(
        rank_tables[position].get(parts[position], unknown_rank)
        for position in range(5)
    )

In [13]:
# Order the columns of the "included" nt_miss tables with sort_key and export them.
# Column names are de-duplicated with dict.fromkeys (which keeps the table's own
# column order) rather than set(): set iteration order depends on string hashing,
# so IDs with identical sort keys could end up in a different order on every run.
vh_columns = sorted(dict.fromkeys(num_nt_miss_list_included[0].columns), key=sort_key)
num_nt_miss_vh = num_nt_miss_list_included[0][vh_columns]
num_nt_miss_vh.to_csv('{}/nt_miss_included_vh.tsv'.format(output_folder_num_miss), sep='\t', index=False)
# Largest nt_miss value across all VH columns.
print(num_nt_miss_vh.max().max())

vl_columns = sorted(dict.fromkeys(num_nt_miss_list_included[1].columns), key=sort_key)
num_nt_miss_vl = num_nt_miss_list_included[1][vl_columns]
num_nt_miss_vl.to_csv('{}/nt_miss_included_vl.tsv'.format(output_folder_num_miss), sep='\t', index=False)
# Largest nt_miss value across all VL columns.
print(num_nt_miss_vl.max().max())

92.0
33.0


In [14]:
# Order the columns of the "excluded" nt_miss tables with sort_key and export them.
# Column names are de-duplicated with dict.fromkeys (which keeps the table's own
# column order) rather than set(): set iteration order depends on string hashing,
# so IDs with identical sort keys could end up in a different order on every run.
vh_columns = sorted(dict.fromkeys(num_nt_miss_list_excluded[0].columns), key=sort_key)
num_nt_miss_vh = num_nt_miss_list_excluded[0][vh_columns]
num_nt_miss_vh.to_csv('{}/nt_miss_excluded_vh.tsv'.format(output_folder_num_miss), sep='\t', index=False)
# Largest nt_miss value across all VH columns.
print(num_nt_miss_vh.max().max())

vl_columns = sorted(dict.fromkeys(num_nt_miss_list_excluded[1].columns), key=sort_key)
num_nt_miss_vl = num_nt_miss_list_excluded[1][vl_columns]
num_nt_miss_vl.to_csv('{}/nt_miss_excluded_vl.tsv'.format(output_folder_num_miss), sep='\t', index=False)
# Largest nt_miss value across all VL columns.
print(num_nt_miss_vl.max().max())

92.0
33.0


In [15]:
# Order the columns of the "included" aa_miss tables with sort_key and export them.
# Column names are de-duplicated with dict.fromkeys (which keeps the table's own
# column order) rather than set(): set iteration order depends on string hashing,
# so IDs with identical sort keys could end up in a different order on every run.
vh_columns = sorted(dict.fromkeys(num_aa_miss_list_included[0].columns), key=sort_key)
num_aa_miss_vh = num_aa_miss_list_included[0][vh_columns]
num_aa_miss_vh.to_csv('{}/aa_miss_included_vh.tsv'.format(output_folder_num_miss), sep='\t', index=False)
# Largest aa_miss value across all VH columns.
print(num_aa_miss_vh.max().max())

vl_columns = sorted(dict.fromkeys(num_aa_miss_list_included[1].columns), key=sort_key)
num_aa_miss_vl = num_aa_miss_list_included[1][vl_columns]
num_aa_miss_vl.to_csv('{}/aa_miss_included_vl.tsv'.format(output_folder_num_miss), sep='\t', index=False)
# Largest aa_miss value across all VL columns.
print(num_aa_miss_vl.max().max())

62.0
24.0


In [16]:
# Order the columns of the "excluded" aa_miss tables with sort_key and export them.
# Column names are de-duplicated with dict.fromkeys (which keeps the table's own
# column order) rather than set(): set iteration order depends on string hashing,
# so IDs with identical sort keys could end up in a different order on every run.
vh_columns = sorted(dict.fromkeys(num_aa_miss_list_excluded[0].columns), key=sort_key)
num_aa_miss_vh = num_aa_miss_list_excluded[0][vh_columns]
num_aa_miss_vh.to_csv('{}/aa_miss_excluded_vh.tsv'.format(output_folder_num_miss), sep='\t', index=False)
# Largest aa_miss value across all VH columns.
print(num_aa_miss_vh.max().max())

vl_columns = sorted(dict.fromkeys(num_aa_miss_list_excluded[1].columns), key=sort_key)
num_aa_miss_vl = num_aa_miss_list_excluded[1][vl_columns]
num_aa_miss_vl.to_csv('{}/aa_miss_excluded_vl.tsv'.format(output_folder_num_miss), sep='\t', index=False)
# Largest aa_miss value across all VL columns.
print(num_aa_miss_vl.max().max())

62.0
24.0
