In [1]:
# Python script for the analyses in the "Antibody affinity birth through somatic hypermutation" publication.
# This pipeline is divided into several sections. A comment at the beginning of each section indicates which figures of the publication are generated from that section.

# Input sequences for these analyses are provided in the data folder. After a successful run, the results of each section are saved in the output folder.
print('Running...')
import re
import operator

import os
#import sys
import pandas as pd
import numpy as np

import time
import itertools
import matplotlib.pyplot as plt
import glob
#import logomaker #https://logomaker.readthedocs.io

# Functions
def display_big():
    """Apply the notebook-wide pandas display options.

    Limits rendered frames to 10 rows while allowing up to 200 columns and a
    1000-character display width.
    """
    display_settings = {
        'display.max_rows': 10,
        'display.max_columns': 200,
        'display.width': 1000,
    }
    for option_name, option_value in display_settings.items():
        pd.set_option(option_name, option_value)


# Apply the display settings once for the whole notebook.
display_big()

Running...


In [2]:
# Root folder holding the pipeline's data.
data_folder = '../data'

# Input/output locations; each can be overridden through an environment variable
# and otherwise falls back to a sub-folder of data_folder.
input_folder = os.environ.get('VAR_IN_FOLDER', data_folder + '/input')
output_folder = os.environ.get('VAR_OUT_FOLDER', data_folder + '/output')

In [3]:
def set_output_folder(section_output):
    """Create (if missing) and return the output folder of one pipeline section.

    The folder is placed under the module-level ``output_folder``, so an
    override supplied through the ``VAR_OUT_FOLDER`` environment variable is
    honoured. With the default configuration the result is
    ``'../data/output/<section_output>'``.

    Parameters
    ----------
    section_output : str
        Name of the section sub-folder, e.g. ``'1_prep'``.

    Returns
    -------
    str
        Path of the section folder, which exists when the function returns.
    """
    section_folder = f"{output_folder}/{section_output}"
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(section_folder, exist_ok=True)
    return section_folder

In [4]:
# Section1: preparation

# One output sub-folder per pipeline section; set_output_folder creates each
# folder when it is missing and returns its path. Folders are created in the
# order listed.
(
    output_folder_prep,
    output_folder_num_miss,
    output_folder_freq_pos,
    output_folder_donuts,
    output_folder_seq_logos,
    output_folder_rs_prep,
    output_folder_rs,
) = [
    set_output_folder(section_name)
    for section_name in (
        '1_prep',
        '2_num_miss',
        '3_freq_per_position',
        '4_donuts',
        '5_seq_logos2',
        '6_prep_rs',
        '7_rs',
    )
]


In [5]:
# Section5: rs ratios
# used in Figs 4B, S5A, S5B

# Load the tab-separated R/S ratio table from the rs-prep output folder.
dfs = pd.read_csv(
    f'{output_folder_rs_prep}/dfs_rs_ratios.tsv',
    sep='\t',
    header=0,
    low_memory=False,
)
dfs

Unnamed: 0,header,seq_nt,status,mouse,dataset,chain,sub_dataset,mouse_DB,label,type,ref_nt,ref_aa,seq_aa,stopcodon,len_nt,len_aa,nt_ins,nt_dels,nt_miss,nt_N,aa_ins,aa_dels,aa_miss,aa_.,A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,A17,A18,A19,A20,A21,A22,A23,A24,A25,A26,A27,A28,A29,A30,A31,A32,A33,A34,A35,A36,A37,A38,A39,A40,A41,A42,A43,A44,A45,A46,A47,A48,A49,A50,A51,A52,A53,A54,A55,A56,A57,A58,A59,A60,A61,A62,A63,A64,A65,A66,A67,A68,A69,A70,A71,A72,A73,A74,A75,A76,A77,A78,A79,A80,A81,A82,A83,A84,A85,A86,A87,A88,A89,A90,A91,A92,A93,A94,A95,A96,A97,A98,A99,A100,A101,A102,A103,A104,A105,replacement_mut,silent_mut,R-S_check,FR1,CDR1,FR2,CDR2,FR3,CDR3,FR4
0,58A_074_L-1524392-R2-A10_L_B10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKSYLTWYQQKLGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,8,0,0,0,3,0,V,T,M,S,C,T,S,S,Q,S,L,F,N,S,G,K,Q,K,S,Y,L,T,W,Y,Q,Q,K,L,G,Q,P,P,K,V,L,I,Y,W,A,S,T,R,E,S,G,V,P,D,R,F,T,G,S,G,S,G,T,D,F,T,L,T,I,S,S,V,Q,A,E,D,L,A,V,Y,Y,C,Q,N,D,Y,S,Y,P,L,T,F,G,G,G,T,K,L,E,L,K,,,,,,,,,,,,"[0, 1, 2, 0, 0, 1, 0]","[0, 0, 2, 0, 1, 1, 0]","[0, 1, 0, 0, -1, 0, 0]",0.0,1.0,0.666667,0.0,0.0,0.5,0.0
1,51I_053_L-1350833-1362242-R-D11_L_H10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,1,0,0,0,0,0,V,T,M,S,C,T,S,S,Q,S,L,F,N,S,G,K,Q,K,N,Y,L,T,W,Y,Q,Q,K,P,G,Q,P,P,K,V,L,I,Y,W,A,S,T,R,E,S,G,V,P,D,R,F,T,G,S,G,S,G,T,D,F,T,L,T,I,S,S,V,Q,A,E,D,L,A,V,Y,Y,C,Q,N,D,Y,S,N,P,L,T,F,G,G,G,T,K,L,E,L,K,,,,,,,,,,,,"[0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 0, 0, -1, 0]",0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,57B_101_L-1524393-L-D6_L_E05,GTCACTCTGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAG...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTLSCTSSQSLFNSGEQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,2,0,0,0,2,0,V,T,L,S,C,T,S,S,Q,S,L,F,N,S,G,E,Q,K,N,Y,L,T,W,Y,Q,Q,K,P,G,Q,P,P,K,V,L,I,Y,W,A,S,T,R,E,S,G,V,P,D,R,F,T,G,S,G,S,G,T,D,F,T,L,T,I,S,S,V,Q,A,E,D,L,A,V,Y,Y,C,Q,N,D,Y,S,N,P,L,T,F,G,G,G,T,K,L,E,L,K,,,,,,,,,,,,"[1, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 0, 0, 0, 0, 0]",1.0,1.0,0.000000,0.0,0.0,0.0,0.0
3,51I_035_L-1350833-1362242-R-B7_L_F08,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,1,0,0,0,0,0,V,T,M,S,C,T,S,S,Q,S,L,F,N,S,G,K,Q,K,N,Y,L,T,W,Y,Q,Q,K,P,G,Q,P,P,K,V,L,I,Y,W,A,S,T,R,E,S,G,V,P,D,R,F,T,G,S,G,S,G,T,D,F,T,L,T,I,S,S,V,Q,A,E,D,L,A,V,Y,Y,C,Q,N,D,Y,S,N,P,L,T,F,G,G,G,T,K,L,E,L,K,,,,,,,,,,,,"[0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, -1, 0, 0]",0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,58A_094_L-1524393-L-C9_L_F12,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTAGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSRKQKNFLTWYQQKPGQPPKLLIYWASTRESGV...,False,285.0,95.0,0,0,6,0,0,0,4,0,V,T,M,S,C,T,S,S,Q,S,L,F,N,S,R,K,Q,K,N,F,L,T,W,Y,Q,Q,K,P,G,Q,P,P,K,L,L,I,Y,W,A,S,T,R,E,S,G,V,P,D,R,F,T,G,S,G,S,G,T,D,F,T,L,T,I,S,S,V,Q,A,E,D,L,A,V,Y,Y,C,Q,N,D,Y,S,Y,P,L,T,F,G,G,G,T,K,L,E,L,K,,,,,,,,,,,,"[0, 2, 1, 0, 0, 1, 0]","[0, 0, 0, 0, 1, 1, 0]","[0, 2, 1, 0, -1, 0, 0]",0.0,2.0,1.000000,0.0,0.0,0.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11401,52A_018_H-1350833-unL-1362242-L-B12_H_B03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSTYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,False,300.0,100.0,0,0,2,0,0,0,2,0,S,C,A,A,S,G,F,T,F,S,T,Y,G,M,S,W,V,R,Q,T,P,D,K,R,L,E,W,V,A,T,I,S,N,G,G,G,Y,T,Y,Y,P,D,S,V,K,G,R,F,T,I,S,R,D,N,A,K,N,T,L,Y,L,Q,M,S,S,L,K,S,E,D,S,A,M,Y,Y,C,A,R,R,E,R,Y,D,E,N,G,F,S,Y,W,G,Q,G,T,L,V,T,V,S,A,,,,,,,"[0, 1, 0, 0, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 1, 0]",0.0,1.0,0.000000,0.0,0.0,1.0,0.0
11402,53A_021_H-1524392-R-C5_H_E03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGDTYTSCPDSVKG...,False,300.0,100.0,0,0,7,0,0,0,5,0,S,C,A,A,S,G,F,T,F,S,S,Y,G,M,S,W,V,R,Q,T,P,D,K,R,L,E,W,V,A,T,I,S,N,G,D,T,Y,T,S,C,P,D,S,V,K,G,R,F,T,I,S,R,D,N,A,K,N,T,L,Y,L,Q,M,S,S,L,K,S,E,D,S,A,M,Y,Y,C,A,R,R,E,R,Y,D,D,N,G,F,A,Y,W,G,Q,G,T,L,V,T,V,S,A,,,,,,,"[0, 0, 0, 3, 2, 1, 0]","[0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 3, 2, 0, 0]",0.0,0.0,0.000000,3.0,2.0,0.5,0.0
11403,53A_022_H-1524392-R-C7_H_F03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTFYPDTVKG...,False,300.0,100.0,0,0,3,0,0,0,3,0,S,C,A,A,S,G,F,T,F,S,S,Y,G,M,S,W,V,R,Q,T,P,D,K,R,L,E,W,V,A,T,I,S,N,G,G,G,Y,T,F,Y,P,D,T,V,K,G,R,F,T,I,S,R,D,N,A,K,N,T,L,Y,L,Q,M,S,S,L,K,S,E,D,S,A,M,Y,Y,C,A,R,R,G,R,Y,D,E,N,G,F,A,Y,W,G,Q,G,T,L,V,T,V,S,A,,,,,,,"[0, 0, 0, 0, 2, 1, 0]","[0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 2, 1, 0]",0.0,0.0,0.000000,0.0,2.0,1.0,0.0
11404,53A_016_H-1524392-R-B10_H_H02,TCCTGTGCAGCCTCTGGAATCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGITFSSYGMSWVRQTPDKRLEWVATISNGGGSTYYPDSVKG...,False,300.0,100.0,0,0,4,0,0,0,3,0,S,C,A,A,S,G,I,T,F,S,S,Y,G,M,S,W,V,R,Q,T,P,D,K,R,L,E,W,V,A,T,I,S,N,G,G,G,S,T,Y,Y,P,D,S,V,K,G,R,F,T,I,S,R,D,N,A,K,N,T,L,Y,L,Q,M,S,S,L,K,S,E,D,S,A,M,Y,Y,C,A,S,R,E,R,Y,D,E,N,G,F,A,Y,W,G,Q,G,T,L,V,T,V,S,A,,,,,,,"[0, 1, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 1, 0, 0]","[0, 1, 0, 1, -1, 1, 0]",0.0,1.0,0.000000,1.0,0.0,1.0,0.0


In [6]:
# Show the distinct values present in the 'mouse' column of the loaded table.
set(dfs['mouse'])

{'B18', 'B18-383', 'HA-WT', 'HA-uMT'}

In [7]:
# For every (status, mouse, dataset, chain, sub_dataset) group, compute the mean
# of each region column, repeat each region's mean a fixed number of times, and
# write the resulting single-row table to the rs output folder.

# Region columns averaged per group, in output order.
REGION_COLUMNS = ['FR1', 'CDR1', 'FR2', 'CDR2', 'FR3', 'CDR3', 'FR4']

# How many times each region's mean is repeated, in REGION_COLUMNS order.
# NOTE(review): these look like per-region lengths in amino-acid positions
# (their sums match the len_aa values shown in the table above) - confirm.
VH_REPEATS_BY_MOUSE = {
    'HA-uMT': [5, 8, 17, 8, 38, 13, 11],
    'HA-WT': [5, 8, 17, 8, 38, 13, 11],
    'B18-383': [11, 8, 17, 8, 38, 13, 11],
    'B18': [11, 8, 17, 8, 38, 13, 11],
}
VL_REPEATS = [8, 12, 17, 3, 36, 9, 10]

grouping = dfs.groupby(by=['status', 'mouse', 'dataset', 'chain', 'sub_dataset'])[REGION_COLUMNS]

for grouped, df in grouping:
    print(grouped)
    label = '_'.join(grouped)
    mouse, chain = grouped[1], grouped[3]

    # Pick the repeat counts for this group. Unknown mouse/chain values fail
    # loudly instead of silently reusing the counts of the previous group.
    if chain == 'VH':
        if mouse not in VH_REPEATS_BY_MOUSE:
            raise ValueError(
                "Something wrong! No VH region repeats defined for mouse {!r} (group {})".format(mouse, grouped)
            )
        repeats = VH_REPEATS_BY_MOUSE[mouse]
    elif chain == 'VL':
        repeats = VL_REPEATS
    else:
        raise ValueError(
            "Something wrong! Unexpected chain {!r} (group {})".format(chain, grouped)
        )

    # Single-row frame holding the mean of every region column.
    df_mean = df.mean().to_frame().T

    # zip() would silently drop regions if the two lengths ever diverged.
    if len(repeats) != len(df_mean.columns):
        raise ValueError(
            "Something wrong! Expected {} repeat counts, got {} (group {})".format(
                len(df_mean.columns), len(repeats), grouped
            )
        )

    # Repeat each region's mean column according to its repeat count.
    columns = [
        df_mean[col]
        for col, repeat in zip(df_mean.columns, repeats)
        for _ in range(repeat)
    ]

    # Concatenate the repeated columns side by side into one wide row.
    repeated_df = pd.concat(columns, axis=1)

    # Save the expanded row for this group.
    repeated_df.to_csv('{}/rs_mean_repeated_{}.tsv'.format(output_folder_rs, label), sep='\t', index=False)

('EarlyGC', 'B18-383', 'APC', 'VH', '-')
('EarlyGC', 'B18-383', 'APC', 'VL', '-')
('EarlyGC', 'B18-383', 'CGG', 'VH', '-')
('EarlyGC', 'B18-383', 'CGG', 'VL', '-')
('EarlyGC', 'B18-383', 'OVA', 'VH', '-')
('EarlyGC', 'B18-383', 'OVA', 'VL', '-')
('EarlyGC', 'HA-uMT', 'APC', 'VH', '0-1')
('EarlyGC', 'HA-uMT', 'APC', 'VL', '0-1')
('EarlyGC', 'HA-uMT', 'CGG', 'VH', '0-1')
('EarlyGC', 'HA-uMT', 'CGG', 'VL', '0-1')
('EarlyGC', 'HA-uMT', 'OVA', 'VH', '0-1')
('EarlyGC', 'HA-uMT', 'OVA', 'VL', '0-1')
('LateGC', 'B18-383', 'APC', 'VH', '-')
('LateGC', 'B18-383', 'APC', 'VL', '-')
('LateGC', 'B18-383', 'CGG', 'VH', '-')
('LateGC', 'B18-383', 'CGG', 'VL', '-')
('LateGC', 'B18-383', 'OVA', 'VH', '-')
('LateGC', 'B18-383', 'OVA', 'VL', '-')
('LateGC', 'B18-383', 'OVA-CTLA4', 'VH', '-')
('LateGC', 'B18-383', 'OVA-CTLA4', 'VL', '-')
('LateGC', 'B18-383', 'OVA-Isotype', 'VH', '-')
('LateGC', 'B18-383', 'OVA-Isotype', 'VL', '-')
('LateGC', 'HA-WT', 'APC', 'VH', '1-1')
('LateGC', 'HA-WT', 'APC', 'VH', '

In [8]:
# Completion marker: printed once execution reaches this cell.
print("Done!")

Done!
