In [1]:
# Python script for analyses of "Antibody affinity birth through somatic hypermutation" publication.
# This pipeline is divided into several sections. At the beginning of each section, a comment indicates which figures of the publication are generated by that section.

# Input sequences for these analyses are provided in the data folder. After a successful run, the results of each section are saved in the output folder.
print('Running...')
import re
import operator

import os
#import sys
import pandas as pd
import numpy as np

import time
import itertools
import matplotlib.pyplot as plt
import glob
#import logomaker #https://logomaker.readthedocs.io

# Functions
def display_big():
    """Raise pandas' display limits so wide result tables stay readable.

    Sets three global pandas options: ``display.max_rows`` (30),
    ``display.max_columns`` (500) and ``display.width`` (1000).
    Returns None.
    """
    wide_display_options = (
        ('display.max_rows', 30),
        ('display.max_columns', 500),
        ('display.width', 1000),
    )
    for option_name, option_value in wide_display_options:
        pd.set_option(option_name, option_value)


# Apply the display settings once for the rest of the notebook.
display_big()

Running...


In [2]:
# Root data directory, given relative to the working directory.
data_folder = '../data'

# Input/output locations: the VAR_IN_FOLDER / VAR_OUT_FOLDER environment
# variables take precedence over the defaults below data_folder.
input_folder = os.environ.get('VAR_IN_FOLDER', f"{data_folder}/input")
output_folder = os.environ.get('VAR_OUT_FOLDER', f"{data_folder}/output")

In [3]:
def set_output_folder(section_output):
    """Return the output directory of one pipeline section, creating it if needed.

    Parameters
    ----------
    section_output : str
        Name of the section's sub-folder (e.g. ``'4_donuts'``).

    Returns
    -------
    str
        ``data_folder + '/output/' + section_output``.

    NOTE(review): the path is built from ``data_folder`` directly, so the
    ``VAR_OUT_FOLDER`` override stored in the module-level ``output_folder``
    is not used here -- confirm whether that is intended.
    """
    # The local name deliberately differs from the module-level
    # `output_folder` so that configured value is not shadowed here.
    section_folder = data_folder + '/output/' + section_output

    # exist_ok=True creates missing parents and tolerates an already existing
    # directory, replacing the separate check-then-create step.
    os.makedirs(section_folder, exist_ok=True)
    return section_folder

In [4]:
# Section 1: preparation -- resolve the output folders of sections 1-4
# (set_output_folder creates each one if it is missing).
(
    output_folder_prep,
    output_folder_num_miss,
    output_folder_freq_pos,
    output_folder_donuts,
) = (
    set_output_folder(section_name)
    for section_name in ('1_prep', '2_num_miss', '3_freq_per_position', '4_donuts')
)

In [5]:
# Load the VH table written to the '2_num_miss' output folder.
nt_miss_included_vh = (
    pd.read_csv(
        '{}/nt_miss_included_vh.tsv'.format(output_folder_num_miss),
        sep='\t',
        header=0,
        low_memory=False,
    )
    .reset_index(drop=True)
)

# Same for the VL table.
nt_miss_included_vl = (
    pd.read_csv(
        '{}/nt_miss_included_vl.tsv'.format(output_folder_num_miss),
        sep='\t',
        header=0,
        low_memory=False,
    )
    .reset_index(drop=True)
)

In [6]:
import pandas as pd

def num_nt_miss_donuts(df, bin_width=5, max_value=100):
    """Count, for every column of ``df``, how many values fall into each bin.

    The bins are ``[0, 1)`` followed by consecutive half-open intervals of
    ``bin_width`` integers starting at 1. With the defaults this gives the
    rows ``'0'``, ``'1-5'``, ``'6-10'``, ..., ``'96-100'``. Row labels are
    derived from the bin edges, so the two cannot get out of sync.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table; each column is binned independently and may contain NaN.
    bin_width : int, default 5
        Number of integer values covered by each bin after the ``'0'`` bin.
    max_value : int, default 100
        Bin edges are generated up to ``max_value + 1``.

    Returns
    -------
    pandas.DataFrame
        One row per bin plus a final ``'num_seqs'`` row holding the number of
        non-null values of each column; columns follow ``df.columns``. A
        frame without columns yields the same rows and no columns.

    Raises
    ------
    ValueError
        If ``bin_width`` is smaller than 1 or ``max_value`` is negative.

    Notes
    -----
    Values outside the bin range (negative, or at/above the last edge) are
    not counted in any bin but still count towards ``'num_seqs'``, so the bin
    rows of a column can sum to less than its ``'num_seqs'`` entry.
    The per-column count is also printed as ``<column> : <count>``.
    """
    if bin_width < 1:
        raise ValueError('bin_width must be a positive integer')
    if max_value < 0:
        raise ValueError('max_value must be non-negative')

    # Edges 0, 1, 1 + bin_width, ...; with right=False every bin is [low, high).
    edges = [0] + list(range(1, max_value + 2, bin_width))
    labels = [
        str(low) if high - low == 1 else '{}-{}'.format(low, high - 1)
        for low, high in zip(edges[:-1], edges[1:])
    ]

    # Collect complete columns first and build the frame once, instead of
    # inserting columns one at a time and enlarging the frame afterwards.
    columns = {}
    for column in df.columns:
        values = df[column]
        binned = pd.cut(values, bins=edges, labels=labels, right=False)
        num_sequences = values.count()  # non-null entries only
        print(column, ':', num_sequences)
        # observed=False keeps empty bins so every column has one count per label.
        per_bin = binned.groupby(binned, observed=False).count()
        columns[column] = list(per_bin) + [num_sequences]

    return pd.DataFrame(columns, index=labels + ['num_seqs'])


In [7]:
# Preview the VH table loaded above; the display is truncated by the pandas
# options set in display_big(). Columns hold different numbers of non-null
# values (presumably one column per sample and one row per sequence -- TODO confirm).
nt_miss_included_vh

Unnamed: 0,Unimmunized_B18-383_SPL_VH_reps,Unimmunized_B18-383_PP_VH_reps,Unimmunized_B18-383_MLN_VH_reps,Unimmunized_HA-uMT_SPL_VH_reps,Unimmunized_HA-uMT_PP_VH_reps,Unimmunized_HA-uMT_MLN_VH_reps,EarlyGC_B18-383_OVA_VH_-,EarlyGC_B18-383_APC_VH_-,EarlyGC_B18-383_CGG_VH_-,EarlyGC_HA-uMT_OVA_VH_0-1,EarlyGC_HA-uMT_APC_VH_0-1,EarlyGC_HA-uMT_CGG_VH_0-1,LateGC_B18-383_OVA_VH_-,LateGC_B18-383_APC_VH_-,LateGC_B18-383_CGG_VH_-,LateGC_B18-383_OVA-CTLA4_VH_-,LateGC_B18-383_OVA-Isotype_VH_-,LateGC_HA-uMT_OVA_VH_0-1,LateGC_HA-uMT_APC_VH_0-1,LateGC_HA-uMT_CGG_VH_0-1,LateGC_HA-uMT_OVA-CTLA4_VH_0-1,LateGC_HA-uMT_OVA-Isotype_VH_0-1,LateGC_HA-WT_OVA_VH_1-1,LateGC_HA-WT_OVA_VH_1-100,LateGC_HA-WT_OVA_VH_1-1000,LateGC_HA-WT_APC_VH_1-1,LateGC_HA-WT_APC_VH_1-100,LateGC_HA-WT_APC_VH_1-1000,LateGC_HA-WT_CGG_VH_1-1,LateGC_HA-WT_CGG_VH_1-100,LateGC_HA-WT_CGG_VH_1-1000,LateGC_HA-WT_CGG-CTLA4_VH_1-1000,LateGC_HA-WT_CGG-Isotype_VH_1-1000,LateGC_HA-WT_mix_VH_1-1,Published_B18_Passenger_VH_-
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,11.0,5.0,2.0,2.0,2.0,8.0,7.0,11.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,4.0,13.0,8.0,3.0,5.0,8.0,10.0,9.0,14.0,5.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,6.0,13.0,9.0,3.0,10.0,9.0,10.0,10.0,14.0,6.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0,0.0,6.0,13.0,10.0,3.0,10.0,9.0,10.0,12.0,14.0,6.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,3.0,1.0,0.0,6.0,13.0,11.0,4.0,11.0,9.0,11.0,17.0,15.0,6.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11778,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,49
11779,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,52
11780,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,57
11781,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,63


In [8]:
# Bin every column of the VH table, save the binned counts as TSV, and show them.
print('VH:')
donuts_vh = num_nt_miss_donuts(nt_miss_included_vh)
donuts_vh.to_csv(
    '{}/donuts_vh.tsv'.format(output_folder_donuts),
    sep='\t',
    index=True,
)
donuts_vh

VH:
Unimmunized_B18-383_SPL_VH_reps : 9798
Unimmunized_B18-383_PP_VH_reps : 3986
Unimmunized_B18-383_MLN_VH_reps : 690
Unimmunized_HA-uMT_SPL_VH_reps : 10971
Unimmunized_HA-uMT_PP_VH_reps : 4405
Unimmunized_HA-uMT_MLN_VH_reps : 10268
EarlyGC_B18-383_OVA_VH_- : 38
EarlyGC_B18-383_APC_VH_- : 108
EarlyGC_B18-383_CGG_VH_- : 46
EarlyGC_HA-uMT_OVA_VH_0-1 : 44
EarlyGC_HA-uMT_APC_VH_0-1 : 73
EarlyGC_HA-uMT_CGG_VH_0-1 : 33
LateGC_B18-383_OVA_VH_- : 141
LateGC_B18-383_APC_VH_- : 95
LateGC_B18-383_CGG_VH_- : 48
LateGC_B18-383_OVA-CTLA4_VH_- : 122
LateGC_B18-383_OVA-Isotype_VH_- : 155
LateGC_HA-uMT_OVA_VH_0-1 : 70
LateGC_HA-uMT_APC_VH_0-1 : 28
LateGC_HA-uMT_CGG_VH_0-1 : 45
LateGC_HA-uMT_OVA-CTLA4_VH_0-1 : 60
LateGC_HA-uMT_OVA-Isotype_VH_0-1 : 54
LateGC_HA-WT_OVA_VH_1-1 : 37
LateGC_HA-WT_OVA_VH_1-100 : 90
LateGC_HA-WT_OVA_VH_1-1000 : 25
LateGC_HA-WT_APC_VH_1-1 : 29
LateGC_HA-WT_APC_VH_1-100 : 79
LateGC_HA-WT_APC_VH_1-1000 : 27
LateGC_HA-WT_CGG_VH_1-1 : 48
LateGC_HA-WT_CGG_VH_1-100 : 66
LateGC_HA-WT

Unnamed: 0,Unimmunized_B18-383_SPL_VH_reps,Unimmunized_B18-383_PP_VH_reps,Unimmunized_B18-383_MLN_VH_reps,Unimmunized_HA-uMT_SPL_VH_reps,Unimmunized_HA-uMT_PP_VH_reps,Unimmunized_HA-uMT_MLN_VH_reps,EarlyGC_B18-383_OVA_VH_-,EarlyGC_B18-383_APC_VH_-,EarlyGC_B18-383_CGG_VH_-,EarlyGC_HA-uMT_OVA_VH_0-1,EarlyGC_HA-uMT_APC_VH_0-1,EarlyGC_HA-uMT_CGG_VH_0-1,LateGC_B18-383_OVA_VH_-,LateGC_B18-383_APC_VH_-,LateGC_B18-383_CGG_VH_-,LateGC_B18-383_OVA-CTLA4_VH_-,LateGC_B18-383_OVA-Isotype_VH_-,LateGC_HA-uMT_OVA_VH_0-1,LateGC_HA-uMT_APC_VH_0-1,LateGC_HA-uMT_CGG_VH_0-1,LateGC_HA-uMT_OVA-CTLA4_VH_0-1,LateGC_HA-uMT_OVA-Isotype_VH_0-1,LateGC_HA-WT_OVA_VH_1-1,LateGC_HA-WT_OVA_VH_1-100,LateGC_HA-WT_OVA_VH_1-1000,LateGC_HA-WT_APC_VH_1-1,LateGC_HA-WT_APC_VH_1-100,LateGC_HA-WT_APC_VH_1-1000,LateGC_HA-WT_CGG_VH_1-1,LateGC_HA-WT_CGG_VH_1-100,LateGC_HA-WT_CGG_VH_1-1000,LateGC_HA-WT_CGG-CTLA4_VH_1-1000,LateGC_HA-WT_CGG-Isotype_VH_1-1000,LateGC_HA-WT_mix_VH_1-1,Published_B18_Passenger_VH_-
0,9723,3942,675,10907,4339,10042,1,14,1,3,13,0,0,0,0,0,0,0,0,0,0,1,11,54,9,7,45,4,12,35,13,1,1,30,5322
1-5,36,7,1,58,20,71,10,37,9,16,42,2,0,1,6,2,1,0,0,0,2,0,18,17,8,12,23,11,30,18,16,24,14,60,3971
6-10,5,4,2,2,14,13,6,15,9,15,17,11,0,3,4,2,11,4,3,0,33,28,6,8,3,8,9,5,4,7,5,1,3,18,1277
11-15,6,4,2,1,3,9,10,21,13,10,1,16,21,18,6,12,35,23,1,6,16,10,1,6,5,0,2,2,2,4,13,12,0,3,618
16-20,9,8,3,0,0,7,6,11,9,0,0,4,42,26,7,19,61,20,3,20,7,14,0,5,0,1,0,4,0,0,7,29,1,1,332
21-25,8,8,1,2,1,14,5,6,3,0,0,0,32,31,14,38,32,17,5,16,2,1,0,0,0,0,0,1,0,1,2,24,8,0,150
26-30,6,4,1,1,8,57,0,4,1,0,0,0,32,14,5,36,13,4,5,2,0,0,1,0,0,1,0,0,0,0,1,14,3,2,54
31-35,3,5,3,0,18,37,0,0,0,0,0,0,13,1,6,8,2,2,5,1,0,0,0,0,0,0,0,0,0,1,3,4,0,0,35
36-40,2,3,1,0,2,14,0,0,1,0,0,0,1,1,0,4,0,0,2,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,13
41-45,0,1,1,0,0,4,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3


In [9]:
# Row totals across all VH columns: the number of values per bin, plus the
# overall non-null count in the 'num_seqs' row.
donuts_vh.sum(axis=1)

0           45205
1-5          4543
6-10         1555
11-15         912
16-20         656
21-25         422
26-30         269
31-35         147
36-40          47
41-45          13
46-50           5
51-55           1
56-60           2
61-65           1
66-70           0
71-75           0
76-80           0
81-85           0
86-90           0
91-95           1
96-100          0
num_seqs    53779
dtype: int64

In [10]:
# Bin every column of the VL table, save the binned counts as TSV, and show them.
print('VL:')
donuts_vl = num_nt_miss_donuts(nt_miss_included_vl)
donuts_vl.to_csv(
    '{}/donuts_vl.tsv'.format(output_folder_donuts),
    sep='\t',
    index=True,
)
donuts_vl

VL:
Unimmunized_B18-383_SPL_VL_reps : 9779
Unimmunized_B18-383_PP_VL_reps : 3968
Unimmunized_B18-383_MLN_VL_reps : 683
Unimmunized_HA-uMT_SPL_VL_reps : 10971
Unimmunized_HA-uMT_PP_VL_reps : 4405
Unimmunized_HA-uMT_MLN_VL_reps : 10268
EarlyGC_B18-383_OVA_VL_- : 77
EarlyGC_B18-383_APC_VL_- : 89
EarlyGC_B18-383_CGG_VL_- : 41
EarlyGC_HA-uMT_OVA_VL_0-1 : 67
EarlyGC_HA-uMT_APC_VL_0-1 : 97
EarlyGC_HA-uMT_CGG_VL_0-1 : 58
LateGC_B18-383_OVA_VL_- : 214
LateGC_B18-383_APC_VL_- : 111
LateGC_B18-383_CGG_VL_- : 77
LateGC_B18-383_OVA-CTLA4_VL_- : 174
LateGC_B18-383_OVA-Isotype_VL_- : 283
LateGC_HA-uMT_OVA_VL_0-1 : 171
LateGC_HA-uMT_APC_VL_0-1 : 72
LateGC_HA-uMT_CGG_VL_0-1 : 107
LateGC_HA-uMT_OVA-CTLA4_VL_0-1 : 75
LateGC_HA-uMT_OVA-Isotype_VL_0-1 : 78
LateGC_HA-WT_OVA_VL_1-1 : 62
LateGC_HA-WT_OVA_VL_1-100 : 189
LateGC_HA-WT_OVA_VL_1-1000 : 70
LateGC_HA-WT_APC_VL_1-1 : 62
LateGC_HA-WT_APC_VL_1-100 : 154
LateGC_HA-WT_APC_VL_1-1000 : 56
LateGC_HA-WT_CGG_VL_1-1 : 93
LateGC_HA-WT_CGG_VL_1-100 : 101
LateGC_

Unnamed: 0,Unimmunized_B18-383_SPL_VL_reps,Unimmunized_B18-383_PP_VL_reps,Unimmunized_B18-383_MLN_VL_reps,Unimmunized_HA-uMT_SPL_VL_reps,Unimmunized_HA-uMT_PP_VL_reps,Unimmunized_HA-uMT_MLN_VL_reps,EarlyGC_B18-383_OVA_VL_-,EarlyGC_B18-383_APC_VL_-,EarlyGC_B18-383_CGG_VL_-,EarlyGC_HA-uMT_OVA_VL_0-1,EarlyGC_HA-uMT_APC_VL_0-1,EarlyGC_HA-uMT_CGG_VL_0-1,LateGC_B18-383_OVA_VL_-,LateGC_B18-383_APC_VL_-,LateGC_B18-383_CGG_VL_-,LateGC_B18-383_OVA-CTLA4_VL_-,LateGC_B18-383_OVA-Isotype_VL_-,LateGC_HA-uMT_OVA_VL_0-1,LateGC_HA-uMT_APC_VL_0-1,LateGC_HA-uMT_CGG_VL_0-1,LateGC_HA-uMT_OVA-CTLA4_VL_0-1,LateGC_HA-uMT_OVA-Isotype_VL_0-1,LateGC_HA-WT_OVA_VL_1-1,LateGC_HA-WT_OVA_VL_1-100,LateGC_HA-WT_OVA_VL_1-1000,LateGC_HA-WT_APC_VL_1-1,LateGC_HA-WT_APC_VL_1-100,LateGC_HA-WT_APC_VL_1-1000,LateGC_HA-WT_CGG_VL_1-1,LateGC_HA-WT_CGG_VL_1-100,LateGC_HA-WT_CGG_VL_1-1000,LateGC_HA-WT_CGG-CTLA4_VL_1-1000,LateGC_HA-WT_CGG-Isotype_VL_1-1000,LateGC_HA-WT_mix_VL_1-1
0,9744,3939,674,10939,4358,10089,15,1,2,18,44,2,12,7,18,6,10,2,1,0,1,3,20,108,23,24,78,18,42,54,40,16,6,86
1-5,30,19,6,29,20,52,56,67,25,35,45,14,123,71,30,81,189,41,2,8,34,35,33,54,37,27,67,31,43,39,57,48,25,103
6-10,4,9,2,1,17,26,6,20,5,14,6,34,75,31,16,61,74,73,15,21,34,35,7,18,8,9,9,6,6,5,21,52,22,22
11-15,1,1,1,1,9,83,0,1,4,0,1,8,4,2,7,18,9,45,28,64,5,4,2,7,2,2,0,1,2,3,8,53,4,6
16-20,0,0,0,0,0,17,0,0,2,0,1,0,0,0,6,2,1,10,22,14,1,1,0,2,0,0,0,0,0,0,2,7,0,0
21-25,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0
26-30,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
31-35,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36-40,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
41-45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
# Row totals across all VL columns: the number of values per bin, plus the
# overall non-null count in the 'num_seqs' row.
donuts_vl.sum(axis=1)

0           40400
1-5          1576
6-10          764
11-15         386
16-20          88
21-25           9
26-30           4
31-35           5
36-40           0
41-45           0
46-50           0
51-55           0
56-60           0
61-65           0
66-70           0
71-75           0
76-80           0
81-85           0
86-90           0
91-95           0
96-100          0
num_seqs    43232
dtype: int64