In [1]:
# Python script for analyses of "Antibody affinity birth through somatic hypermutation" publication.
# This pipeline is divided into X sections. At the beginning of each section there is a comment which indicates which figures of the publication are generated based on that section.

# input sequences for these analyses are uploaded in data folder. By a successful run, the result of each section will be saved in output folder.
print('Running...')
import re
import operator

import os
#import sys
import pandas as pd
import numpy as np

import time
import itertools
import matplotlib.pyplot as plt
import glob
#import logomaker #https://logomaker.readthedocs.io

# Functions
def display_big():

    # df = pd.DataFrame()
    # pd.options.display.max_colwidth = 2000
    pd.set_option('display.max_rows', 10)
    pd.set_option('display.max_columns', 200)
    pd.set_option('display.width', 1000)

display_big()

Running...


In [2]:
data_folder='../data'

input_folder = os.getenv('VAR_IN_FOLDER', f"{data_folder}/input")
output_folder = os.getenv('VAR_OUT_FOLDER', f"{data_folder}/output")

In [3]:
def set_output_folder(section_output):
    output_folder=data_folder+'/output/'+section_output

    if not os.path.isdir(output_folder): # make output folder if it doesn't exist
        os.makedirs(output_folder)
    return(output_folder)

In [4]:
# Section1: preparation

output_folder_prep=set_output_folder('1_prep')
output_folder_num_miss=set_output_folder('2_num_miss')
output_folder_freq_pos=set_output_folder('3_freq_per_position')
output_folder_donuts=set_output_folder('4_donuts')
output_folder_seq_logos=set_output_folder('5_seq_logos')
output_folder_rs_prep=set_output_folder('6_prep_rs')
output_folder_rs=set_output_folder('7_rs')
output_folder_prep_plots=set_output_folder('8_prep_plots')


In [5]:
del_sign='-'
ambiguity_sign='.'
aas_dic={'AAA':'K','AAC':'N','AAT':'N','AAG':'K','ACA':'T','ACC':'T','ACT':'T','ACG':'T','ATA':'I','ATC':'I',\
        'ATT':'I','ATG':'M','AGA':'R','AGC':'S','AGT':'S','AGG':'R','CAA':'Q','CAC':'H','CAT':'H','CAG':'Q',\
        'CCA':'P','CCC':'P','CCT':'P','CCG':'P','CTA':'L','CTC':'L','CTT':'L','CTG':'L','CGA':'R','CGC':'R',\
        'CGT':'R','CGG':'R','TAA':'*','TAC':'Y','TAT':'Y','TAG':'*','TCA':'S','TCC':'S','TCT':'S','TCG':'S',\
        'TTA':'L','TTC':'F','TTT':'F','TTG':'L','TGA':'*','TGC':'C','TGT':'C','TGG':'W','GAA':'E','GAC':'D',\
        'GAT':'D','GAG':'E','GCA':'A','GCC':'A','GCT':'A','GCG':'A','GTA':'V','GTC':'V','GTT':'V','GTG':'V',\
        'GGA':'G','GGC':'G','GGT':'G','GGG':'G','---':del_sign}
aas_list=['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '*', del_sign]
aas_chemistry_list=['I', 'V', 'L', 'F', 'C', 'M', 'A', 'W', 'G', 'T', 'S', 'Y', 'P', 'H', 'N', 'D', 'Q', 'E', 'K', 'R']
nts_list=['A', 'C', 'G', 'T', del_sign, ambiguity_sign]

In [6]:
modes_dic={ 1 : {'mouse' : 'B18-383', 'datasets': ['OVA', 'APC', 'CGG', 'Passenger'], 'chain':'VH'},
            2 : {'mouse' : 'HA-WT', 'datasets': ['OVA', 'APC', 'CGG', 'mix'], 'chain':'VH'},
           
            3 : {'mouse' : 'B18-383', 'datasets': ['OVA-CTLA4', 'OVA-Isotype'], 'chain':'VH'},
            4 : {'mouse' : 'HA-uMT', 'datasets': ['OVA-CTLA4', 'OVA-Isotype'], 'chain':'VH'},
            5 : {'mouse' : 'HA-WT', 'datasets': ['CGG-CTLA4', 'CGG-Isotype'], 'chain':'VH'},

            6 : {'mouse' : 'B18-383', 'datasets': ['OVA-CTLA4', 'OVA-Isotype'], 'chain':'VL'},
            7 : {'mouse' : 'HA-uMT', 'datasets': ['OVA-CTLA4', 'OVA-Isotype'], 'chain':'VL'},
            8 : {'mouse' : 'HA-WT', 'datasets': ['CGG-CTLA4', 'CGG-Isotype'], 'chain':'VL'}}

In [7]:
# Section6: Preparing data for scatter plots

modes_dic

for mode in modes_dic.keys():
    dfs=pd.read_csv('{}/dfs_expanded_aas_excluded.tsv'.format(output_folder_prep), sep='\t', header=0, low_memory=False)

    mouse=modes_dic[mode]['mouse']
    datasets=modes_dic[mode]['datasets']
    chain=modes_dic[mode]['chain']

    # if mode==1 and chain=='VL':
    #     datasets.remove('Passenger')
    #     print('passenger removed from VL dataset list', datasets)
    mouse


    if mode == 1:
        dfs=dfs.loc[
        (dfs['label']=='LateGC_B18-383_OVA_VH_-') |\
        (dfs['label']=='LateGC_B18-383_APC_VH_-') |\
        (dfs['label']=='LateGC_B18-383_CGG_VH_-') |\
        (dfs['label']=='Published_B18_Passenger_VH_-') \
        ,].copy()
        print(set(dfs['label']))

    elif mode == 2:
        dfs=dfs.loc[
        (dfs['label']=='LateGC_HA-uMT_OVA_VH_0-1') |\
        (dfs['label']=='LateGC_HA-uMT_APC_VH_0-1') |\
        (dfs['label']=='LateGC_HA-uMT_CGG_VH_0-1') |\
        (dfs['label']=='LateGC_HA-WT_mix_VH_1-1') \
        ,].copy()
        print(set(dfs['label']))
            
    elif mode == 3:
        dfs=dfs.loc[
        (dfs['label']=='LateGC_B18-383_OVA-CTLA4_VH_-') |\
        (dfs['label']=='LateGC_B18-383_OVA-Isotype_VH_-') \
        ,].copy()
        print(set(dfs['label']))

    elif mode == 4:
        dfs=dfs.loc[
        (dfs['label']=='LateGC_HA-uMT_OVA-CTLA4_VH_0-1') |\
        (dfs['label']=='LateGC_HA-uMT_OVA-Isotype_VH_0-1') \
        ,].copy()
        print(set(dfs['label']))

    elif mode == 5:
        dfs=dfs.loc[
        (dfs['label']=='LateGC_HA-WT_CGG-CTLA4_VH_1-1000') |\
        (dfs['label']=='LateGC_HA-WT_CGG-Isotype_VH_1-1000') \
        ,].copy()
        print(set(dfs['label']))    	

    elif mode == 6:
        dfs=dfs.loc[
        (dfs['label']=='LateGC_B18-383_OVA-CTLA4_VL_-') |\
        (dfs['label']=='LateGC_B18-383_OVA-Isotype_VL_-') \
        ,].copy()
        print(set(dfs['label']))
    
    elif mode == 7:
        dfs=dfs.loc[
        (dfs['label']=='LateGC_HA-uMT_OVA-CTLA4_VL_0-1') |\
        (dfs['label']=='LateGC_HA-uMT_OVA-Isotype_VL_0-1') \
        ,].copy()
        print(set(dfs['label']))
    
    elif mode == 8:
        dfs=dfs.loc[
        (dfs['label']=='LateGC_HA-WT_CGG-CTLA4_VL_1-1000') |\
        (dfs['label']=='LateGC_HA-WT_CGG-Isotype_VL_1-1000') \
        ,].copy()
        print(set(dfs['label']))    	
        
    print('Mode :', mode, mouse, chain, datasets)
        
    # dfs=my.funcs.df_clean_up(dfs)
    # dfs=dfs[dfs['aa_miss']!=0]
    dfs.reset_index(inplace=True, drop=True)
    dfs

    len_aa=int(dfs['len_aa'].values[0])
    len_aa

    ref_aa=dfs['ref_aa'].values[0]
    ref_aa

    df_aa=pd.DataFrame(columns=[i for i in range(0,len_aa)], index=aas_list)
    df_aa.fillna(np.nan, inplace=True)
    df_aa

    def update_df_aa(case, position, diff_list):

        value = { 1:1, 2:2, 3:3, 12: 4, 13: 5, 23: 6, 123: 7, 4:8, 14:9, 24:10, 34:11, 124:12, 234:13, 134:14, 1234:15, 'ref':np.nan}

        if diff_list:
            for aminoacid in diff_list:
                df_aa.loc[aminoacid,position]=value[case]

    aa_dic = dict()  #{'OVA_p0': {'A', 'S'}, 'APC_p0': {'S'}, 'CGG_p0': {'P', 'S'}}
    # dif_dic = dict() #{'OVA_aa0': ['A'], 'APC_aa0': [],  'CGG_aa0': ['P'], 'intersection_aa0': ['S']}

    for i in range(0, len_aa): #106 for B18-383, 101 for HA-uMT
        for dataset in datasets:
            # dataset='OVA'
            df_now=dfs[dfs['dataset']==dataset]
            # print(dataset, 'df length: ', len(df_now))
            aa_dic['{}_p{}'.format(dataset, i)]=set(df_now.loc[:,'A{}'.format(i)])


        A=aa_dic['{}_p{}'.format(datasets[0],i)]
        B=aa_dic['{}_p{}'.format(datasets[1],i)]
        if len(datasets)==2:
            C=set()
            D=set()
        elif len(datasets)==3:
            C=aa_dic['{}_p{}'.format(datasets[2],i)]
            D=set()
        elif len(datasets)==4:
            C=aa_dic['{}_p{}'.format(datasets[2],i)]
            D=aa_dic['{}_p{}'.format(datasets[3],i)]

        # print(A)
        # print(B)
        # print(C)
        # print(D)

        update_df_aa(1, i, list(A.difference(B, C, D)))
        update_df_aa(2, i, list(B.difference(A, C, D)))
        update_df_aa(3, i, list(C.difference(A, B, D)))
        update_df_aa(4, i, list(D.difference(A, B, C)))


        _12=A & B
        _13=A & C
        _23=B & C

        _14=A & D
        _24=B & D
        _34=C & D

        _123=A & B & C
        _124=A & B & D
        _134=A & C & D
        _234=B & C & D

        _1234=A & B & C & D

        update_df_aa(12, i, _12-_123-_124-_1234)
        update_df_aa(13, i, _13-_123-_134-_1234)
        update_df_aa(23, i, _23-_123-_234-_1234)

        update_df_aa(14, i, _14-_124-_134-_1234)
        update_df_aa(24, i, _24-_124-_234-_1234)
        update_df_aa(34, i, _34-_234-_134-_1234)

        update_df_aa(123, i, _123-_1234)
        update_df_aa(124, i, _124-_1234)
        update_df_aa(134, i, _134-_1234)
        update_df_aa(234, i, _234-_1234)

        update_df_aa(1234, i, _1234)
        update_df_aa('ref', i, list(ref_aa[i]))
        # break

    df_aa

    df_aa.to_csv('{}/df_aa_mode{}_{}.tsv'.format(output_folder_prep_plots, mode, chain), sep = '\t', index=True)

{'LateGC_B18-383_CGG_VH_-', 'Published_B18_Passenger_VH_-', 'LateGC_B18-383_APC_VH_-', 'LateGC_B18-383_OVA_VH_-'}
Mode : 1 B18-383 VH ['OVA', 'APC', 'CGG', 'Passenger']


  df_aa.fillna(np.nan, inplace=True)


{'LateGC_HA-uMT_OVA_VH_0-1', 'LateGC_HA-uMT_CGG_VH_0-1', 'LateGC_HA-uMT_APC_VH_0-1', 'LateGC_HA-WT_mix_VH_1-1'}
Mode : 2 HA-WT VH ['OVA', 'APC', 'CGG', 'mix']


  df_aa.fillna(np.nan, inplace=True)


{'LateGC_B18-383_OVA-Isotype_VH_-', 'LateGC_B18-383_OVA-CTLA4_VH_-'}
Mode : 3 B18-383 VH ['OVA-CTLA4', 'OVA-Isotype']


  df_aa.fillna(np.nan, inplace=True)


{'LateGC_HA-uMT_OVA-CTLA4_VH_0-1', 'LateGC_HA-uMT_OVA-Isotype_VH_0-1'}
Mode : 4 HA-uMT VH ['OVA-CTLA4', 'OVA-Isotype']


  df_aa.fillna(np.nan, inplace=True)


{'LateGC_HA-WT_CGG-CTLA4_VH_1-1000', 'LateGC_HA-WT_CGG-Isotype_VH_1-1000'}
Mode : 5 HA-WT VH ['CGG-CTLA4', 'CGG-Isotype']


  df_aa.fillna(np.nan, inplace=True)


{'LateGC_B18-383_OVA-Isotype_VL_-', 'LateGC_B18-383_OVA-CTLA4_VL_-'}
Mode : 6 B18-383 VL ['OVA-CTLA4', 'OVA-Isotype']


  df_aa.fillna(np.nan, inplace=True)


{'LateGC_HA-uMT_OVA-CTLA4_VL_0-1', 'LateGC_HA-uMT_OVA-Isotype_VL_0-1'}
Mode : 7 HA-uMT VL ['OVA-CTLA4', 'OVA-Isotype']


  df_aa.fillna(np.nan, inplace=True)


{'LateGC_HA-WT_CGG-CTLA4_VL_1-1000', 'LateGC_HA-WT_CGG-Isotype_VL_1-1000'}
Mode : 8 HA-WT VL ['CGG-CTLA4', 'CGG-Isotype']


  df_aa.fillna(np.nan, inplace=True)


In [8]:
'Done!'

'Done!'