In [1]:
# Python script for the analyses in the publication "Antibody affinity birth through somatic hypermutation".
# This pipeline is divided into several sections. At the beginning of each section there is a comment indicating which figures of the publication are generated by that section.

# The input sequences for these analyses are provided in the data folder. After a successful run, the result of each section is saved in the output folder.
print('Running...')
import re
import operator

import os
#import sys
import pandas as pd
import numpy as np

import time
import itertools
import matplotlib.pyplot as plt
import glob
import logomaker #https://logomaker.readthedocs.io

# Functions
def display_big():
    """Configure how pandas prints DataFrames for the rest of the session.

    Sets the row limit to 10, the column limit to 500 and the total display
    width to 1000 characters. The options are global pandas settings, so the
    call affects every DataFrame shown afterwards. Returns None.
    """
    display_options = {
        'display.max_rows': 10,
        'display.max_columns': 500,
        'display.width': 1000,
    }
    for option_name, option_value in display_options.items():
        pd.set_option(option_name, option_value)


# Apply the display settings once, right after the imports.
display_big()

Running...


In [2]:
# Root of the data tree, relative to the notebook's working directory.
data_folder = '../data'

# Input/output locations: the environment variables VAR_IN_FOLDER and
# VAR_OUT_FOLDER take precedence; otherwise the sub-folders of data_folder
# are used.
input_folder = os.environ.get('VAR_IN_FOLDER', data_folder + '/input')
output_folder = os.environ.get('VAR_OUT_FOLDER', data_folder + '/output')

In [3]:
def set_output_folder(section_output, base_folder=None):
    """Return the output folder of one pipeline section, creating it if needed.

    Parameters
    ----------
    section_output : str
        Name of the section sub-folder, e.g. ``'1_prep'``.
    base_folder : str or path-like, optional
        Folder under which the section folder is created. Defaults to
        ``data_folder + '/output'``, which is the location this function has
        always used.

    Returns
    -------
    str
        ``'<base_folder>/<section_output>'``.

    Raises
    ------
    FileExistsError
        If the target path exists but is not a directory.
    """
    if base_folder is None:
        # NOTE(review): the module-level `output_folder` (which honours the
        # VAR_OUT_FOLDER environment variable) is not consulted here; the
        # historical data_folder-based location is kept so existing runs
        # resolve to the same paths. Confirm whether the override should apply.
        base_folder = data_folder + '/output'

    section_folder = '{}/{}'.format(base_folder, section_output)

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(section_folder, exist_ok=True)
    return section_folder

In [4]:
# Section 1: preparation
# Resolve (and create when missing) the output folder of every pipeline
# section, in pipeline order.
(
    output_folder_prep,
    output_folder_num_miss,
    output_folder_freq_pos,
    output_folder_donuts,
    output_folder_seq_logos,
) = [
    set_output_folder(section_name)
    for section_name in (
        '1_prep',
        '2_num_miss',
        '3_freq_per_position',
        '4_donuts',
        '5_seq_logos',
    )
]

In [5]:
# Load the tab-separated sequence table from the '1_prep' output folder
# (presumably written by the preparation step - confirm) and give it a
# fresh 0..n-1 index.
dfs_expanded_aas_included = pd.read_csv(
    f'{output_folder_prep}/dfs_expanded_aas_included.tsv',
    sep='\t',
    header=0,
    low_memory=False,
).reset_index(drop=True)

In [6]:
def seq_logo_plot(df, output_folder, ID, dpi=1200):
    """Draw a sequence logo for one group and save it as ``<output_folder>/<ID>.jpg``.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix handed unchanged to ``logomaker.Logo`` (in this notebook, the
        frame returned by ``seq_logo_prep``).
    output_folder : str
        Directory the JPEG is written to; it must already exist.
    ID : str
        Group identifier. It is used as the file name and also selects the
        y-axis range (see below).
    dpi : int, optional
        Resolution forwarded to ``Figure.savefig``. The default of 1200 is
        the value the original call intended but accidentally passed to
        ``str.format`` instead of ``savefig``, where it was ignored. With
        the 20x2 inch figure this yields large images; pass a smaller value
        for quick previews.

    Notes
    -----
    ``plt.rc('font', size=20)`` changes matplotlib's global font size, so it
    also affects figures created after this call.
    """
    # Pick the y-axis range for this group. IDs whose first '_'-separated
    # field is 'Unimmunized' and two specific datasets get a tighter axis;
    # everything else uses the full 0-1 range.
    if ID.split('_')[0] == 'Unimmunized':
        y_max, y_ticks = 0.006, [0.003, 0.006]
    elif ID in ('LateGC_HA-WT_mix_VH_1-1_aa', 'Published_B18_Passenger_VH_-_aa'):
        y_max, y_ticks = 0.4, [0.2, 0.4]
    else:
        y_max, y_ticks = 1.0, [0.2, 0.4, 0.6, 0.8]

    plt.rc('font', size=20)
    fig, mxx = plt.subplots(figsize=(20, 2))
    try:
        nn_logo = logomaker.Logo(df, color_scheme='chemistry', ax=mxx)

        # Style using Logo methods: show all borders, and limit the left
        # spine to the plotted range.
        nn_logo.style_spines(visible=True)
        nn_logo.style_spines(spines=['left'], visible=True, bounds=[0, y_max])

        # Style using Axes methods: no x ticks, bold y tick labels.
        nn_logo.ax.set_xticks([])
        nn_logo.ax.set_ylim([0, y_max])
        nn_logo.ax.set_yticks(y_ticks)
        for tick_label in nn_logo.ax.get_yticklabels():
            tick_label.set_fontweight('bold')

        print('------------ID',ID)
        print(ID)
        # dpi must be an argument of savefig, not of str.format.
        fig.savefig('{}/{}.jpg'.format(output_folder, ID), dpi=dpi)
    finally:
        # Always release the figure, even when saving fails, so that a loop
        # over many groups does not accumulate open figures. No sleep/pause is
        # needed once the figure has been written and closed.
        plt.close(fig)

In [7]:
# Single-character markers used in the sequences: del_sign marks a deleted
# position, ambiguity_sign marks a position that seq_logo_prep skips.
del_sign = '-'
ambiguity_sign = '.'

# Codon -> one-letter amino-acid code ('*' for the TAA/TAG/TGA codons).
# One row per pair of leading bases; a fully deleted codon maps to del_sign.
aas_dic = {
    'AAA': 'K', 'AAC': 'N', 'AAT': 'N', 'AAG': 'K',
    'ACA': 'T', 'ACC': 'T', 'ACT': 'T', 'ACG': 'T',
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'AGA': 'R', 'AGC': 'S', 'AGT': 'S', 'AGG': 'R',
    'CAA': 'Q', 'CAC': 'H', 'CAT': 'H', 'CAG': 'Q',
    'CCA': 'P', 'CCC': 'P', 'CCT': 'P', 'CCG': 'P',
    'CTA': 'L', 'CTC': 'L', 'CTT': 'L', 'CTG': 'L',
    'CGA': 'R', 'CGC': 'R', 'CGT': 'R', 'CGG': 'R',
    'TAA': '*', 'TAC': 'Y', 'TAT': 'Y', 'TAG': '*',
    'TCA': 'S', 'TCC': 'S', 'TCT': 'S', 'TCG': 'S',
    'TTA': 'L', 'TTC': 'F', 'TTT': 'F', 'TTG': 'L',
    'TGA': '*', 'TGC': 'C', 'TGT': 'C', 'TGG': 'W',
    'GAA': 'E', 'GAC': 'D', 'GAT': 'D', 'GAG': 'E',
    'GCA': 'A', 'GCC': 'A', 'GCT': 'A', 'GCG': 'A',
    'GTA': 'V', 'GTC': 'V', 'GTT': 'V', 'GTG': 'V',
    'GGA': 'G', 'GGC': 'G', 'GGT': 'G', 'GGG': 'G',
    '---': del_sign,
}

# Column order of the amino-acid matrices: the 20 residues alphabetically,
# then '*', the deletion marker and the ambiguity marker.
aas_list = list('ACDEFGHIKLMNPQRSTVWY*') + [del_sign, ambiguity_sign]

# The 20 residues in an alternative order (presumably grouped by chemical
# property, judging by the name - confirm before relying on it).
aas_chemistry_list = list('IVLFCMAWGTSYPHNDQEKR')

# Column order of the nucleotide matrices: the four bases plus the deletion marker.
nts_list = list('ACGT') + [del_sign]

In [8]:
# dfs_expanded_aas_included = dfs_expanded_aas_included[dfs_expanded_aas_included['label']=='EarlyGC_B18-383_CGG_VH_-'].copy()
# len(dfs_expanded_aas_included)

In [9]:
def seq_logo_prep(df, seq_type):
    """Build the per-position substitution-frequency matrix used for a sequence logo.

    Every sequence in column ``seq_<seq_type>`` is compared, position by
    position, with the reference taken from the first row of column
    ``ref_<seq_type>``. Characters equal to the reference, and the
    ambiguity marker, are not counted.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sequence; must contain ``ref_<seq_type>`` and
        ``seq_<seq_type>``. The frame is not modified.
    seq_type : str
        ``'nt'`` (columns follow ``nts_list``) or ``'aa'`` (columns follow
        ``aas_list``).

    Returns
    -------
    pandas.DataFrame
        Index ``0 .. len(reference)-1``, one column per symbol of the chosen
        alphabet. Each cell is the number of sequences carrying that
        non-reference symbol at that position, divided by ``len(df)``.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    KeyError
        If ``seq_type`` is unknown or a sequence contains a symbol that is
        not in the alphabet.
    IndexError
        If a sequence is shorter than the reference.
    """
    columns_dic = {'nt': nts_list, 'aa': aas_list}
    alphabet = columns_dic[seq_type]
    ref_column = 'ref_{}'.format(seq_type)
    seq_column = 'seq_{}'.format(seq_type)

    # Re-index a copy rather than the caller's frame: the positional lookup
    # of row 0 below needs a 0-based index, but the input must stay untouched.
    df = df.reset_index(drop=True)
    df_size = len(df)
    if df_size == 0:
        raise ValueError('seq_logo_prep needs at least one sequence')

    ref_seq = df.loc[0, ref_column]
    n_positions = len(ref_seq)

    # Count in a plain integer array; per-cell DataFrame .loc reads and writes
    # are far slower for the (sequences x positions) loop.
    column_of = {symbol: j for j, symbol in enumerate(alphabet)}
    counts = np.zeros((n_positions, len(alphabet)), dtype=np.int64)
    for query_seq in df[seq_column].tolist():
        for p in range(n_positions):
            query = query_seq[p]
            if query == ref_seq[p] or query == ambiguity_sign:
                continue
            counts[p, column_of[query]] += 1

    # TODO (original author's note, translated): the total of each column
    # should be computed and used as the divisor, because the number of
    # ambiguous calls differs between positions. For now every position is
    # divided by the total number of sequences.
    df_counts = pd.DataFrame(counts, index=list(range(n_positions)), columns=alphabet)
    df_logo = df_counts / df_size
    return df_logo

In [10]:
# One amino-acid logo per combination of these columns; the combination's
# values, joined with '_', form the file name of the logo.
group_columns = ['status', 'mouse', 'dataset', 'chain', 'sub_dataset']
grouping = dfs_expanded_aas_included.groupby(by=group_columns)

for grouped, df in grouping:
    ID = '{}_aa'.format('_'.join(grouped))
    df_aa = seq_logo_prep(df, 'aa')
    seq_logo_plot(df_aa, output_folder_seq_logos, ID)
    

------------ID EarlyGC_B18-383_APC_VH_-_aa
EarlyGC_B18-383_APC_VH_-_aa
------------ID EarlyGC_B18-383_APC_VL_-_aa
EarlyGC_B18-383_APC_VL_-_aa
------------ID EarlyGC_B18-383_CGG_VH_-_aa
EarlyGC_B18-383_CGG_VH_-_aa
------------ID EarlyGC_B18-383_CGG_VL_-_aa
EarlyGC_B18-383_CGG_VL_-_aa
------------ID EarlyGC_B18-383_OVA_VH_-_aa
EarlyGC_B18-383_OVA_VH_-_aa
------------ID EarlyGC_B18-383_OVA_VL_-_aa
EarlyGC_B18-383_OVA_VL_-_aa
------------ID EarlyGC_HA-uMT_APC_VH_0-1_aa
EarlyGC_HA-uMT_APC_VH_0-1_aa
------------ID EarlyGC_HA-uMT_APC_VL_0-1_aa
EarlyGC_HA-uMT_APC_VL_0-1_aa
------------ID EarlyGC_HA-uMT_CGG_VH_0-1_aa
EarlyGC_HA-uMT_CGG_VH_0-1_aa
------------ID EarlyGC_HA-uMT_CGG_VL_0-1_aa
EarlyGC_HA-uMT_CGG_VL_0-1_aa
------------ID EarlyGC_HA-uMT_OVA_VH_0-1_aa
EarlyGC_HA-uMT_OVA_VH_0-1_aa
------------ID EarlyGC_HA-uMT_OVA_VL_0-1_aa
EarlyGC_HA-uMT_OVA_VL_0-1_aa
------------ID LateGC_B18-383_APC_VH_-_aa
LateGC_B18-383_APC_VH_-_aa
------------ID LateGC_B18-383_APC_VL_-_aa
LateGC_B18-383_APC_VL_-_a