In [1]:
# Python script for analyses of "Antibody affinity birth through somatic hypermutation" publication.
# This pipeline is divided into several sections. Each section begins with a comment indicating which figures of the publication are generated from it.

# Input sequences for these analyses are provided in the data folder. After a successful run, the results of each section are saved in the output folder.
print('Running...')
import re
import operator

import os
#import sys
import pandas as pd
import numpy as np

import time
import itertools
import matplotlib.pyplot as plt
import glob
#import logomaker #https://logomaker.readthedocs.io

# Functions
def display_big():
    """Set pandas display options for compact-but-wide notebook output.

    Limits printed frames to 10 rows while allowing up to 500 columns and a
    1000-character display width. Returns None; the effect is global to pandas.
    """
    # 'display.max_colwidth' is intentionally left at its default.
    display_options = {
        'display.max_rows': 10,
        'display.max_columns': 500,
        'display.width': 1000,
    }
    for option_name, option_value in display_options.items():
        pd.set_option(option_name, option_value)

# Apply the display settings once for the whole notebook.
display_big()

Running...


In [2]:
# Root of the data tree, relative to the notebook's working directory.
data_folder = '../data'

# Input/output locations: the VAR_IN_FOLDER / VAR_OUT_FOLDER environment variables
# take precedence; otherwise fall back to sub-folders of data_folder.
input_folder = os.environ.get('VAR_IN_FOLDER', data_folder + '/input')
output_folder = os.environ.get('VAR_OUT_FOLDER', data_folder + '/output')

In [3]:
def set_output_folder(section_output):
    """Return the output directory of one pipeline section, creating it if needed.

    Parameters
    ----------
    section_output : str
        Sub-folder name (e.g. '1_prep') appended to ``data_folder + '/output/'``.

    Returns
    -------
    str
        The path ``data_folder + '/output/' + section_output``; the directory
        (including missing parents) exists when the function returns.
    """
    # NOTE(review): the path is derived from data_folder, not from the module-level
    # output_folder (which honours VAR_OUT_FOLDER) -- confirm this is intended.
    # A distinct local name avoids shadowing that module-level output_folder.
    section_folder = data_folder + '/output/' + section_output

    # exist_ok=True makes the call a no-op for an existing folder, without a
    # separate isdir() check that could race with another process.
    os.makedirs(section_folder, exist_ok=True)
    return section_folder

In [4]:
# Section folders used by this notebook (created on first use):
#   - '1_prep'               : output of the preparation section, read as input below
#   - '3_freq_per_position'  : destination for the per-position frequency tables
output_folder_prep = set_output_folder(section_output='1_prep')
output_folder_freq_pos = set_output_folder(section_output='3_freq_per_position')

In [5]:
# Load the "included" table written by the preparation section (tab-separated, header on
# the first line) and give it a fresh 0..n-1 index.
dfs_expanded_nts_included = (
    pd.read_csv(
        '{}/../1_prep/dfs_expanded_nts_included.tsv'.format(output_folder_prep),
        sep='\t',
        header=0,
        low_memory=False,
    )
    .reset_index(drop=True)
)

In [6]:
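# Optional filter (currently disabled): uncomment to drop EarlyGC sequences from the included set,
# mirroring the filter applied to the excluded set further down.
# dfs_expanded_nts_included = dfs_expanded_nts_included[dfs_expanded_nts_included['status']!='EarlyGC'].copy()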

In [7]:
# Mismatch frequency per nucleotide position for the "included" table.
# Result: one row per position, one column per (status, mouse, dataset, chain, sub_dataset)
# group; each value is (#sequences whose NT<p> differs from the reference base)
# / (#sequences whose NT<p> is not 'N'), or 0 when every sequence has 'N' at that position.
nucleotides = ('A', 'T', 'C', 'G')
frq_by_group = {}  # column label -> Series of frequencies indexed by position

grouping = dfs_expanded_nts_included.groupby(by=['status', 'mouse', 'dataset', 'chain', 'sub_dataset'])

for grouped, df in grouping:
    # str() keeps the label construction working if a grouping key is not a string.
    suffix = '_'.join(map(str, grouped))
    num_seqs = len(df)

    # The reference sequence of the group's first row is used for the whole group.
    ref_nt = df['ref_nt'].iloc[0]

    frq_by_position = {}
    for p, ref_base in enumerate(ref_nt):
        if ref_base not in nucleotides:
            # Same exception type the previous list.remove() produced, with a usable message.
            raise ValueError(
                'Unexpected reference base {!r} at position {} in group {}'.format(ref_base, p, suffix)
            )
        mismatches_list = [nt for nt in nucleotides if nt != ref_base]

        # Vectorised counts instead of row-wise apply(lambda ...).
        calls = df['NT{}'.format(p)]
        miss_instances = calls.isin(mismatches_list).sum()  # mismatched instances
        N_instances = (calls == 'N').sum()  # N instances

        called = num_seqs - N_instances
        frq_by_position[p] = 0 if called == 0 else miss_instances / called

    frq_by_group[suffix] = pd.Series(frq_by_position, dtype='float64')

# Build the table once; groups with fewer positions are padded with NaN by index alignment.
df_frq_nts_mismatch = pd.DataFrame(frq_by_group)

df_frq_nts_mismatch.to_csv('{}/frq_nts_mismatches_included.tsv'.format(output_folder_freq_pos), sep = '\t', index=False)

In [8]:
# Largest mismatch frequency over all positions and groups (per-column max, then the max of those).
df_frq_nts_mismatch.max().max()

np.float64(1.0)

In [9]:
# (position, group column) label of the largest mismatch frequency in the stacked table.
df_frq_nts_mismatch.stack().idxmax()

(np.int64(262), 'EarlyGC_HA-uMT_CGG_VH_0-1')

In [10]:
# Mismatch frequency at position 240 for the 'LateGC_HA-uMT_CGG_VL_0-1' group,
# fetched with a single label-based lookup.
df_frq_nts_mismatch.loc[240, 'LateGC_HA-uMT_CGG_VL_0-1']

np.float64(0.9439252336448598)

In [11]:
# Mismatch frequency per nucleotide position for the "excluded" table, with EarlyGC rows removed.
# NOTE: df_frq_nts_mismatch is rebound here, replacing the table computed for the included set.
dfs_expanded_nts_excluded = pd.read_csv('{}/../1_prep/dfs_expanded_nts_excluded.tsv'.format(output_folder_prep), sep='\t', header=0, low_memory=False)
dfs_expanded_nts_excluded = dfs_expanded_nts_excluded.reset_index(drop=True)
dfs_expanded_nts_excluded = dfs_expanded_nts_excluded[dfs_expanded_nts_excluded['status'] != 'EarlyGC'].copy()

# Result: one row per position, one column per (status, mouse, dataset, chain, sub_dataset)
# group; each value is (#sequences whose NT<p> differs from the reference base)
# / (#sequences whose NT<p> is not 'N'), or 0 when every sequence has 'N' at that position.
nucleotides = ('A', 'T', 'C', 'G')
frq_by_group = {}  # column label -> Series of frequencies indexed by position

grouping = dfs_expanded_nts_excluded.groupby(by=['status', 'mouse', 'dataset', 'chain', 'sub_dataset'])

for grouped, df in grouping:
    # str() keeps the label construction working if a grouping key is not a string.
    suffix = '_'.join(map(str, grouped))
    num_seqs = len(df)

    # The reference sequence of the group's first row is used for the whole group.
    ref_nt = df['ref_nt'].iloc[0]

    frq_by_position = {}
    for p, ref_base in enumerate(ref_nt):
        if ref_base not in nucleotides:
            # Same exception type the previous list.remove() produced, with a usable message.
            raise ValueError(
                'Unexpected reference base {!r} at position {} in group {}'.format(ref_base, p, suffix)
            )
        mismatches_list = [nt for nt in nucleotides if nt != ref_base]

        # Vectorised counts instead of row-wise apply(lambda ...).
        calls = df['NT{}'.format(p)]
        miss_instances = calls.isin(mismatches_list).sum()  # mismatched instances
        N_instances = (calls == 'N').sum()  # N instances

        called = num_seqs - N_instances
        frq_by_position[p] = 0 if called == 0 else miss_instances / called

    frq_by_group[suffix] = pd.Series(frq_by_position, dtype='float64')

# Build the table once; groups with fewer positions are padded with NaN by index alignment.
df_frq_nts_mismatch = pd.DataFrame(frq_by_group)

df_frq_nts_mismatch.to_csv('{}/frq_nts_mismatches_excluded.tsv'.format(output_folder_freq_pos), sep = '\t', index=False)

# Largest mismatch frequency over all positions and groups of the excluded set.
df_frq_nts_mismatch.max().max()

np.float64(0.9439252336448598)