In [1]:
# Python script for analyses of "Antibody affinity birth through somatic hypermutation" publication.
# This pipeline is divided into X sections. At the beginning of each section there is a comment which indicates which figures of the publication are generated based on that section.

# input sequences for these analyses are uploaded in data folder. By a successful run, the result of each section will be saved in output folder.
print('Running...')
import re
import operator

import os
#import sys
import pandas as pd
import numpy as np

import time
import itertools
import matplotlib.pyplot as plt
import glob
#import logomaker #https://logomaker.readthedocs.io

# Functions
def display_big():

    # df = pd.DataFrame()
    # pd.options.display.max_colwidth = 2000
    pd.set_option('display.max_rows', 10)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

display_big()

Running...


In [2]:
data_folder='../data'

input_folder = os.getenv('VAR_IN_FOLDER', f"{data_folder}/input")
output_folder = os.getenv('VAR_OUT_FOLDER', f"{data_folder}/output")

In [3]:
del_sign='-'
ambiguity_sign='.'
aas_dic={'AAA':'K','AAC':'N','AAT':'N','AAG':'K','ACA':'T','ACC':'T','ACT':'T','ACG':'T','ATA':'I','ATC':'I',\
        'ATT':'I','ATG':'M','AGA':'R','AGC':'S','AGT':'S','AGG':'R','CAA':'Q','CAC':'H','CAT':'H','CAG':'Q',\
        'CCA':'P','CCC':'P','CCT':'P','CCG':'P','CTA':'L','CTC':'L','CTT':'L','CTG':'L','CGA':'R','CGC':'R',\
        'CGT':'R','CGG':'R','TAA':'*','TAC':'Y','TAT':'Y','TAG':'*','TCA':'S','TCC':'S','TCT':'S','TCG':'S',\
        'TTA':'L','TTC':'F','TTT':'F','TTG':'L','TGA':'*','TGC':'C','TGT':'C','TGG':'W','GAA':'E','GAC':'D',\
        'GAT':'D','GAG':'E','GCA':'A','GCC':'A','GCT':'A','GCG':'A','GTA':'V','GTC':'V','GTT':'V','GTG':'V',\
        'GGA':'G','GGC':'G','GGT':'G','GGG':'G','---':del_sign}
aas_list=['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '*', del_sign]
aas_chemistry_list=['I', 'V', 'L', 'F', 'C', 'M', 'A', 'W', 'G', 'T', 'S', 'Y', 'P', 'H', 'N', 'D', 'Q', 'E', 'K', 'R']
nts_list=['A', 'C', 'G', 'T', del_sign, ambiguity_sign]

In [4]:
mouse_DB_dic={'B18-383':'B18', 'B18':'B18', 'HA-uMT':'HA', 'HA-WT':'HA', 'HA':'HA'}
locus_syno_dic = {'IGK':'VL', 'IGH':'VH', 'IGL':'VL', 'NA':'NA'}

In [5]:
def set_output_folder(section_output):
    output_folder=data_folder+'/output/'+section_output

    if not os.path.isdir(output_folder): # make output folder if it doesn't exist
        os.makedirs(output_folder)
    return(output_folder)

In [6]:
# Section1: preparation
output_folder_prep=set_output_folder('1_prep')

In [7]:
files_List=glob.glob('{}/output/*.fasta'.format(data_folder))
files_List

['../data/output/LateGC_HA-WT_APC_VL_1-1.fasta',
 '../data/output/LateGC_HA-WT_APC_VH_1-100.fasta',
 '../data/output/LateGC_B18-383_OVA_VL_-.fasta',
 '../data/output/LateGC_HA-WT_CGG_VL_1-1.fasta',
 '../data/output/LateGC_B18-383_APC_VL_-.fasta',
 '../data/output/LateGC_HA-uMT_OVA_VL_0-1.fasta',
 '../data/output/LateGC_B18-383_APC_VH_-.fasta',
 '../data/output/LateGC_HA-uMT_APC_VH_0-1.fasta',
 '../data/output/LateGC_B18-383_OVA-Isotype_VH_-.fasta',
 '../data/output/Unimmunized_HA-uMT_SPL_VH_rep2.fasta',
 '../data/output/Unimmunized_B18-383_PP_VL_rep1.fasta',
 '../data/output/Unimmunized_HA-uMT_MLN_VL_rep2.fasta',
 '../data/output/LateGC_HA-WT_CGG_VH_1-1000.fasta',
 '../data/output/LateGC_HA-WT_OVA_VH_1-1000.fasta',
 '../data/output/Unimmunized_B18-383_PP_VH_rep2.fasta',
 '../data/output/EarlyGC_HA-uMT_APC_VL_0-1.fasta',
 '../data/output/Unimmunized_B18-383_MLN_VL_rep2.fasta',
 '../data/output/LateGC_HA-WT_APC_VH_1-1000.fasta',
 '../data/output/LateGC_HA-WT_CGG-CTLA4_VL_1-1000.fasta',
 

In [8]:
def translate(seq_nt):
    seq_aa=[]
    frameshift, stopcodon=False, False
    codons=[seq_nt[i:i+3] for i in range(0, len(seq_nt), 3)]
    for codon in codons:
        if codon in aas_dic:
            seq_aa.append(aas_dic[codon])
            if aas_dic[codon]=='*': stopcodon=True
        else:
            seq_aa.append(ambiguity_sign) # MITOONE BEKHATERE N HAM BASHE
            frameshift=True
    return([seq_aa, frameshift, stopcodon])

In [9]:
def import_fasta(files_List):
    dfs=pd.DataFrame()
    for file in files_List:

        df=pd.DataFrame(pd.read_csv(file, sep='\t',header=None).values.reshape(-1, 2),columns=['header', 'seq_nt'])
        labels = file.split('/')[-1].split('.fasta')[0].split('_')[:5]
        status, mouse, dataset, chain, sub_dataset = labels
        label= '_'.join(labels)
    
        df['status']=status
        df['mouse']=mouse
        df['dataset']=dataset
        df['chain']=chain
        df['sub_dataset']=sub_dataset
        df['mouse_DB']=mouse_DB_dic[mouse]
        df['label']=label
        #print(file_name, '/ chain:', chain, '/dataset:', dataset, '/ sub_dataset:', sub_dataset, )

        df['type']='query'
        df.loc[0, 'type']='ref'
        df['ref_nt']=df.loc[0, 'seq_nt']
        df['ref_aa']=''.join(translate(df.loc[0, 'seq_nt'])[0])

        for i in df.index:
            translation=translate(df.loc[i, 'seq_nt'])
            df.loc[i, 'seq_aa']=''.join(translation[0])
            # df.loc[i, 'frameshift']=translation[1]
            df.loc[i, 'stopcodon']=translation[2]
            df.loc[i, 'len_nt']=len(df.loc[i, 'seq_nt'])
            df.loc[i, 'len_aa']=len(translation[0])

        df['header'] = df['header'].apply(lambda x: x.lstrip('>'))
        dfs=pd.concat([dfs,df])
        
    return(dfs)

In [10]:
dfs=import_fasta(files_List)
dfs.reset_index(drop=True, inplace=True)
dfs

Unnamed: 0,header,seq_nt,status,mouse,dataset,chain,sub_dataset,mouse_DB,label,type,ref_nt,ref_aa,seq_aa,stopcodon,len_nt,len_aa
0,LateGC_HA-WT_APC_VL_1-1 ref,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,ref,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0
1,51I_037_L-1350833-1362242-R-B9_L_H08,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0
2,58A_074_L-1524392-R2-A10_L_B10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKSYLTWYQQKLGQPPKVLIYWASTRESGV...,False,285.0,95.0
3,51I_053_L-1350833-1362242-R-D11_L_H10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0
4,57B_101_L-1524393-L-D6_L_E05,GTCACTCTGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAG...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTLSCTSSQSLFNSGEQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96754,52A_055_L-1443862-R-APCGC-B10_L_G07,GTCACTTTGACCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGCAG...,LateGC,HA-uMT,APC,VL,0-1,HA,LateGC_HA-uMT_APC_VL_0-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTLTCTSSQSLFNSADQKNYLTWYQQKSGQPPKVLIYWASTRKFGV...,False,285.0,95.0
96755,52A_052_L-1443862-R2-APCGC-B4_L_D07,GTCACTTTGAACTGCACGTCCAGTCAGAGTCTGTTTCACACTGTAA...,LateGC,HA-uMT,APC,VL,0-1,HA,LateGC_HA-uMT_APC_VL_0-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTLNCTSSQSLFHTVRQKNYLTWYQLKPGQPPKVLIYWASTRESGV...,False,285.0,95.0
96756,55A_028_L-1406144-APCMem-A1_L_D04,GTCACTTTGAGCTGCACGTCCAGTCAGAGTCTGTTTAGCAGTGGAA...,LateGC,HA-uMT,APC,VL,0-1,HA,LateGC_HA-uMT_APC_VL_0-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTLSCTSSQSLFSSGMQKNYLAWYQQKPGQPPKLLIYWASTRKSGV...,False,285.0,95.0
96757,52A_057_L-1443862-R2-APCGC-B11_L_A08,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-uMT,APC,VL,0-1,HA,LateGC_HA-uMT_APC_VL_0-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQRPGQPPKLLIYWASTRESGV...,False,285.0,95.0


In [11]:
df_ha_wt=dfs.loc[
(dfs['status']=='LateGC')  &\
(dfs['mouse']=='HA-WT')    &\
(dfs['sub_dataset']=='1-1') \
,].copy()
len(df_ha_wt)

337

In [12]:
df_ha_wt['dataset']='mix'

df_ha_wt

dfs=pd.concat([dfs, df_ha_wt])
dfs.reset_index(inplace=True, drop=True)
dfs #18046 including reference sequences

Unnamed: 0,header,seq_nt,status,mouse,dataset,chain,sub_dataset,mouse_DB,label,type,ref_nt,ref_aa,seq_aa,stopcodon,len_nt,len_aa
0,LateGC_HA-WT_APC_VL_1-1 ref,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,ref,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0
1,51I_037_L-1350833-1362242-R-B9_L_H08,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0
2,58A_074_L-1524392-R2-A10_L_B10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKSYLTWYQQKLGQPPKVLIYWASTRESGV...,False,285.0,95.0
3,51I_053_L-1350833-1362242-R-D11_L_H10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0
4,57B_101_L-1524393-L-D6_L_E05,GTCACTCTGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAG...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTLSCTSSQSLFNSGEQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97091,52A_018_H-1350833-unL-1362242-L-B12_H_B03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_OVA_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSTYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,False,300.0,100.0
97092,53A_021_H-1524392-R-C5_H_E03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_OVA_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGDTYTSCPDSVKG...,False,300.0,100.0
97093,53A_022_H-1524392-R-C7_H_F03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_OVA_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTFYPDTVKG...,False,300.0,100.0
97094,53A_016_H-1524392-R-B10_H_H02,TCCTGTGCAGCCTCTGGAATCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_OVA_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGITFSSYGMSWVRQTPDKRLEWVATISNGGGSTYYPDSVKG...,False,300.0,100.0


In [13]:
len(dfs)

97096

In [14]:
179991+337

180328

In [15]:
def ins_dels_miss(df):
    df.reset_index(inplace=True, drop=True)

    df[['nt_ins', 'nt_dels', 'nt_miss', 'nt_N']]=0
    for i in df.index:
        query=df.loc[i, 'seq_nt']
        ref=df.loc[i, 'ref_nt']
        for p in range(0, len(ref)):

            if query[p] != ref[p]:
                if ref[p] == '-': df.loc[i, 'nt_ins']+=1
                elif query[p] == '-': df.loc[i, 'nt_dels']+=1
                elif query[p] == 'N': df.loc[i, 'nt_N']+=1
                else: df.loc[i, 'nt_miss']+=1

    df[['aa_ins', 'aa_dels', 'aa_miss', 'aa_.']]=0
    for i in df.index:
        query=df.loc[i, 'seq_aa']
        ref=df.loc[i, 'ref_aa']
        for p in range(0, len(ref)):

            if query[p] != ref[p]:
                if ref[p] == del_sign: df.loc[i, 'aa_ins']+=1
                elif query[p] == del_sign: df.loc[i, 'aa_dels']+=1
                elif query[p] == '.': df.loc[i, 'aa_.']+=1
                else: df.loc[i, 'aa_miss']+=1
    return(df)

In [16]:
dfs=ins_dels_miss(dfs)
dfs

Unnamed: 0,header,seq_nt,status,mouse,dataset,chain,sub_dataset,mouse_DB,label,type,ref_nt,ref_aa,seq_aa,stopcodon,len_nt,len_aa,nt_ins,nt_dels,nt_miss,nt_N,aa_ins,aa_dels,aa_miss,aa_.
0,LateGC_HA-WT_APC_VL_1-1 ref,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,ref,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,0,0,0,0,0,0
1,51I_037_L-1350833-1362242-R-B9_L_H08,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,0,0,0,0,0,0
2,58A_074_L-1524392-R2-A10_L_B10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKSYLTWYQQKLGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,8,0,0,0,3,0
3,51I_053_L-1350833-1362242-R-D11_L_H10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,1,0,0,0,0,0
4,57B_101_L-1524393-L-D6_L_E05,GTCACTCTGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAG...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTLSCTSSQSLFNSGEQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,2,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97091,52A_018_H-1350833-unL-1362242-L-B12_H_B03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_OVA_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSTYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,False,300.0,100.0,0,0,2,0,0,0,2,0
97092,53A_021_H-1524392-R-C5_H_E03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_OVA_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGDTYTSCPDSVKG...,False,300.0,100.0,0,0,7,0,0,0,5,0
97093,53A_022_H-1524392-R-C7_H_F03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_OVA_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTFYPDTVKG...,False,300.0,100.0,0,0,3,0,0,0,3,0
97094,53A_016_H-1524392-R-B10_H_H02,TCCTGTGCAGCCTCTGGAATCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_OVA_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGITFSSYGMSWVRQTPDKRLEWVATISNGGGSTYYPDSVKG...,False,300.0,100.0,0,0,4,0,0,0,3,0


In [17]:
dfs['label'] = dfs[['status', 'mouse', 'dataset', 'chain', 'sub_dataset']].agg('_'.join, axis=1)
dfs.to_csv('{}/dfs_all_rep1-2.tsv'.format(output_folder_prep), sep = '\t', index=False)
len(set(dfs['label']))

81

In [18]:
# To have replicates merged or separate
dfs.loc[dfs['status']=='Unimmunized', 'sub_dataset'] = 'reps'
dfs['label'] = dfs[['status', 'mouse', 'dataset', 'chain', 'sub_dataset']].agg('_'.join, axis=1)
dfs.to_csv('{}/dfs_all.tsv'.format(output_folder_prep), sep = '\t', index=False)
len(set(dfs['label']))

69

In [19]:
grouping=dfs.groupby(by=['status', 'mouse', 'dataset', 'chain', 'sub_dataset'])

for grouped, df in grouping:
    suffix='_'.join(grouped)
    df.reset_index(drop=True, inplace=True)
    print(suffix, len(df))

EarlyGC_B18-383_APC_VH_- 109
EarlyGC_B18-383_APC_VL_- 90
EarlyGC_B18-383_CGG_VH_- 47
EarlyGC_B18-383_CGG_VL_- 42
EarlyGC_B18-383_OVA_VH_- 39
EarlyGC_B18-383_OVA_VL_- 78
EarlyGC_HA-uMT_APC_VH_0-1 74
EarlyGC_HA-uMT_APC_VL_0-1 98
EarlyGC_HA-uMT_CGG_VH_0-1 34
EarlyGC_HA-uMT_CGG_VL_0-1 59
EarlyGC_HA-uMT_OVA_VH_0-1 45
EarlyGC_HA-uMT_OVA_VL_0-1 68
LateGC_B18-383_APC_VH_- 96
LateGC_B18-383_APC_VL_- 112
LateGC_B18-383_CGG_VH_- 49
LateGC_B18-383_CGG_VL_- 78
LateGC_B18-383_OVA_VH_- 142
LateGC_B18-383_OVA_VL_- 215
LateGC_B18-383_OVA-CTLA4_VH_- 123
LateGC_B18-383_OVA-CTLA4_VL_- 175
LateGC_B18-383_OVA-Isotype_VH_- 156
LateGC_B18-383_OVA-Isotype_VL_- 284
LateGC_HA-WT_APC_VH_1-1 30
LateGC_HA-WT_APC_VH_1-100 80
LateGC_HA-WT_APC_VH_1-1000 28
LateGC_HA-WT_APC_VL_1-1 63
LateGC_HA-WT_APC_VL_1-100 155
LateGC_HA-WT_APC_VL_1-1000 57
LateGC_HA-WT_CGG_VH_1-1 49
LateGC_HA-WT_CGG_VH_1-100 67
LateGC_HA-WT_CGG_VH_1-1000 64
LateGC_HA-WT_CGG_VL_1-1 94
LateGC_HA-WT_CGG_VL_1-100 102
LateGC_HA-WT_CGG_VL_1-1000 129
LateG

In [20]:
def df_clean_up(df, zero_miss='exclude'): # By default, excludes sequences without nt mismatches. (nt mismatches are distinguished with insertions/deletions)
    if zero_miss=='exclude':
        df=df[df['nt_miss']!=0]
    #df=df[df['frameshift']==False]
    #df=df[df['stopcodon']==False]
    df=df[df['type']=='query']
    return(df)

In [21]:
def expand_nt(df_initial):
    df = df_initial.copy()
    max_len_nt = int(df['len_nt'].max())
    
    # Create NT columns by expanding 'seq_nt' into individual characters with padding
    nt_data = df['seq_nt'].apply(lambda x: list(x.ljust(max_len_nt, ' '))).tolist()
    
    # Create a DataFrame for the new NT columns
    df_nt = pd.DataFrame(nt_data, columns=['NT{}'.format(i) for i in range(max_len_nt)], index=df.index)
    
    # Concatenate the original df with the new NT columns
    df = pd.concat([df, df_nt], axis=1)
    
    # Optionally replace spaces with NaN (if needed)
    df.replace(' ', pd.NA, inplace=True)
    
    return df


In [22]:
dfs_expanded_nts=expand_nt(dfs)
dfs_expanded_nts.to_csv('{}/dfs_expanded_nts.tsv'.format(output_folder_prep), sep = '\t', index=False)

dfs_expanded_nts

Unnamed: 0,header,seq_nt,status,mouse,dataset,chain,sub_dataset,mouse_DB,label,type,ref_nt,ref_aa,seq_aa,stopcodon,len_nt,len_aa,nt_ins,nt_dels,nt_miss,nt_N,aa_ins,aa_dels,aa_miss,aa_.,NT0,NT1,NT2,NT3,NT4,NT5,NT6,NT7,NT8,NT9,NT10,NT11,NT12,NT13,NT14,NT15,NT16,NT17,NT18,NT19,NT20,NT21,NT22,NT23,NT24,NT25,NT26,NT27,NT28,NT29,NT30,NT31,NT32,NT33,NT34,NT35,NT36,NT37,NT38,NT39,NT40,NT41,NT42,NT43,NT44,NT45,NT46,NT47,NT48,NT49,NT50,NT51,NT52,NT53,NT54,NT55,NT56,NT57,NT58,NT59,NT60,NT61,NT62,NT63,NT64,NT65,NT66,NT67,NT68,NT69,NT70,NT71,NT72,NT73,NT74,NT75,NT76,NT77,NT78,NT79,NT80,NT81,NT82,NT83,NT84,NT85,NT86,NT87,NT88,NT89,NT90,NT91,NT92,NT93,NT94,NT95,NT96,NT97,NT98,NT99,NT100,NT101,NT102,NT103,NT104,NT105,NT106,NT107,NT108,NT109,NT110,NT111,NT112,NT113,NT114,NT115,NT116,NT117,NT118,NT119,NT120,NT121,NT122,NT123,NT124,NT125,NT126,NT127,NT128,NT129,NT130,NT131,NT132,NT133,NT134,NT135,NT136,NT137,NT138,NT139,NT140,NT141,NT142,NT143,NT144,NT145,NT146,NT147,NT148,NT149,NT150,NT151,NT152,NT153,NT154,NT155,NT156,NT157,NT158,NT159,NT160,NT161,NT162,NT163,NT164,NT165,NT166,NT167,NT168,NT169,NT170,NT171,NT172,NT173,NT174,NT175,NT176,NT177,NT178,NT179,NT180,NT181,NT182,NT183,NT184,NT185,NT186,NT187,NT188,NT189,NT190,NT191,NT192,NT193,NT194,NT195,NT196,NT197,NT198,NT199,NT200,NT201,NT202,NT203,NT204,NT205,NT206,NT207,NT208,NT209,NT210,NT211,NT212,NT213,NT214,NT215,NT216,NT217,NT218,NT219,NT220,NT221,NT222,NT223,NT224,NT225,NT226,NT227,NT228,NT229,NT230,NT231,NT232,NT233,NT234,NT235,NT236,NT237,NT238,NT239,NT240,NT241,NT242,NT243,NT244,NT245,NT246,NT247,NT248,NT249,NT250,NT251,NT252,NT253,NT254,NT255,NT256,NT257,NT258,NT259,NT260,NT261,NT262,NT263,NT264,NT265,NT266,NT267,NT268,NT269,NT270,NT271,NT272,NT273,NT274,NT275,NT276,NT277,NT278,NT279,NT280,NT281,NT282,NT283,NT284,NT285,NT286,NT287,NT288,NT289,NT290,NT291,NT292,NT293,NT294,NT295,NT296,NT297,NT298,NT299,NT300,NT301,NT302,NT303,NT304,NT305,NT306,NT307,NT308,NT309,NT310,NT311,NT312,NT313,NT314,NT315,NT316,NT317
0,LateGC_HA-WT_APC_VL_1-1 ref,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,ref,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,0,0,0,0,0,0,G,T,C,A,C,T,A,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,A,A,G,C,A,A,A,A,G,A,A,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,C,C,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,G,T,T,G,A,T,C,T,A,C,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,T,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,G,A,A,T,G,A,T,T,A,T,A,G,T,A,A,T,C,C,G,C,T,C,A,C,G,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,51I_037_L-1350833-1362242-R-B9_L_H08,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,0,0,0,0,0,0,G,T,C,A,C,T,A,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,A,A,G,C,A,A,A,A,G,A,A,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,C,C,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,G,T,T,G,A,T,C,T,A,C,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,T,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,G,A,A,T,G,A,T,T,A,T,A,G,T,A,A,T,C,C,G,C,T,C,A,C,G,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,58A_074_L-1524392-R2-A10_L_B10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKSYLTWYQQKLGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,8,0,0,0,3,0,G,T,C,A,C,T,A,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,A,A,G,C,A,A,A,A,G,A,G,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,T,T,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,A,T,T,G,A,T,C,T,A,T,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,A,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,A,A,A,T,G,A,T,T,A,T,A,G,T,T,A,T,C,C,G,C,T,C,A,C,G,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,51I_053_L-1350833-1362242-R-D11_L_H10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,1,0,0,0,0,0,G,T,C,A,C,T,A,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,A,A,G,C,A,A,A,A,G,A,A,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,C,C,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,G,T,T,G,A,T,C,T,A,C,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,T,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,G,A,A,T,G,A,T,T,A,T,A,G,T,A,A,T,C,C,G,C,T,C,A,C,A,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,57B_101_L-1524393-L-D6_L_E05,GTCACTCTGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAG...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTLSCTSSQSLFNSGEQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,2,0,0,0,2,0,G,T,C,A,C,T,C,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,G,A,G,C,A,A,A,A,G,A,A,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,C,C,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,G,T,T,G,A,T,C,T,A,C,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,T,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,G,A,A,T,G,A,T,T,A,T,A,G,T,A,A,T,C,C,G,C,T,C,A,C,G,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97091,52A_018_H-1350833-unL-1362242-L-B12_H_B03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSTYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,False,300.0,100.0,0,0,2,0,0,0,2,0,T,C,C,T,G,T,G,C,A,G,C,C,T,C,T,G,G,A,T,T,C,A,C,T,T,T,C,A,G,T,A,C,C,T,A,T,G,G,C,A,T,G,T,C,T,T,G,G,G,T,T,C,G,C,C,A,G,A,C,T,C,C,A,G,A,C,A,A,G,A,G,G,C,T,G,G,A,G,T,G,G,G,T,C,G,C,A,A,C,C,A,T,T,A,G,T,A,A,T,G,G,T,G,G,T,G,G,T,T,A,C,A,C,C,T,A,C,T,A,T,C,C,A,G,A,C,A,G,T,G,T,G,A,A,G,G,G,G,C,G,A,T,T,C,A,C,C,A,T,C,T,C,C,A,G,A,G,A,C,A,A,T,G,C,C,A,A,G,A,A,C,A,C,C,C,T,G,T,A,C,C,T,G,C,A,A,A,T,G,A,G,C,A,G,T,C,T,G,A,A,G,T,C,T,G,A,G,G,A,C,T,C,A,G,C,C,A,T,G,T,A,T,T,A,C,T,G,T,G,C,A,A,G,A,C,G,G,G,A,G,A,G,G,T,A,C,G,A,C,G,A,G,A,A,C,G,G,G,T,T,T,T,C,T,T,A,C,T,G,G,G,G,C,C,A,A,G,G,G,A,C,T,C,T,G,G,T,C,A,C,G,G,T,C,T,C,T,G,C,A,,,,,,,,,,,,,,,,,,
97092,53A_021_H-1524392-R-C5_H_E03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGDTYTSCPDSVKG...,False,300.0,100.0,0,0,7,0,0,0,5,0,T,C,C,T,G,T,G,C,A,G,C,C,T,C,T,G,G,A,T,T,C,A,C,T,T,T,C,A,G,T,A,G,C,T,A,T,G,G,C,A,T,G,T,C,T,T,G,G,G,T,T,C,G,C,C,A,G,A,C,T,C,C,A,G,A,C,A,A,G,A,G,G,C,T,G,G,A,G,T,G,G,G,T,C,G,C,A,A,C,C,A,T,T,A,G,T,A,A,T,G,G,T,G,A,T,A,C,T,T,A,C,A,C,C,T,C,C,T,G,T,C,C,A,G,A,C,A,G,T,G,T,G,A,A,G,G,G,G,C,G,A,T,T,C,A,C,C,A,T,C,T,C,C,A,G,A,G,A,C,A,A,T,G,C,C,A,A,G,A,A,C,A,C,C,C,T,G,T,A,C,C,T,G,C,A,A,A,T,G,A,G,C,A,G,T,C,T,G,A,A,G,T,C,T,G,A,G,G,A,C,T,C,A,G,C,C,A,T,G,T,A,T,T,A,C,T,G,T,G,C,A,A,G,A,C,G,G,G,A,G,A,G,G,T,A,C,G,A,C,G,A,C,A,A,C,G,G,C,T,T,T,G,C,T,T,A,C,T,G,G,G,G,C,C,A,A,G,G,G,A,C,T,C,T,G,G,T,C,A,C,G,G,T,C,T,C,T,G,C,A,,,,,,,,,,,,,,,,,,
97093,53A_022_H-1524392-R-C7_H_F03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTFYPDTVKG...,False,300.0,100.0,0,0,3,0,0,0,3,0,T,C,C,T,G,T,G,C,A,G,C,C,T,C,T,G,G,A,T,T,C,A,C,T,T,T,C,A,G,T,A,G,C,T,A,T,G,G,C,A,T,G,T,C,T,T,G,G,G,T,T,C,G,C,C,A,G,A,C,T,C,C,A,G,A,C,A,A,G,A,G,G,C,T,G,G,A,G,T,G,G,G,T,C,G,C,A,A,C,C,A,T,T,A,G,T,A,A,T,G,G,T,G,G,T,G,G,T,T,A,C,A,C,C,T,T,C,T,A,T,C,C,A,G,A,C,A,C,T,G,T,G,A,A,G,G,G,G,C,G,A,T,T,C,A,C,C,A,T,C,T,C,C,A,G,A,G,A,C,A,A,T,G,C,C,A,A,G,A,A,C,A,C,C,C,T,G,T,A,C,C,T,G,C,A,A,A,T,G,A,G,C,A,G,T,C,T,G,A,A,G,T,C,T,G,A,G,G,A,C,T,C,A,G,C,C,A,T,G,T,A,T,T,A,C,T,G,T,G,C,A,A,G,A,C,G,G,G,G,G,A,G,G,T,A,C,G,A,C,G,A,G,A,A,C,G,G,G,T,T,T,G,C,T,T,A,C,T,G,G,G,G,C,C,A,A,G,G,G,A,C,T,C,T,G,G,T,C,A,C,G,G,T,C,T,C,T,G,C,A,,,,,,,,,,,,,,,,,,
97094,53A_016_H-1524392-R-B10_H_H02,TCCTGTGCAGCCTCTGGAATCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGITFSSYGMSWVRQTPDKRLEWVATISNGGGSTYYPDSVKG...,False,300.0,100.0,0,0,4,0,0,0,3,0,T,C,C,T,G,T,G,C,A,G,C,C,T,C,T,G,G,A,A,T,C,A,C,T,T,T,C,A,G,T,A,G,C,T,A,T,G,G,C,A,T,G,T,C,T,T,G,G,G,T,T,C,G,C,C,A,G,A,C,T,C,C,A,G,A,C,A,A,G,A,G,G,C,T,G,G,A,G,T,G,G,G,T,C,G,C,A,A,C,C,A,T,T,A,G,T,A,A,T,G,G,T,G,G,T,G,G,T,T,C,C,A,C,C,T,A,C,T,A,T,C,C,A,G,A,C,A,G,T,G,T,G,A,A,G,G,G,G,C,G,A,T,T,C,A,C,C,A,T,C,T,C,C,A,G,A,G,A,C,A,A,T,G,C,C,A,A,G,A,A,C,A,C,C,C,T,G,T,A,C,C,T,G,C,A,A,A,T,G,A,G,C,A,G,T,C,T,G,A,A,G,T,C,T,G,A,G,G,A,C,T,C,A,G,C,C,A,T,G,T,A,T,T,A,T,T,G,T,G,C,A,A,G,C,C,G,G,G,A,G,A,G,G,T,A,C,G,A,C,G,A,G,A,A,C,G,G,G,T,T,T,G,C,T,T,A,C,T,G,G,G,G,C,C,A,A,G,G,G,A,C,T,C,T,G,G,T,C,A,C,G,G,T,C,T,C,T,G,C,A,,,,,,,,,,,,,,,,,,


In [23]:
dfs_expanded_nts_cleaned=df_clean_up(dfs_expanded_nts, 'include')
dfs_expanded_nts_cleaned.reset_index(inplace=True, drop=True)
dfs_expanded_nts_cleaned.to_csv('{}/dfs_expanded_nts_included.tsv'.format(output_folder_prep), sep = '\t', index=False)
dfs_expanded_nts_cleaned

Unnamed: 0,header,seq_nt,status,mouse,dataset,chain,sub_dataset,mouse_DB,label,type,ref_nt,ref_aa,seq_aa,stopcodon,len_nt,len_aa,nt_ins,nt_dels,nt_miss,nt_N,aa_ins,aa_dels,aa_miss,aa_.,NT0,NT1,NT2,NT3,NT4,NT5,NT6,NT7,NT8,NT9,NT10,NT11,NT12,NT13,NT14,NT15,NT16,NT17,NT18,NT19,NT20,NT21,NT22,NT23,NT24,NT25,NT26,NT27,NT28,NT29,NT30,NT31,NT32,NT33,NT34,NT35,NT36,NT37,NT38,NT39,NT40,NT41,NT42,NT43,NT44,NT45,NT46,NT47,NT48,NT49,NT50,NT51,NT52,NT53,NT54,NT55,NT56,NT57,NT58,NT59,NT60,NT61,NT62,NT63,NT64,NT65,NT66,NT67,NT68,NT69,NT70,NT71,NT72,NT73,NT74,NT75,NT76,NT77,NT78,NT79,NT80,NT81,NT82,NT83,NT84,NT85,NT86,NT87,NT88,NT89,NT90,NT91,NT92,NT93,NT94,NT95,NT96,NT97,NT98,NT99,NT100,NT101,NT102,NT103,NT104,NT105,NT106,NT107,NT108,NT109,NT110,NT111,NT112,NT113,NT114,NT115,NT116,NT117,NT118,NT119,NT120,NT121,NT122,NT123,NT124,NT125,NT126,NT127,NT128,NT129,NT130,NT131,NT132,NT133,NT134,NT135,NT136,NT137,NT138,NT139,NT140,NT141,NT142,NT143,NT144,NT145,NT146,NT147,NT148,NT149,NT150,NT151,NT152,NT153,NT154,NT155,NT156,NT157,NT158,NT159,NT160,NT161,NT162,NT163,NT164,NT165,NT166,NT167,NT168,NT169,NT170,NT171,NT172,NT173,NT174,NT175,NT176,NT177,NT178,NT179,NT180,NT181,NT182,NT183,NT184,NT185,NT186,NT187,NT188,NT189,NT190,NT191,NT192,NT193,NT194,NT195,NT196,NT197,NT198,NT199,NT200,NT201,NT202,NT203,NT204,NT205,NT206,NT207,NT208,NT209,NT210,NT211,NT212,NT213,NT214,NT215,NT216,NT217,NT218,NT219,NT220,NT221,NT222,NT223,NT224,NT225,NT226,NT227,NT228,NT229,NT230,NT231,NT232,NT233,NT234,NT235,NT236,NT237,NT238,NT239,NT240,NT241,NT242,NT243,NT244,NT245,NT246,NT247,NT248,NT249,NT250,NT251,NT252,NT253,NT254,NT255,NT256,NT257,NT258,NT259,NT260,NT261,NT262,NT263,NT264,NT265,NT266,NT267,NT268,NT269,NT270,NT271,NT272,NT273,NT274,NT275,NT276,NT277,NT278,NT279,NT280,NT281,NT282,NT283,NT284,NT285,NT286,NT287,NT288,NT289,NT290,NT291,NT292,NT293,NT294,NT295,NT296,NT297,NT298,NT299,NT300,NT301,NT302,NT303,NT304,NT305,NT306,NT307,NT308,NT309,NT310,NT311,NT312,NT313,NT314,NT315,NT316,NT317
0,51I_037_L-1350833-1362242-R-B9_L_H08,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,0,0,0,0,0,0,G,T,C,A,C,T,A,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,A,A,G,C,A,A,A,A,G,A,A,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,C,C,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,G,T,T,G,A,T,C,T,A,C,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,T,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,G,A,A,T,G,A,T,T,A,T,A,G,T,A,A,T,C,C,G,C,T,C,A,C,G,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,58A_074_L-1524392-R2-A10_L_B10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKSYLTWYQQKLGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,8,0,0,0,3,0,G,T,C,A,C,T,A,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,A,A,G,C,A,A,A,A,G,A,G,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,T,T,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,A,T,T,G,A,T,C,T,A,T,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,A,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,A,A,A,T,G,A,T,T,A,T,A,G,T,T,A,T,C,C,G,C,T,C,A,C,G,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,51I_053_L-1350833-1362242-R-D11_L_H10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,1,0,0,0,0,0,G,T,C,A,C,T,A,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,A,A,G,C,A,A,A,A,G,A,A,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,C,C,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,G,T,T,G,A,T,C,T,A,C,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,T,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,G,A,A,T,G,A,T,T,A,T,A,G,T,A,A,T,C,C,G,C,T,C,A,C,A,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,57B_101_L-1524393-L-D6_L_E05,GTCACTCTGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAG...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTLSCTSSQSLFNSGEQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,2,0,0,0,2,0,G,T,C,A,C,T,C,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,G,A,G,C,A,A,A,A,G,A,A,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,C,C,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,G,T,T,G,A,T,C,T,A,C,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,T,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,G,A,A,T,G,A,T,T,A,T,A,G,T,A,A,T,C,C,G,C,T,C,A,C,G,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,51I_035_L-1350833-1362242-R-B7_L_F08,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,1,0,0,0,0,0,G,T,C,A,C,T,A,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,A,A,G,C,A,A,A,A,G,A,A,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,C,C,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,G,T,T,G,A,T,C,T,A,C,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,T,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,G,G,T,T,T,A,T,T,A,C,T,G,T,C,A,G,A,A,T,G,A,T,T,A,T,A,G,T,A,A,T,C,C,G,C,T,C,A,C,G,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97006,52A_018_H-1350833-unL-1362242-L-B12_H_B03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSTYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,False,300.0,100.0,0,0,2,0,0,0,2,0,T,C,C,T,G,T,G,C,A,G,C,C,T,C,T,G,G,A,T,T,C,A,C,T,T,T,C,A,G,T,A,C,C,T,A,T,G,G,C,A,T,G,T,C,T,T,G,G,G,T,T,C,G,C,C,A,G,A,C,T,C,C,A,G,A,C,A,A,G,A,G,G,C,T,G,G,A,G,T,G,G,G,T,C,G,C,A,A,C,C,A,T,T,A,G,T,A,A,T,G,G,T,G,G,T,G,G,T,T,A,C,A,C,C,T,A,C,T,A,T,C,C,A,G,A,C,A,G,T,G,T,G,A,A,G,G,G,G,C,G,A,T,T,C,A,C,C,A,T,C,T,C,C,A,G,A,G,A,C,A,A,T,G,C,C,A,A,G,A,A,C,A,C,C,C,T,G,T,A,C,C,T,G,C,A,A,A,T,G,A,G,C,A,G,T,C,T,G,A,A,G,T,C,T,G,A,G,G,A,C,T,C,A,G,C,C,A,T,G,T,A,T,T,A,C,T,G,T,G,C,A,A,G,A,C,G,G,G,A,G,A,G,G,T,A,C,G,A,C,G,A,G,A,A,C,G,G,G,T,T,T,T,C,T,T,A,C,T,G,G,G,G,C,C,A,A,G,G,G,A,C,T,C,T,G,G,T,C,A,C,G,G,T,C,T,C,T,G,C,A,,,,,,,,,,,,,,,,,,
97007,53A_021_H-1524392-R-C5_H_E03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGDTYTSCPDSVKG...,False,300.0,100.0,0,0,7,0,0,0,5,0,T,C,C,T,G,T,G,C,A,G,C,C,T,C,T,G,G,A,T,T,C,A,C,T,T,T,C,A,G,T,A,G,C,T,A,T,G,G,C,A,T,G,T,C,T,T,G,G,G,T,T,C,G,C,C,A,G,A,C,T,C,C,A,G,A,C,A,A,G,A,G,G,C,T,G,G,A,G,T,G,G,G,T,C,G,C,A,A,C,C,A,T,T,A,G,T,A,A,T,G,G,T,G,A,T,A,C,T,T,A,C,A,C,C,T,C,C,T,G,T,C,C,A,G,A,C,A,G,T,G,T,G,A,A,G,G,G,G,C,G,A,T,T,C,A,C,C,A,T,C,T,C,C,A,G,A,G,A,C,A,A,T,G,C,C,A,A,G,A,A,C,A,C,C,C,T,G,T,A,C,C,T,G,C,A,A,A,T,G,A,G,C,A,G,T,C,T,G,A,A,G,T,C,T,G,A,G,G,A,C,T,C,A,G,C,C,A,T,G,T,A,T,T,A,C,T,G,T,G,C,A,A,G,A,C,G,G,G,A,G,A,G,G,T,A,C,G,A,C,G,A,C,A,A,C,G,G,C,T,T,T,G,C,T,T,A,C,T,G,G,G,G,C,C,A,A,G,G,G,A,C,T,C,T,G,G,T,C,A,C,G,G,T,C,T,C,T,G,C,A,,,,,,,,,,,,,,,,,,
97008,53A_022_H-1524392-R-C7_H_F03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTFYPDTVKG...,False,300.0,100.0,0,0,3,0,0,0,3,0,T,C,C,T,G,T,G,C,A,G,C,C,T,C,T,G,G,A,T,T,C,A,C,T,T,T,C,A,G,T,A,G,C,T,A,T,G,G,C,A,T,G,T,C,T,T,G,G,G,T,T,C,G,C,C,A,G,A,C,T,C,C,A,G,A,C,A,A,G,A,G,G,C,T,G,G,A,G,T,G,G,G,T,C,G,C,A,A,C,C,A,T,T,A,G,T,A,A,T,G,G,T,G,G,T,G,G,T,T,A,C,A,C,C,T,T,C,T,A,T,C,C,A,G,A,C,A,C,T,G,T,G,A,A,G,G,G,G,C,G,A,T,T,C,A,C,C,A,T,C,T,C,C,A,G,A,G,A,C,A,A,T,G,C,C,A,A,G,A,A,C,A,C,C,C,T,G,T,A,C,C,T,G,C,A,A,A,T,G,A,G,C,A,G,T,C,T,G,A,A,G,T,C,T,G,A,G,G,A,C,T,C,A,G,C,C,A,T,G,T,A,T,T,A,C,T,G,T,G,C,A,A,G,A,C,G,G,G,G,G,A,G,G,T,A,C,G,A,C,G,A,G,A,A,C,G,G,G,T,T,T,G,C,T,T,A,C,T,G,G,G,G,C,C,A,A,G,G,G,A,C,T,C,T,G,G,T,C,A,C,G,G,T,C,T,C,T,G,C,A,,,,,,,,,,,,,,,,,,
97009,53A_016_H-1524392-R-B10_H_H02,TCCTGTGCAGCCTCTGGAATCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGITFSSYGMSWVRQTPDKRLEWVATISNGGGSTYYPDSVKG...,False,300.0,100.0,0,0,4,0,0,0,3,0,T,C,C,T,G,T,G,C,A,G,C,C,T,C,T,G,G,A,A,T,C,A,C,T,T,T,C,A,G,T,A,G,C,T,A,T,G,G,C,A,T,G,T,C,T,T,G,G,G,T,T,C,G,C,C,A,G,A,C,T,C,C,A,G,A,C,A,A,G,A,G,G,C,T,G,G,A,G,T,G,G,G,T,C,G,C,A,A,C,C,A,T,T,A,G,T,A,A,T,G,G,T,G,G,T,G,G,T,T,C,C,A,C,C,T,A,C,T,A,T,C,C,A,G,A,C,A,G,T,G,T,G,A,A,G,G,G,G,C,G,A,T,T,C,A,C,C,A,T,C,T,C,C,A,G,A,G,A,C,A,A,T,G,C,C,A,A,G,A,A,C,A,C,C,C,T,G,T,A,C,C,T,G,C,A,A,A,T,G,A,G,C,A,G,T,C,T,G,A,A,G,T,C,T,G,A,G,G,A,C,T,C,A,G,C,C,A,T,G,T,A,T,T,A,T,T,G,T,G,C,A,A,G,C,C,G,G,G,A,G,A,G,G,T,A,C,G,A,C,G,A,G,A,A,C,G,G,G,T,T,T,G,C,T,T,A,C,T,G,G,G,G,C,C,A,A,G,G,G,A,C,T,C,T,G,G,T,C,A,C,G,G,T,C,T,C,T,G,C,A,,,,,,,,,,,,,,,,,,


In [24]:
dfs_expanded_nts_cleaned=df_clean_up(dfs_expanded_nts)
dfs_expanded_nts_cleaned.reset_index(inplace=True, drop=True)
dfs_expanded_nts_cleaned.to_csv('{}/dfs_expanded_nts_excluded.tsv'.format(output_folder_prep), sep = '\t', index=False)
dfs_expanded_nts_cleaned

Unnamed: 0,header,seq_nt,status,mouse,dataset,chain,sub_dataset,mouse_DB,label,type,ref_nt,ref_aa,seq_aa,stopcodon,len_nt,len_aa,nt_ins,nt_dels,nt_miss,nt_N,aa_ins,aa_dels,aa_miss,aa_.,NT0,NT1,NT2,NT3,NT4,NT5,NT6,NT7,NT8,NT9,NT10,NT11,NT12,NT13,NT14,NT15,NT16,NT17,NT18,NT19,NT20,NT21,NT22,NT23,NT24,NT25,NT26,NT27,NT28,NT29,NT30,NT31,NT32,NT33,NT34,NT35,NT36,NT37,NT38,NT39,NT40,NT41,NT42,NT43,NT44,NT45,NT46,NT47,NT48,NT49,NT50,NT51,NT52,NT53,NT54,NT55,NT56,NT57,NT58,NT59,NT60,NT61,NT62,NT63,NT64,NT65,NT66,NT67,NT68,NT69,NT70,NT71,NT72,NT73,NT74,NT75,NT76,NT77,NT78,NT79,NT80,NT81,NT82,NT83,NT84,NT85,NT86,NT87,NT88,NT89,NT90,NT91,NT92,NT93,NT94,NT95,NT96,NT97,NT98,NT99,NT100,NT101,NT102,NT103,NT104,NT105,NT106,NT107,NT108,NT109,NT110,NT111,NT112,NT113,NT114,NT115,NT116,NT117,NT118,NT119,NT120,NT121,NT122,NT123,NT124,NT125,NT126,NT127,NT128,NT129,NT130,NT131,NT132,NT133,NT134,NT135,NT136,NT137,NT138,NT139,NT140,NT141,NT142,NT143,NT144,NT145,NT146,NT147,NT148,NT149,NT150,NT151,NT152,NT153,NT154,NT155,NT156,NT157,NT158,NT159,NT160,NT161,NT162,NT163,NT164,NT165,NT166,NT167,NT168,NT169,NT170,NT171,NT172,NT173,NT174,NT175,NT176,NT177,NT178,NT179,NT180,NT181,NT182,NT183,NT184,NT185,NT186,NT187,NT188,NT189,NT190,NT191,NT192,NT193,NT194,NT195,NT196,NT197,NT198,NT199,NT200,NT201,NT202,NT203,NT204,NT205,NT206,NT207,NT208,NT209,NT210,NT211,NT212,NT213,NT214,NT215,NT216,NT217,NT218,NT219,NT220,NT221,NT222,NT223,NT224,NT225,NT226,NT227,NT228,NT229,NT230,NT231,NT232,NT233,NT234,NT235,NT236,NT237,NT238,NT239,NT240,NT241,NT242,NT243,NT244,NT245,NT246,NT247,NT248,NT249,NT250,NT251,NT252,NT253,NT254,NT255,NT256,NT257,NT258,NT259,NT260,NT261,NT262,NT263,NT264,NT265,NT266,NT267,NT268,NT269,NT270,NT271,NT272,NT273,NT274,NT275,NT276,NT277,NT278,NT279,NT280,NT281,NT282,NT283,NT284,NT285,NT286,NT287,NT288,NT289,NT290,NT291,NT292,NT293,NT294,NT295,NT296,NT297,NT298,NT299,NT300,NT301,NT302,NT303,NT304,NT305,NT306,NT307,NT308,NT309,NT310,NT311,NT312,NT313,NT314,NT315,NT316,NT317
0,58A_074_L-1524392-R2-A10_L_B10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKSYLTWYQQKLGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,8,0,0,0,3,0,G,T,C,A,C,T,A,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,A,A,G,C,A,A,A,A,G,A,G,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,T,T,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,A,T,T,G,A,T,C,T,A,T,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,A,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,A,A,A,T,G,A,T,T,A,T,A,G,T,T,A,T,C,C,G,C,T,C,A,C,G,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,51I_053_L-1350833-1362242-R-D11_L_H10,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,1,0,0,0,0,0,G,T,C,A,C,T,A,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,A,A,G,C,A,A,A,A,G,A,A,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,C,C,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,G,T,T,G,A,T,C,T,A,C,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,T,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,G,A,A,T,G,A,T,T,A,T,A,G,T,A,A,T,C,C,G,C,T,C,A,C,A,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,57B_101_L-1524393-L-D6_L_E05,GTCACTCTGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAG...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTLSCTSSQSLFNSGEQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,2,0,0,0,2,0,G,T,C,A,C,T,C,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,G,A,G,C,A,A,A,A,G,A,A,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,C,C,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,G,T,T,G,A,T,C,T,A,C,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,T,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,G,A,A,T,G,A,T,T,A,T,A,G,T,A,A,T,C,C,G,C,T,C,A,C,G,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,51I_035_L-1350833-1362242-R-B7_L_F08,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,False,285.0,95.0,0,0,1,0,0,0,0,0,G,T,C,A,C,T,A,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,G,G,A,A,A,G,C,A,A,A,A,G,A,A,T,T,A,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,C,C,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,G,T,G,T,T,G,A,T,C,T,A,C,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,A,T,C,T,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,G,G,T,T,T,A,T,T,A,C,T,G,T,C,A,G,A,A,T,G,A,T,T,A,T,A,G,T,A,A,T,C,C,G,C,T,C,A,C,G,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,58A_094_L-1524393-L-C9_L_F12,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTAGAA...,LateGC,HA-WT,APC,VL,1-1,HA,LateGC_HA-WT_APC_VL_1-1,query,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...,VTMSCTSSQSLFNSGKQKNYLTWYQQKPGQPPKVLIYWASTRESGV...,VTMSCTSSQSLFNSRKQKNFLTWYQQKPGQPPKLLIYWASTRESGV...,False,285.0,95.0,0,0,6,0,0,0,4,0,G,T,C,A,C,T,A,T,G,A,G,C,T,G,C,A,C,G,T,C,C,A,G,T,C,A,G,A,G,T,C,T,G,T,T,T,A,A,C,A,G,T,A,G,A,A,A,G,C,A,A,A,A,G,A,A,T,T,T,C,T,T,G,A,C,C,T,G,G,T,A,C,C,A,G,C,A,G,A,A,A,C,C,A,G,G,A,C,A,G,C,C,T,C,C,T,A,A,A,T,T,G,T,T,G,A,T,C,T,A,C,T,G,G,G,C,A,T,C,C,A,C,T,A,G,G,G,A,G,T,C,T,G,G,G,G,T,C,C,C,T,G,A,T,C,G,C,T,T,C,A,C,A,G,G,C,A,G,T,G,G,A,T,C,T,G,G,A,A,C,A,G,A,T,T,T,C,A,C,T,C,T,C,A,C,C,A,T,C,A,G,C,A,G,T,G,T,G,C,A,G,G,C,T,G,A,A,G,A,C,C,T,G,G,C,A,G,T,T,T,A,T,T,A,C,T,G,T,C,A,G,A,A,T,G,A,T,T,A,T,A,G,T,T,A,T,C,C,A,C,T,C,A,C,G,T,T,C,G,G,T,G,G,T,G,G,G,A,C,C,A,A,G,C,T,G,G,A,G,C,T,G,A,A,A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11401,52A_018_H-1350833-unL-1362242-L-B12_H_B03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSTYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,False,300.0,100.0,0,0,2,0,0,0,2,0,T,C,C,T,G,T,G,C,A,G,C,C,T,C,T,G,G,A,T,T,C,A,C,T,T,T,C,A,G,T,A,C,C,T,A,T,G,G,C,A,T,G,T,C,T,T,G,G,G,T,T,C,G,C,C,A,G,A,C,T,C,C,A,G,A,C,A,A,G,A,G,G,C,T,G,G,A,G,T,G,G,G,T,C,G,C,A,A,C,C,A,T,T,A,G,T,A,A,T,G,G,T,G,G,T,G,G,T,T,A,C,A,C,C,T,A,C,T,A,T,C,C,A,G,A,C,A,G,T,G,T,G,A,A,G,G,G,G,C,G,A,T,T,C,A,C,C,A,T,C,T,C,C,A,G,A,G,A,C,A,A,T,G,C,C,A,A,G,A,A,C,A,C,C,C,T,G,T,A,C,C,T,G,C,A,A,A,T,G,A,G,C,A,G,T,C,T,G,A,A,G,T,C,T,G,A,G,G,A,C,T,C,A,G,C,C,A,T,G,T,A,T,T,A,C,T,G,T,G,C,A,A,G,A,C,G,G,G,A,G,A,G,G,T,A,C,G,A,C,G,A,G,A,A,C,G,G,G,T,T,T,T,C,T,T,A,C,T,G,G,G,G,C,C,A,A,G,G,G,A,C,T,C,T,G,G,T,C,A,C,G,G,T,C,T,C,T,G,C,A,,,,,,,,,,,,,,,,,,
11402,53A_021_H-1524392-R-C5_H_E03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGDTYTSCPDSVKG...,False,300.0,100.0,0,0,7,0,0,0,5,0,T,C,C,T,G,T,G,C,A,G,C,C,T,C,T,G,G,A,T,T,C,A,C,T,T,T,C,A,G,T,A,G,C,T,A,T,G,G,C,A,T,G,T,C,T,T,G,G,G,T,T,C,G,C,C,A,G,A,C,T,C,C,A,G,A,C,A,A,G,A,G,G,C,T,G,G,A,G,T,G,G,G,T,C,G,C,A,A,C,C,A,T,T,A,G,T,A,A,T,G,G,T,G,A,T,A,C,T,T,A,C,A,C,C,T,C,C,T,G,T,C,C,A,G,A,C,A,G,T,G,T,G,A,A,G,G,G,G,C,G,A,T,T,C,A,C,C,A,T,C,T,C,C,A,G,A,G,A,C,A,A,T,G,C,C,A,A,G,A,A,C,A,C,C,C,T,G,T,A,C,C,T,G,C,A,A,A,T,G,A,G,C,A,G,T,C,T,G,A,A,G,T,C,T,G,A,G,G,A,C,T,C,A,G,C,C,A,T,G,T,A,T,T,A,C,T,G,T,G,C,A,A,G,A,C,G,G,G,A,G,A,G,G,T,A,C,G,A,C,G,A,C,A,A,C,G,G,C,T,T,T,G,C,T,T,A,C,T,G,G,G,G,C,C,A,A,G,G,G,A,C,T,C,T,G,G,T,C,A,C,G,G,T,C,T,C,T,G,C,A,,,,,,,,,,,,,,,,,,
11403,53A_022_H-1524392-R-C7_H_F03,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTFYPDTVKG...,False,300.0,100.0,0,0,3,0,0,0,3,0,T,C,C,T,G,T,G,C,A,G,C,C,T,C,T,G,G,A,T,T,C,A,C,T,T,T,C,A,G,T,A,G,C,T,A,T,G,G,C,A,T,G,T,C,T,T,G,G,G,T,T,C,G,C,C,A,G,A,C,T,C,C,A,G,A,C,A,A,G,A,G,G,C,T,G,G,A,G,T,G,G,G,T,C,G,C,A,A,C,C,A,T,T,A,G,T,A,A,T,G,G,T,G,G,T,G,G,T,T,A,C,A,C,C,T,T,C,T,A,T,C,C,A,G,A,C,A,C,T,G,T,G,A,A,G,G,G,G,C,G,A,T,T,C,A,C,C,A,T,C,T,C,C,A,G,A,G,A,C,A,A,T,G,C,C,A,A,G,A,A,C,A,C,C,C,T,G,T,A,C,C,T,G,C,A,A,A,T,G,A,G,C,A,G,T,C,T,G,A,A,G,T,C,T,G,A,G,G,A,C,T,C,A,G,C,C,A,T,G,T,A,T,T,A,C,T,G,T,G,C,A,A,G,A,C,G,G,G,G,G,A,G,G,T,A,C,G,A,C,G,A,G,A,A,C,G,G,G,T,T,T,G,C,T,T,A,C,T,G,G,G,G,C,C,A,A,G,G,G,A,C,T,C,T,G,G,T,C,A,C,G,G,T,C,T,C,T,G,C,A,,,,,,,,,,,,,,,,,,
11404,53A_016_H-1524392-R-B10_H_H02,TCCTGTGCAGCCTCTGGAATCACTTTCAGTAGCTATGGCATGTCTT...,LateGC,HA-WT,mix,VH,1-1,HA,LateGC_HA-WT_mix_VH_1-1,query,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,SCAASGITFSSYGMSWVRQTPDKRLEWVATISNGGGSTYYPDSVKG...,False,300.0,100.0,0,0,4,0,0,0,3,0,T,C,C,T,G,T,G,C,A,G,C,C,T,C,T,G,G,A,A,T,C,A,C,T,T,T,C,A,G,T,A,G,C,T,A,T,G,G,C,A,T,G,T,C,T,T,G,G,G,T,T,C,G,C,C,A,G,A,C,T,C,C,A,G,A,C,A,A,G,A,G,G,C,T,G,G,A,G,T,G,G,G,T,C,G,C,A,A,C,C,A,T,T,A,G,T,A,A,T,G,G,T,G,G,T,G,G,T,T,C,C,A,C,C,T,A,C,T,A,T,C,C,A,G,A,C,A,G,T,G,T,G,A,A,G,G,G,G,C,G,A,T,T,C,A,C,C,A,T,C,T,C,C,A,G,A,G,A,C,A,A,T,G,C,C,A,A,G,A,A,C,A,C,C,C,T,G,T,A,C,C,T,G,C,A,A,A,T,G,A,G,C,A,G,T,C,T,G,A,A,G,T,C,T,G,A,G,G,A,C,T,C,A,G,C,C,A,T,G,T,A,T,T,A,T,T,G,T,G,C,A,A,G,C,C,G,G,G,A,G,A,G,G,T,A,C,G,A,C,G,A,G,A,A,C,G,G,G,T,T,T,G,C,T,T,A,C,T,G,G,G,G,C,C,A,A,G,G,G,A,C,T,C,T,G,G,T,C,A,C,G,G,T,C,T,C,T,G,C,A,,,,,,,,,,,,,,,,,,


In [25]:
def expand_aa(df_initial):
    df = df_initial.copy()
    max_len_aa = int(df['len_aa'].max())
    
    # Expand 'seq_aa' into individual characters with padding if needed
    aa_data = df['seq_aa'].apply(lambda x: list(x.ljust(max_len_aa, ' '))).tolist()
    
    # Create a DataFrame for the new A columns
    df_aa = pd.DataFrame(aa_data, columns=['A{}'.format(i) for i in range(max_len_aa)], index=df.index)
    
    # Concatenate the original df with the new A columns
    df = pd.concat([df, df_aa], axis=1)
    
    # Optionally replace spaces with NaN (if needed)
    df.replace(' ', pd.NA, inplace=True)
    
    return df

In [26]:
dfs_expanded_aas=expand_aa(dfs)
dfs_expanded_aas.to_csv('{}/dfs_expanded_aas.tsv'.format(output_folder_prep), sep = '\t', index=False)

In [27]:
dfs_expanded_aas_cleaned=df_clean_up(dfs_expanded_aas, 'include')
dfs_expanded_aas_cleaned.reset_index(inplace=True, drop=True)
dfs_expanded_aas_cleaned.to_csv('{}/dfs_expanded_aas_included.tsv'.format(output_folder_prep), sep = '\t', index=False)

In [28]:
dfs_expanded_aas_cleaned=df_clean_up(dfs_expanded_aas)
dfs_expanded_aas_cleaned.reset_index(inplace=True, drop=True)
dfs_expanded_aas_cleaned.to_csv('{}/dfs_expanded_aas_excluded.tsv'.format(output_folder_prep), sep = '\t', index=False)

In [30]:
df_stats_seq=pd.read_csv(f"../../D/data/output/df_stats_seq.tsv", sep='\t',header=0, low_memory=False)
df_stats_seq.set_index(['status', 'mouse', 'dataset', 'chain', 'sub_dataset'], inplace=True)
df_stats_seq

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,total,no_alignment,no_locus,no_locus_consensus,stopcodons,10X_chain_filtering,10X_Vgene_filtering,10X_Jgene_filtering,no_alignment_c,no_locus_consensus_c,stopcodon_c,frameshift_c,length_0.7_ref_c,final_c
status,mouse,dataset,chain,sub_dataset,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
EarlyGC,B18-383,APC,VH,-,312,27.0,3.0,,95.0,0.0,0.0,0.0,8.0,,,46.0,25.0,108.0
EarlyGC,B18-383,APC,VL,-,255,4.0,,,23.0,0.0,0.0,0.0,,,,135.0,4.0,89.0
EarlyGC,B18-383,CGG,VH,-,128,4.0,,,39.0,0.0,0.0,0.0,5.0,,,26.0,8.0,46.0
EarlyGC,B18-383,CGG,VL,-,134,32.0,,,18.0,0.0,0.0,0.0,10.0,,,19.0,14.0,41.0
EarlyGC,B18-383,OVA,VH,-,158,,1.0,,55.0,0.0,0.0,0.0,,,,35.0,29.0,38.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Unimmunized,HA-uMT,PP,VL,rep2,3148,,,,,15.0,2.0,0.0,,,,,,3131.0
Unimmunized,HA-uMT,SPL,VH,rep1,1719,,,,,7.0,0.0,0.0,,,,,,1712.0
Unimmunized,HA-uMT,SPL,VH,rep2,9277,,,,,15.0,3.0,0.0,,,,,,9259.0
Unimmunized,HA-uMT,SPL,VL,rep1,1718,,,,,6.0,0.0,0.0,,,,,,1712.0


In [31]:
dfs_expanded_nts_included=pd.read_csv('{}/dfs_expanded_nts_included.tsv'.format(output_folder_prep), sep='\t', header=0, low_memory=False)
dfs_expanded_nts_included.reset_index(inplace=True, drop=True)

In [32]:
def update_stats_seq(current_df, col):
    global df_stats_seq
    group = ['status', 'mouse', 'dataset', 'chain', 'sub_dataset']
    grouped = '_'.join(group)
    grouping = current_df.groupby(group, dropna=False)[['status']]
    for i in grouping.count().index:
        value = grouping.count().loc[i, 'status']
        df_stats_seq.loc[i, col] = value

In [33]:
update_stats_seq(dfs_expanded_nts_included[dfs_expanded_nts_included['nt_miss']==0], 'zero_nt_missmatch')
df_stats_seq.reset_index().to_csv(f"{output_folder_prep}/df_stats_seq.tsv", sep = '\t', header=True, index=False)

  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value
  df_stats_seq.loc[i, col] = value


In [34]:
df_stats_seq

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,total,no_alignment,no_locus,no_locus_consensus,stopcodons,10X_chain_filtering,10X_Vgene_filtering,10X_Jgene_filtering,no_alignment_c,no_locus_consensus_c,stopcodon_c,frameshift_c,length_0.7_ref_c,final_c,zero_nt_missmatch
status,mouse,dataset,chain,sub_dataset,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
EarlyGC,B18-383,APC,VH,-,312.0,27.0,3.0,,95.0,0.0,0.0,0.0,8.0,,,46.0,25.0,108.0,14.0
EarlyGC,B18-383,APC,VL,-,255.0,4.0,,,23.0,0.0,0.0,0.0,,,,135.0,4.0,89.0,1.0
EarlyGC,B18-383,CGG,VH,-,128.0,4.0,,,39.0,0.0,0.0,0.0,5.0,,,26.0,8.0,46.0,1.0
EarlyGC,B18-383,CGG,VL,-,134.0,32.0,,,18.0,0.0,0.0,0.0,10.0,,,19.0,14.0,41.0,2.0
EarlyGC,B18-383,OVA,VH,-,158.0,,1.0,,55.0,0.0,0.0,0.0,,,,35.0,29.0,38.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Unimmunized,HA-uMT,MLN,VL,reps,,,,,,,,,,,,,,,10089.0
Unimmunized,HA-uMT,PP,VH,reps,,,,,,,,,,,,,,,4339.0
Unimmunized,HA-uMT,PP,VL,reps,,,,,,,,,,,,,,,4358.0
Unimmunized,HA-uMT,SPL,VH,reps,,,,,,,,,,,,,,,10907.0
