In [1]:
# Python script for analyses of "Antibody affinity birth through somatic hypermutation" publication.
# This pipeline is divided into X sections. At the beginning of each section there is a comment which indicates which figures of the publication are generated based on that section.

# input sequences for these analyses are uploaded in data folder. By a successful run, the result of each section will be saved in output folder.
print('Running...')
import re
import operator

import os
#import sys
import pandas as pd
import numpy as np

import time
import itertools
import matplotlib.pyplot as plt
import glob
#import logomaker #https://logomaker.readthedocs.io

# Functions
def display_big():

    # df = pd.DataFrame()
    # pd.options.display.max_colwidth = 2000
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

display_big()

Running...


In [2]:
data_folder='../data'

input_folder = os.getenv('VAR_IN_FOLDER', f"{data_folder}/input")
output_folder = os.getenv('VAR_OUT_FOLDER', f"{data_folder}/output")

In [3]:
files_List=glob.glob(f"../../C/data/output/*_igblast.tsv")

output_folder = f"{output_folder}"
sect_out_folder=f"{output_folder}"

os.makedirs(output_folder, exist_ok=True)

In [4]:
files_List

['../../C/data/output/Unimmunized_B18-383_SPL_VH_rep2_igblast.tsv',
 '../../C/data/output/Published_B18_Passenger_VH_-_igblast.tsv',
 '../../C/data/output/LateGC_HA-WT_APC_VH_1-100_igblast.tsv',
 '../../C/data/output/Unimmunized_HA-uMT_PP_VH_rep2_igblast.tsv',
 '../../C/data/output/LateGC_HA-WT_CGG-Isotype_VL_1-1000_igblast.tsv',
 '../../C/data/output/LateGC_HA-uMT_OVA-Isotype_VH_0-1_igblast.tsv',
 '../../C/data/output/LateGC_HA-WT_CGG_VL_1-100_igblast.tsv',
 '../../C/data/output/EarlyGC_HA-uMT_APC_VL_0-1_igblast.tsv',
 '../../C/data/output/LateGC_HA-WT_OVA_VH_1-1000_igblast.tsv',
 '../../C/data/output/EarlyGC_HA-uMT_CGG_VH_0-1_igblast.tsv',
 '../../C/data/output/EarlyGC_B18-383_APC_VH_-_igblast.tsv',
 '../../C/data/output/EarlyGC_HA-uMT_OVA_VH_0-1_igblast.tsv',
 '../../C/data/output/Unimmunized_B18-383_SPL_VL_rep2_igblast.tsv',
 '../../C/data/output/LateGC_HA-WT_APC_VH_1-1_igblast.tsv',
 '../../C/data/output/EarlyGC_B18-383_OVA_VH_-_igblast.tsv',
 '../../C/data/output/Unimmunized_B18-

In [5]:
mouse_DB_dic={'B18-383':'B18', 'B18':'B18', 'HA-uMT':'HA', 'HA-WT':'HA', 'HA':'HA'}
locus_syno_dic = {'IGK':'VL', 'IGH':'VH', 'IGL':'VL', 'NA':'NA'}

In [6]:
dfs_ref=pd.DataFrame(columns=['type', 'mouse_DB', 'chain', 'header', 'seq'])
dfs_ref

seq='GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCACCAGCTACTGGATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTGGAAGGATTGATCCTAATAGTGGTGGTACTAAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACAAACCCTCCAGCACAGCCTACATGCAGCTCAGCAGCCTGACATCTGAGGACTCTGCGGTCTATTATTGCGCAAGATACGATTACTACGGTAGTAGCTACTTTGACTACTGGGGCCAAGGCACCACTCTCACAGTCTCCTCA'
dfs_ref.loc[len(dfs_ref)]=['References', 'B18', 'VH', '>B18-383-VH_ref', seq]
seq='TCTCTAGGGGAACGGGTCACCATGACCTGCACTGCCAGCTCAAGTGTAAGTTCCAGTTACTTGCACTGGTTCCAGCAGAAGCCAGGATCCTCCCCCAAACTCTGGATTTATAGCACATCCAACCTGGCTTCTGGAGTCCCAGGTCGCTTCAGTGGCAGTGGGTCTGAGACCTCTTACTCTCTCACAATCGGCAGCATGGAGGCTGAAGATGCTGCCACTTATTACTGCCACCAGTATCATCGTTCCCCACCCACGTTCGGGGGGGGGACCAAGCTGGAAATAAAA'
dfs_ref.loc[len(dfs_ref)]=['References', 'B18', 'VL', '>B18-383-VL_ref', seq]
seq='TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCGCAACCATTAGTAATGGTGGTGGTTACACCTACTATCCAGACAGTGTGAAGGGGCGATTCACCATCTCCAGAGACAATGCCAAGAACACCCTGTACCTGCAAATGAGCAGTCTGAAGTCTGAGGACTCAGCCATGTATTACTGTGCAAGACGGGAGAGGTACGACGAGAACGGGTTTGCTTACTGGGGCCAAGGGACTCTGGTCACGGTCTCTGCA'
dfs_ref.loc[len(dfs_ref)]=['References', 'HA', 'VH', '>HA-VH_ref', seq]
seq='GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAAAGCAAAAGAATTACTTGACCTGGTACCAGCAGAAACCAGGACAGCCTCCTAAAGTGTTGATCTACTGGGCATCCACTAGGGAATCTGGGGTCCCTGATCGCTTCACAGGCAGTGGATCTGGAACAGATTTCACTCTCACCATCAGCAGTGTGCAGGCTGAAGACCTGGCAGTTTATTACTGTCAGAATGATTATAGTAATCCGCTCACGTTCGGTGGTGGGACCAAGCTGGAGCTGAAA'
dfs_ref.loc[len(dfs_ref)]=['References', 'HA', 'VL', '>HA-VL_ref', seq]
dfs_ref

Unnamed: 0,type,mouse_DB,chain,header,seq
0,References,B18,VH,>B18-383-VH_ref,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...
1,References,B18,VL,>B18-383-VL_ref,TCTCTAGGGGAACGGGTCACCATGACCTGCACTGCCAGCTCAAGTG...
2,References,HA,VH,>HA-VH_ref,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...
3,References,HA,VL,>HA-VL_ref,GTCACTATGAGCTGCACGTCCAGTCAGAGTCTGTTTAACAGTGGAA...


In [7]:
dfs=pd.DataFrame()
for file in files_List:

    labels = file.split('/')[-1].split('_igblast.tsv')[0].split('_')[:5]
    status, mouse, dataset, chain, sub_dataset = labels
    label= '_'.join(labels)

    df=pd.read_csv(file, sep='\t',header=0, low_memory=False)
    df['status']=status
    df['mouse']=mouse
    df['dataset']=dataset
    df['chain']=chain
    df['sub_dataset']=sub_dataset
    df['mouse_DB']=mouse_DB_dic[mouse]

    # print(set(df['chain']))
    # print(chain, locus)
    if status == 'Unimmunized':
        if len(set(df['locus'])) > 1: 
            print('ATTENTION !!!!!!!!!!!', label)
            print(df.groupby('locus', dropna=False).count()['sequence'])
            raise Exception("problem!")

        grouping = df.groupby('locus', dropna=False)
        for grouped, df_grouped in grouping:
            #print(grouped, len(df_grouped))
            # df_grouped['chain'] = locus_syno_dic[grouped]
            df_grouped['sequencing'] = 'sc-vdj'
            dfs = pd.concat([dfs, df_grouped], axis=0)
            
    #display(df)
    #print(status, mouse, dataset, len(df))
    else:
        
        if status == 'Published':
            df['sequencing'] = 'deep'

        else:
            df['sequencing'] = 'sanger'
            
        dfs = pd.concat([dfs, df], axis=0)

In [8]:
set(dfs['chain'])

{'VH', 'VL'}

In [9]:
set(dfs['sequencing'])

{'deep', 'sanger', 'sc-vdj'}

In [10]:
set(dfs['locus'])

{'IGH', 'IGK', nan}

In [11]:
dfs['label'] = dfs[['status', 'mouse', 'dataset', 'chain', 'sub_dataset']].agg('_'.join, axis=1)

dfs.reset_index(drop=True, inplace=True)
dfs #174415

Unnamed: 0,sequence_id,sequence,sequence_aa,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,d_frame,v_call,d_call,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,status,mouse,dataset,chain,sub_dataset,mouse_DB,sequencing,label
0,AAACCATTCGGGCTTG-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2
1,AAACCATTCGTCCTGC-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2
2,AAACCCCAGAAACGGT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2
3,AAACCCCAGCCAGCAT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2
4,AAACCCCAGCGTAGCT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97868,52A_006_H-1406143-R-OVAGC-A9_H_F01,GGAGAGACTTAGTGAAGCCTGGAGGGTCCCTGAAACTCTCCTGTGC...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGCAGCCTCTGGATTTTTTTTCTTTACCTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGCAGCCTCTGGATTTTTTTTCTTTACCTATGGCATGTCTT...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGCAGCCTCT,SCAAS,GGATTTTTTTTCTTTACCTATGGC,GFFFFTYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,TTAGTTATGGTGGTGGTTACATT,ISYGGGYI,TACTATTCAGACAGTGTCAAGGGACGATTCACCATCTCCAGAGACA...,YYSDSVKGRFTISRDNAKNTLYLRMSSLKSEDSAMYYCARRERYDE...,,,,,,,,,422.526,,,38S300M1S,,,1.130000e-122,,,95.000,,,39.0,338.0,1.0,300.0,,,,,,,,,39.0,53.0,54.0,77.0,78.0,129.0,130.0,152.0,153.0,338.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1
97869,52A_024_H-1406143-L-OVAGC-C7_H_H03,GAGGATCCCTCCAACTCTCCTGTGTAGCCTCTGGATTCACTTTCAG...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGTAGCCTCTGGATTCACTTTCAGTAGCTATGGCTTGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGTAGCCTCTGGATTCACTTTCAGTAGCTATGGCTTGTCTT...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGTAGCCTCT,SCVAS,GGATTCACTTTCAGTAGCTATGGC,GFTFSSYG,TTGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCCGGAATGGGTCG...,LSWVRQTPDKRPEWVAT,TTAGTTATGGTGTTGGTTATACT,ISYGVGYT,TTCTATTTAGACAGTGTGAAGGGGCGATTCACCATCTCCAGAGACA...,FYLDSVKGRFTISRDNAKNTLYLQMSSLKSEDSAMYYCTRREKYDE...,,,,,,,,,419.410,,,17S300M1S,,,9.219000e-122,,,94.667,,,18.0,317.0,1.0,300.0,,,,,,,,,18.0,32.0,33.0,56.0,57.0,108.0,109.0,131.0,132.0,317.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1
97870,53A_026_H-1443862-L-OVAGC-C8_H_B04,GGGAGTCCTAGTGCAGCCTGGAGGCTCCCTGTCCCTCTCCTGTGCA...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTGTTTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTGTTTATGGCATGTCTT...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGCAGCCTCT,SCAAS,GGATTCACTTTCAGTGTTTATGGC,GFTFSVYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,TTAGTTATGGTGTTGGTTACACC,ISYGVGYT,TTCTATGCAGACAATTTGAAGGGGCGTTTCACCATCTCCAGAGACA...,FYADNLKGRFTISRDNAKNTLYLQMNSLKSEDSAMYYCTRRERFNE...,,,,,,,,,419.410,,,37S300M1S,,,9.765000e-122,,,94.667,,,38.0,337.0,1.0,300.0,,,,,,,,,38.0,52.0,53.0,76.0,77.0,128.0,129.0,151.0,152.0,337.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1
97871,52A_029_H-1406143-R-OVAGC-C11_H_E04,AGCCTGGAGGGTCCCTGAAACTCTCCTGTGTTGCCTCTGGATTCAC...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGTTGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGTTGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGTTGCCTCT,SCVAS,GGATTCACTTTCAGTACCTATGGC,GFTFSTYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,GTAGTTATGGTGTTGGTTACACC,SSYGVGYT,CTCTATTCAGACAATGTGAAGGGGCGGTTCACCATCTCCAGAGACA...,LYSDNVKGRFTISRDNAKNTLYLQMSGLKSEDSAIYYCSRRGWYGE...,,,,,,,,,394.480,,,23S300M1S,,,3.005000e-114,,,92.000,,,24.0,323.0,1.0,300.0,,,,,,,,,24.0,38.0,39.0,62.0,63.0,114.0,115.0,137.0,138.0,323.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1


In [12]:
grouping=dfs.groupby(by=['status', 'mouse', 'dataset', 'chain', 'sub_dataset'])

for grouped, df in grouping:
    suffix='_'.join(grouped)
    df.reset_index(drop=True, inplace=True)
    print(suffix, len(df))

EarlyGC_B18-383_APC_VH_- 187
EarlyGC_B18-383_APC_VL_- 228
EarlyGC_B18-383_CGG_VH_- 85
EarlyGC_B18-383_CGG_VL_- 84
EarlyGC_B18-383_OVA_VH_- 102
EarlyGC_B18-383_OVA_VL_- 135
EarlyGC_HA-uMT_APC_VH_0-1 93
EarlyGC_HA-uMT_APC_VL_0-1 124
EarlyGC_HA-uMT_CGG_VH_0-1 53
EarlyGC_HA-uMT_CGG_VL_0-1 168
EarlyGC_HA-uMT_OVA_VH_0-1 58
EarlyGC_HA-uMT_OVA_VL_0-1 84
LateGC_B18-383_APC_VH_- 95
LateGC_B18-383_APC_VL_- 112
LateGC_B18-383_CGG_VH_- 49
LateGC_B18-383_CGG_VL_- 79
LateGC_B18-383_OVA_VH_- 146
LateGC_B18-383_OVA_VL_- 218
LateGC_B18-383_OVA-CTLA4_VH_- 125
LateGC_B18-383_OVA-CTLA4_VL_- 181
LateGC_B18-383_OVA-Isotype_VH_- 159
LateGC_B18-383_OVA-Isotype_VL_- 286
LateGC_HA-WT_APC_VH_1-1 30
LateGC_HA-WT_APC_VH_1-100 79
LateGC_HA-WT_APC_VH_1-1000 27
LateGC_HA-WT_APC_VL_1-1 62
LateGC_HA-WT_APC_VL_1-100 157
LateGC_HA-WT_APC_VL_1-1000 58
LateGC_HA-WT_CGG_VH_1-1 48
LateGC_HA-WT_CGG_VH_1-100 66
LateGC_HA-WT_CGG_VH_1-1000 64
LateGC_HA-WT_CGG_VL_1-1 94
LateGC_HA-WT_CGG_VL_1-100 109
LateGC_HA-WT_CGG_VL_1-1000 132


In [13]:
df_stats_seq=pd.read_csv(f"../../B/data/output/df_stats_seq.tsv", sep='\t',header=0, low_memory=False)
df_stats_seq.set_index(['status', 'mouse', 'dataset', 'chain', 'sub_dataset'], inplace=True)
df_stats_seq

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,total,no_alignment,no_locus,no_locus_consensus,stopcodons,10X_chain_filtering,10X_Vgene_filtering,10X_Jgene_filtering
status,mouse,dataset,chain,sub_dataset,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
EarlyGC,B18-383,APC,VH,-,312,27.0,3.0,,95.0,0.0,0.0,0.0
EarlyGC,B18-383,APC,VL,-,255,4.0,,,23.0,0.0,0.0,0.0
EarlyGC,B18-383,CGG,VH,-,128,4.0,,,39.0,0.0,0.0,0.0
EarlyGC,B18-383,CGG,VL,-,134,32.0,,,18.0,0.0,0.0,0.0
EarlyGC,B18-383,OVA,VH,-,158,,1.0,,55.0,0.0,0.0,0.0
EarlyGC,B18-383,OVA,VL,-,157,,,,22.0,0.0,0.0,0.0
EarlyGC,HA-uMT,APC,VH,0-1,160,57.0,,,10.0,0.0,0.0,0.0
EarlyGC,HA-uMT,APC,VL,0-1,159,10.0,,,25.0,0.0,0.0,0.0
EarlyGC,HA-uMT,CGG,VH,0-1,65,1.0,,,11.0,0.0,0.0,0.0
EarlyGC,HA-uMT,CGG,VL,0-1,206,1.0,,,37.0,0.0,0.0,0.0


In [14]:
def update_stats_seq(current_df, col):
    global df_stats_seq
    group = ['status', 'mouse', 'dataset', 'chain', 'sub_dataset']
    grouped = '_'.join(group)
    grouping = current_df.groupby(group, dropna=False)[['sequence_id']]
    for i in grouping.count().index:
        value = grouping.count().loc[i, 'sequence_id']
        df_stats_seq.loc[i, col] = value

### Discard sequences with no alignment

In [15]:
len(dfs[dfs['sequence_alignment'].isna()])

42

In [16]:
update_stats_seq(dfs[dfs['sequence_alignment'].isna()], 'no_alignment_c')
df_stats_seq

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,total,no_alignment,no_locus,no_locus_consensus,stopcodons,10X_chain_filtering,10X_Vgene_filtering,10X_Jgene_filtering,no_alignment_c
status,mouse,dataset,chain,sub_dataset,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
EarlyGC,B18-383,APC,VH,-,312,27.0,3.0,,95.0,0.0,0.0,0.0,8.0
EarlyGC,B18-383,APC,VL,-,255,4.0,,,23.0,0.0,0.0,0.0,
EarlyGC,B18-383,CGG,VH,-,128,4.0,,,39.0,0.0,0.0,0.0,5.0
EarlyGC,B18-383,CGG,VL,-,134,32.0,,,18.0,0.0,0.0,0.0,10.0
EarlyGC,B18-383,OVA,VH,-,158,,1.0,,55.0,0.0,0.0,0.0,
EarlyGC,B18-383,OVA,VL,-,157,,,,22.0,0.0,0.0,0.0,2.0
EarlyGC,HA-uMT,APC,VH,0-1,160,57.0,,,10.0,0.0,0.0,0.0,1.0
EarlyGC,HA-uMT,APC,VL,0-1,159,10.0,,,25.0,0.0,0.0,0.0,1.0
EarlyGC,HA-uMT,CGG,VH,0-1,65,1.0,,,11.0,0.0,0.0,0.0,3.0
EarlyGC,HA-uMT,CGG,VL,0-1,206,1.0,,,37.0,0.0,0.0,0.0,


In [17]:
dfs.dropna(subset=['sequence_alignment'], inplace=True)

In [18]:
len(dfs[dfs['sequence_alignment'].isna()])

0

### Discard sequences with no locus annotation

In [19]:
len(dfs[dfs['locus'].isna()])

0

In [20]:
update_stats_seq(dfs[dfs['locus'].isna()], 'no_locus_c')
df_stats_seq

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,total,no_alignment,no_locus,no_locus_consensus,stopcodons,10X_chain_filtering,10X_Vgene_filtering,10X_Jgene_filtering,no_alignment_c
status,mouse,dataset,chain,sub_dataset,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
EarlyGC,B18-383,APC,VH,-,312,27.0,3.0,,95.0,0.0,0.0,0.0,8.0
EarlyGC,B18-383,APC,VL,-,255,4.0,,,23.0,0.0,0.0,0.0,
EarlyGC,B18-383,CGG,VH,-,128,4.0,,,39.0,0.0,0.0,0.0,5.0
EarlyGC,B18-383,CGG,VL,-,134,32.0,,,18.0,0.0,0.0,0.0,10.0
EarlyGC,B18-383,OVA,VH,-,158,,1.0,,55.0,0.0,0.0,0.0,
EarlyGC,B18-383,OVA,VL,-,157,,,,22.0,0.0,0.0,0.0,2.0
EarlyGC,HA-uMT,APC,VH,0-1,160,57.0,,,10.0,0.0,0.0,0.0,1.0
EarlyGC,HA-uMT,APC,VL,0-1,159,10.0,,,25.0,0.0,0.0,0.0,1.0
EarlyGC,HA-uMT,CGG,VH,0-1,65,1.0,,,11.0,0.0,0.0,0.0,3.0
EarlyGC,HA-uMT,CGG,VL,0-1,206,1.0,,,37.0,0.0,0.0,0.0,


In [21]:
dfs.dropna(subset=['locus'], inplace=True)

In [22]:
len(dfs[dfs['locus'].isna()])

0

In [23]:
set(dfs['locus'])

{'IGH', 'IGK'}

### Filter based on locus consensus between "locus" and "v_call" and "c_call" if available

In [24]:
dfs['locus_v_call'] = dfs['v_call'].apply(lambda x: x.split('V')[0] if isinstance(x, str) and 'V' in x else x)

In [25]:
def determine_value(row):
    values = [row['locus'], row['locus_v_call']]
    non_nan_values = [v for v in values if not pd.isna(v)]
    
    if not non_nan_values:
        return 'NA'
    
    if len(non_nan_values) == 1:
        return non_nan_values[0]
    
    unique_values = set(non_nan_values)
    
    if len(unique_values) == 1:
        return unique_values.pop()
    else:
        return 'disagree'

# Apply the function to each row
dfs['locus_consensus'] = dfs.apply(determine_value, axis=1)
len(dfs[dfs['locus_consensus']=='disagree'])

0

In [26]:
print(set(dfs['locus'].apply(lambda x: x if pd.notna(x) else 'NaN')))
print(set(dfs['locus_v_call'].apply(lambda x: x if pd.notna(x) else 'NaN')))
print(set(dfs['locus_consensus']))

{'IGH', 'IGK'}
{'IGH', 'IGK'}
{'IGH', 'IGK'}


In [27]:
# Passenger dataset should be IGH only
dfs.loc[(dfs['status'] == 'Published') & (dfs['locus'] != 'IGH'), 'locus_consensus'] = 'disagree'

In [28]:
grouping = dfs.groupby('locus_consensus', dropna=False)[['sequence_id']]
grouping.count()

Unnamed: 0_level_0,sequence_id
locus_consensus,Unnamed: 1_level_1
IGH,54384
IGK,43446
disagree,1


In [29]:
dfs[dfs['locus_consensus']=='disagree']

Unnamed: 0,sequence_id,sequence,sequence_aa,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,d_frame,v_call,d_call,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,status,mouse,dataset,chain,sub_dataset,mouse_DB,sequencing,label,locus_v_call,locus_consensus
11558,SRR2229673.377,ACAAAAATACTGATGGCAGTCGGCGTGTGAATCATTAGCCTTGCGA...,GSRRVNH*PCDPRQQEPYDQYHENSHAKHWDYH,IGK,T,,,F,F,F,,IGKV_B18,,,TGGCAGTCGGCGTGTGAATCATTAGCCTTGCGACCCTCGGCAGCA,TGGCAGTGGGTCTGAGACCTCTTACTCTCTC-ACAATCGGCAGCA,GSRRVNH*PCDPRQ,GSGSETSYSLTIGS,1.0,45.0,,,,,TGGCAGTCGGCGTGTGAATCATTAGCCTTGCGACCCTCGGCAGCA,GSRRVNH*PCDPRQ,TGGCAGTGGGTCTGAGACCTCTTACTCTCTC-ACAATCGGCAGCA,GSGSETSYSLTIGS,,,,,,,,,,,,,,,,,TGGCAGTCGGCGTGTGAATCATTAGCCTTGCGACCCTCGGCAGCA,GSRRVNH*PCDPRQ,,,,,,,,,18.976,,,13S152N31M1I13M56S89N,,,0.11,,,66.667,,,14.0,58.0,153.0,196.0,,,,,,,,,,,,,,,,,14.0,58.0,,,,,,,,,Published,B18,Passenger,VH,-,B18,deep,Published_B18_Passenger_VH_-,IGK,disagree


In [30]:
update_stats_seq(dfs[dfs['locus_consensus']=='disagree'], 'no_locus_consensus_c')
df_stats_seq

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,total,no_alignment,no_locus,no_locus_consensus,stopcodons,10X_chain_filtering,10X_Vgene_filtering,10X_Jgene_filtering,no_alignment_c,no_locus_consensus_c
status,mouse,dataset,chain,sub_dataset,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
EarlyGC,B18-383,APC,VH,-,312,27.0,3.0,,95.0,0.0,0.0,0.0,8.0,
EarlyGC,B18-383,APC,VL,-,255,4.0,,,23.0,0.0,0.0,0.0,,
EarlyGC,B18-383,CGG,VH,-,128,4.0,,,39.0,0.0,0.0,0.0,5.0,
EarlyGC,B18-383,CGG,VL,-,134,32.0,,,18.0,0.0,0.0,0.0,10.0,
EarlyGC,B18-383,OVA,VH,-,158,,1.0,,55.0,0.0,0.0,0.0,,
EarlyGC,B18-383,OVA,VL,-,157,,,,22.0,0.0,0.0,0.0,2.0,
EarlyGC,HA-uMT,APC,VH,0-1,160,57.0,,,10.0,0.0,0.0,0.0,1.0,
EarlyGC,HA-uMT,APC,VL,0-1,159,10.0,,,25.0,0.0,0.0,0.0,1.0,
EarlyGC,HA-uMT,CGG,VH,0-1,65,1.0,,,11.0,0.0,0.0,0.0,3.0,
EarlyGC,HA-uMT,CGG,VL,0-1,206,1.0,,,37.0,0.0,0.0,0.0,,


In [31]:
dfs=dfs[dfs['locus_consensus']!='disagree'].copy()
len(dfs)

97830

In [32]:
grouping=dfs.groupby(by=['status', 'mouse', 'dataset', 'chain', 'sub_dataset'])

for grouped, df in grouping:
    suffix='_'.join(grouped)
    df.reset_index(drop=True, inplace=True)
    print(suffix, len(df))

EarlyGC_B18-383_APC_VH_- 179
EarlyGC_B18-383_APC_VL_- 228
EarlyGC_B18-383_CGG_VH_- 80
EarlyGC_B18-383_CGG_VL_- 74
EarlyGC_B18-383_OVA_VH_- 102
EarlyGC_B18-383_OVA_VL_- 133
EarlyGC_HA-uMT_APC_VH_0-1 92
EarlyGC_HA-uMT_APC_VL_0-1 123
EarlyGC_HA-uMT_CGG_VH_0-1 50
EarlyGC_HA-uMT_CGG_VL_0-1 168
EarlyGC_HA-uMT_OVA_VH_0-1 51
EarlyGC_HA-uMT_OVA_VL_0-1 81
LateGC_B18-383_APC_VH_- 95
LateGC_B18-383_APC_VL_- 112
LateGC_B18-383_CGG_VH_- 49
LateGC_B18-383_CGG_VL_- 79
LateGC_B18-383_OVA_VH_- 146
LateGC_B18-383_OVA_VL_- 218
LateGC_B18-383_OVA-CTLA4_VH_- 125
LateGC_B18-383_OVA-CTLA4_VL_- 181
LateGC_B18-383_OVA-Isotype_VH_- 159
LateGC_B18-383_OVA-Isotype_VL_- 286
LateGC_HA-WT_APC_VH_1-1 30
LateGC_HA-WT_APC_VH_1-100 79
LateGC_HA-WT_APC_VH_1-1000 27
LateGC_HA-WT_APC_VL_1-1 62
LateGC_HA-WT_APC_VL_1-100 157
LateGC_HA-WT_APC_VL_1-1000 58
LateGC_HA-WT_CGG_VH_1-1 48
LateGC_HA-WT_CGG_VH_1-100 66
LateGC_HA-WT_CGG_VH_1-1000 64
LateGC_HA-WT_CGG_VL_1-1 94
LateGC_HA-WT_CGG_VL_1-100 109
LateGC_HA-WT_CGG_VL_1-1000 132


### Discard sequences with stop codon except passenger dataset

In [33]:
dfs.loc[(dfs['status'] != 'Published') & (dfs['sequence_alignment_aa'].str.contains(r'\*', na=False)), 'stopcodon'] = True

In [34]:
dfs.loc[(dfs['label'] == 'LateGC_B18-383_CGG_VH_-') & (dfs['stopcodon'] == True)]#['sequence_alignment_aa'].values

Unnamed: 0,sequence_id,sequence,sequence_aa,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,d_frame,v_call,d_call,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,status,mouse,dataset,chain,sub_dataset,mouse_DB,sequencing,label,locus_v_call,locus_consensus,stopcodon
95072,55C_018_B18-1212006-R-CGGGC-C1_B18_B06,TGGGGGCTTAGTGAACCTCTCCTGCAAGGCTTCTGGCTACACGTTC...,GA**TSPARLLATRSPTSGYTG*SRGLDEALSGLDGLILMVVVLDM...,IGH,T,,,F,F,F,,IGHV_B18,,,GGGGCTT-AGTGAACCTCTCCTGCAAGGCTTCTGGCTACACGTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GA**TSPARLLATRSPTSGYTG*SRGLDEALSGLDGLILMVVVLDM...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTT-AGTGAACCTCTCCTGCAAGGCTTCTGGCTACACGTTCA...,GA**TSPARLLATRSPTSGYTG*SRGLDEALSGLDGLILMVVVLDM...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTAGTGAACCTCTCCTGCAAGGCTTCT,GA**TSPARLL,GGCTACACGTTCACCAACTTCTGG,ATRSPTSG,ATACACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,YTG*SRGLDEALSGLDG,ATTAATCCTAATGGTGGTGGTACT,LILMVVVL,AGATATGGGGAGAAGTTCAAGAACAGGGCCACACTGACTGTAGACA...,DMGRSSRTGPH*L*TYRPVQPTCSSAA*HLRTLRSIIAQHTTIILT...,,,,,,,,,375.783,,,2S7M1D244M6D60M1S,,,1.276e-108,,,88.994,,,3.0,313.0,1.0,318.0,,,,,,,,,3.0,34.0,35.0,58.0,59.0,109.0,110.0,133.0,134.0,313.0,,,,,,,,,LateGC,B18-383,CGG,VH,-,B18,sanger,LateGC_B18-383_CGG_VH_-,IGH,IGH,True


In [35]:
set(dfs.loc[(dfs['stopcodon'] == True)]['label'])

{'LateGC_B18-383_CGG_VH_-',
 'LateGC_B18-383_CGG_VL_-',
 'LateGC_B18-383_OVA-CTLA4_VL_-',
 'LateGC_B18-383_OVA-Isotype_VH_-',
 'LateGC_B18-383_OVA_VH_-',
 'LateGC_B18-383_OVA_VL_-',
 'LateGC_HA-WT_APC_VL_1-100',
 'LateGC_HA-WT_APC_VL_1-1000',
 'LateGC_HA-WT_CGG-CTLA4_VL_1-1000',
 'LateGC_HA-WT_CGG_VL_1-1',
 'LateGC_HA-WT_CGG_VL_1-100',
 'LateGC_HA-WT_CGG_VL_1-1000',
 'LateGC_HA-WT_OVA_VH_1-100',
 'LateGC_HA-WT_OVA_VL_1-1',
 'LateGC_HA-WT_OVA_VL_1-100',
 'LateGC_HA-WT_OVA_VL_1-1000',
 'LateGC_HA-uMT_CGG_VH_0-1'}

In [36]:
update_stats_seq(dfs[dfs['stopcodon']==True], 'stopcodon_c')
df_stats_seq.sum()

total                   101706.0
no_alignment               192.0
no_locus                     7.0
no_locus_consensus           7.0
stopcodons                2439.0
10X_chain_filtering        978.0
10X_Vgene_filtering        205.0
10X_Jgene_filtering          5.0
no_alignment_c              42.0
no_locus_consensus_c         1.0
stopcodon_c                 38.0
dtype: float64

In [37]:
dfs=dfs[dfs['stopcodon']!=True].copy()
dfs

Unnamed: 0,sequence_id,sequence,sequence_aa,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,d_frame,v_call,d_call,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,status,mouse,dataset,chain,sub_dataset,mouse_DB,sequencing,label,locus_v_call,locus_consensus,stopcodon
0,AAACCATTCGGGCTTG-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,
1,AAACCATTCGTCCTGC-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,
2,AAACCCCAGAAACGGT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,
3,AAACCCCAGCCAGCAT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,
4,AAACCCCAGCGTAGCT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97868,52A_006_H-1406143-R-OVAGC-A9_H_F01,GGAGAGACTTAGTGAAGCCTGGAGGGTCCCTGAAACTCTCCTGTGC...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGCAGCCTCTGGATTTTTTTTCTTTACCTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGCAGCCTCTGGATTTTTTTTCTTTACCTATGGCATGTCTT...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGCAGCCTCT,SCAAS,GGATTTTTTTTCTTTACCTATGGC,GFFFFTYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,TTAGTTATGGTGGTGGTTACATT,ISYGGGYI,TACTATTCAGACAGTGTCAAGGGACGATTCACCATCTCCAGAGACA...,YYSDSVKGRFTISRDNAKNTLYLRMSSLKSEDSAMYYCARRERYDE...,,,,,,,,,422.526,,,38S300M1S,,,1.130000e-122,,,95.000,,,39.0,338.0,1.0,300.0,,,,,,,,,39.0,53.0,54.0,77.0,78.0,129.0,130.0,152.0,153.0,338.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,
97869,52A_024_H-1406143-L-OVAGC-C7_H_H03,GAGGATCCCTCCAACTCTCCTGTGTAGCCTCTGGATTCACTTTCAG...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGTAGCCTCTGGATTCACTTTCAGTAGCTATGGCTTGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGTAGCCTCTGGATTCACTTTCAGTAGCTATGGCTTGTCTT...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGTAGCCTCT,SCVAS,GGATTCACTTTCAGTAGCTATGGC,GFTFSSYG,TTGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCCGGAATGGGTCG...,LSWVRQTPDKRPEWVAT,TTAGTTATGGTGTTGGTTATACT,ISYGVGYT,TTCTATTTAGACAGTGTGAAGGGGCGATTCACCATCTCCAGAGACA...,FYLDSVKGRFTISRDNAKNTLYLQMSSLKSEDSAMYYCTRREKYDE...,,,,,,,,,419.410,,,17S300M1S,,,9.219000e-122,,,94.667,,,18.0,317.0,1.0,300.0,,,,,,,,,18.0,32.0,33.0,56.0,57.0,108.0,109.0,131.0,132.0,317.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,
97870,53A_026_H-1443862-L-OVAGC-C8_H_B04,GGGAGTCCTAGTGCAGCCTGGAGGCTCCCTGTCCCTCTCCTGTGCA...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTGTTTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTGTTTATGGCATGTCTT...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGCAGCCTCT,SCAAS,GGATTCACTTTCAGTGTTTATGGC,GFTFSVYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,TTAGTTATGGTGTTGGTTACACC,ISYGVGYT,TTCTATGCAGACAATTTGAAGGGGCGTTTCACCATCTCCAGAGACA...,FYADNLKGRFTISRDNAKNTLYLQMNSLKSEDSAMYYCTRRERFNE...,,,,,,,,,419.410,,,37S300M1S,,,9.765000e-122,,,94.667,,,38.0,337.0,1.0,300.0,,,,,,,,,38.0,52.0,53.0,76.0,77.0,128.0,129.0,151.0,152.0,337.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,
97871,52A_029_H-1406143-R-OVAGC-C11_H_E04,AGCCTGGAGGGTCCCTGAAACTCTCCTGTGTTGCCTCTGGATTCAC...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGTTGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGTTGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGTTGCCTCT,SCVAS,GGATTCACTTTCAGTACCTATGGC,GFTFSTYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,GTAGTTATGGTGTTGGTTACACC,SSYGVGYT,CTCTATTCAGACAATGTGAAGGGGCGGTTCACCATCTCCAGAGACA...,LYSDNVKGRFTISRDNAKNTLYLQMSGLKSEDSAIYYCSRRGWYGE...,,,,,,,,,394.480,,,23S300M1S,,,3.005000e-114,,,92.000,,,24.0,323.0,1.0,300.0,,,,,,,,,24.0,38.0,39.0,62.0,63.0,114.0,115.0,137.0,138.0,323.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,


## add_more_info

In [38]:
def add_more_info(row):

    seq_reference = dfs_ref[(dfs_ref['mouse_DB']==row['mouse_DB']) & (dfs_ref['chain']==row['chain'])]['seq'].values[0]
    len_reference=len(seq_reference)

    try:
        len_sequence_alignment = len(row['sequence_alignment'])
    except TypeError:
        print(row['sequence_id'])
        raise Exception("Something wrong!")

    
    len_sequence_alignment = len(row['sequence_alignment'])
    len_germline_alignment = len(row['germline_alignment'])
        
    return (seq_reference, len_reference, len_sequence_alignment, len_germline_alignment)

# Apply the function and assign the result to two new columns
dfs[['seq_reference', 'len_reference', 'len_sequence_alignment', 'len_germline_alignment']] = dfs.apply(add_more_info, axis=1, result_type='expand')

# Display the updated DataFrame
dfs

Unnamed: 0,sequence_id,sequence,sequence_aa,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,d_frame,v_call,d_call,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,status,mouse,dataset,chain,sub_dataset,mouse_DB,sequencing,label,locus_v_call,locus_consensus,stopcodon,seq_reference,len_reference,len_sequence_alignment,len_germline_alignment
0,AAACCATTCGGGCTTG-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318
1,AAACCATTCGTCCTGC-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318
2,AAACCCCAGAAACGGT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318
3,AAACCCCAGCCAGCAT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318
4,AAACCCCAGCGTAGCT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97868,52A_006_H-1406143-R-OVAGC-A9_H_F01,GGAGAGACTTAGTGAAGCCTGGAGGGTCCCTGAAACTCTCCTGTGC...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGCAGCCTCTGGATTTTTTTTCTTTACCTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGCAGCCTCTGGATTTTTTTTCTTTACCTATGGCATGTCTT...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGCAGCCTCT,SCAAS,GGATTTTTTTTCTTTACCTATGGC,GFFFFTYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,TTAGTTATGGTGGTGGTTACATT,ISYGGGYI,TACTATTCAGACAGTGTCAAGGGACGATTCACCATCTCCAGAGACA...,YYSDSVKGRFTISRDNAKNTLYLRMSSLKSEDSAMYYCARRERYDE...,,,,,,,,,422.526,,,38S300M1S,,,1.130000e-122,,,95.000,,,39.0,338.0,1.0,300.0,,,,,,,,,39.0,53.0,54.0,77.0,78.0,129.0,130.0,152.0,153.0,338.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,300,300,300
97869,52A_024_H-1406143-L-OVAGC-C7_H_H03,GAGGATCCCTCCAACTCTCCTGTGTAGCCTCTGGATTCACTTTCAG...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGTAGCCTCTGGATTCACTTTCAGTAGCTATGGCTTGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGTAGCCTCTGGATTCACTTTCAGTAGCTATGGCTTGTCTT...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGTAGCCTCT,SCVAS,GGATTCACTTTCAGTAGCTATGGC,GFTFSSYG,TTGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCCGGAATGGGTCG...,LSWVRQTPDKRPEWVAT,TTAGTTATGGTGTTGGTTATACT,ISYGVGYT,TTCTATTTAGACAGTGTGAAGGGGCGATTCACCATCTCCAGAGACA...,FYLDSVKGRFTISRDNAKNTLYLQMSSLKSEDSAMYYCTRREKYDE...,,,,,,,,,419.410,,,17S300M1S,,,9.219000e-122,,,94.667,,,18.0,317.0,1.0,300.0,,,,,,,,,18.0,32.0,33.0,56.0,57.0,108.0,109.0,131.0,132.0,317.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,300,300,300
97870,53A_026_H-1443862-L-OVAGC-C8_H_B04,GGGAGTCCTAGTGCAGCCTGGAGGCTCCCTGTCCCTCTCCTGTGCA...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTGTTTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTGTTTATGGCATGTCTT...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGCAGCCTCT,SCAAS,GGATTCACTTTCAGTGTTTATGGC,GFTFSVYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,TTAGTTATGGTGTTGGTTACACC,ISYGVGYT,TTCTATGCAGACAATTTGAAGGGGCGTTTCACCATCTCCAGAGACA...,FYADNLKGRFTISRDNAKNTLYLQMNSLKSEDSAMYYCTRRERFNE...,,,,,,,,,419.410,,,37S300M1S,,,9.765000e-122,,,94.667,,,38.0,337.0,1.0,300.0,,,,,,,,,38.0,52.0,53.0,76.0,77.0,128.0,129.0,151.0,152.0,337.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,300,300,300
97871,52A_029_H-1406143-R-OVAGC-C11_H_E04,AGCCTGGAGGGTCCCTGAAACTCTCCTGTGTTGCCTCTGGATTCAC...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGTTGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGTTGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGTTGCCTCT,SCVAS,GGATTCACTTTCAGTACCTATGGC,GFTFSTYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,GTAGTTATGGTGTTGGTTACACC,SSYGVGYT,CTCTATTCAGACAATGTGAAGGGGCGGTTCACCATCTCCAGAGACA...,LYSDNVKGRFTISRDNAKNTLYLQMSGLKSEDSAIYYCSRRGWYGE...,,,,,,,,,394.480,,,23S300M1S,,,3.005000e-114,,,92.000,,,24.0,323.0,1.0,300.0,,,,,,,,,24.0,38.0,39.0,62.0,63.0,114.0,115.0,137.0,138.0,323.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,300,300,300


In [39]:
def inspect_cigar(row):

    A = row['sequence']
    cigar = row['v_cigar']

    cigars = re.findall(r'(\d+)([A-Z])', cigar)
    
    pos = 0
    result = []
    frameshift = False
    insertions = 0
    deletions = 0
    
    for length, op in cigars:
        length = int(length)
        
        if op == 'M':  # Match/mismatch, extract this part of A
            result.append(A[pos:pos+length])
            pos += length
        
        elif op == 'N':  # Add 'N' characters to the result
            result.append('N' * length)
        
        elif op == 'I':  # Insertion, ignore (trim out)
            pos += length

            if length % 3 != 0:
                frameshift = True
            insertions+=length
            continue
        
        elif op == 'S':  # Soft clipping, do nothing
            pos += length
    
        elif op == 'D':  # Deletion, add '-' characters
            if length % 3 != 0:
                frameshift = True      
            deletions+=length
            result.append('-' * length)

    new_sequence = ''.join(result)

    return (new_sequence, len(new_sequence), insertions, deletions, frameshift)

# Apply the function and assign the result to two new columns
dfs[['new_sequence', 'len_new_sequence', 'insertions', 'deletions', 'frameshift']] = dfs.apply(inspect_cigar, axis=1, result_type='expand')

# Display the updated DataFrame
dfs

Unnamed: 0,sequence_id,sequence,sequence_aa,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,d_frame,v_call,d_call,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,status,mouse,dataset,chain,sub_dataset,mouse_DB,sequencing,label,locus_v_call,locus_consensus,stopcodon,seq_reference,len_reference,len_sequence_alignment,len_germline_alignment,new_sequence,len_new_sequence,insertions,deletions,frameshift
0,AAACCATTCGGGCTTG-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,0,0,False
1,AAACCATTCGTCCTGC-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,0,0,False
2,AAACCCCAGAAACGGT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,0,0,False
3,AAACCCCAGCCAGCAT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,0,0,False
4,AAACCCCAGCGTAGCT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97868,52A_006_H-1406143-R-OVAGC-A9_H_F01,GGAGAGACTTAGTGAAGCCTGGAGGGTCCCTGAAACTCTCCTGTGC...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGCAGCCTCTGGATTTTTTTTCTTTACCTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGCAGCCTCTGGATTTTTTTTCTTTACCTATGGCATGTCTT...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGCAGCCTCT,SCAAS,GGATTTTTTTTCTTTACCTATGGC,GFFFFTYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,TTAGTTATGGTGGTGGTTACATT,ISYGGGYI,TACTATTCAGACAGTGTCAAGGGACGATTCACCATCTCCAGAGACA...,YYSDSVKGRFTISRDNAKNTLYLRMSSLKSEDSAMYYCARRERYDE...,,,,,,,,,422.526,,,38S300M1S,,,1.130000e-122,,,95.000,,,39.0,338.0,1.0,300.0,,,,,,,,,39.0,53.0,54.0,77.0,78.0,129.0,130.0,152.0,153.0,338.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,300,300,300,TCCTGTGCAGCCTCTGGATTTTTTTTCTTTACCTATGGCATGTCTT...,300,0,0,False
97869,52A_024_H-1406143-L-OVAGC-C7_H_H03,GAGGATCCCTCCAACTCTCCTGTGTAGCCTCTGGATTCACTTTCAG...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGTAGCCTCTGGATTCACTTTCAGTAGCTATGGCTTGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGTAGCCTCTGGATTCACTTTCAGTAGCTATGGCTTGTCTT...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGTAGCCTCT,SCVAS,GGATTCACTTTCAGTAGCTATGGC,GFTFSSYG,TTGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCCGGAATGGGTCG...,LSWVRQTPDKRPEWVAT,TTAGTTATGGTGTTGGTTATACT,ISYGVGYT,TTCTATTTAGACAGTGTGAAGGGGCGATTCACCATCTCCAGAGACA...,FYLDSVKGRFTISRDNAKNTLYLQMSSLKSEDSAMYYCTRREKYDE...,,,,,,,,,419.410,,,17S300M1S,,,9.219000e-122,,,94.667,,,18.0,317.0,1.0,300.0,,,,,,,,,18.0,32.0,33.0,56.0,57.0,108.0,109.0,131.0,132.0,317.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,300,300,300,TCCTGTGTAGCCTCTGGATTCACTTTCAGTAGCTATGGCTTGTCTT...,300,0,0,False
97870,53A_026_H-1443862-L-OVAGC-C8_H_B04,GGGAGTCCTAGTGCAGCCTGGAGGCTCCCTGTCCCTCTCCTGTGCA...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTGTTTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTGTTTATGGCATGTCTT...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGCAGCCTCT,SCAAS,GGATTCACTTTCAGTGTTTATGGC,GFTFSVYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,TTAGTTATGGTGTTGGTTACACC,ISYGVGYT,TTCTATGCAGACAATTTGAAGGGGCGTTTCACCATCTCCAGAGACA...,FYADNLKGRFTISRDNAKNTLYLQMNSLKSEDSAMYYCTRRERFNE...,,,,,,,,,419.410,,,37S300M1S,,,9.765000e-122,,,94.667,,,38.0,337.0,1.0,300.0,,,,,,,,,38.0,52.0,53.0,76.0,77.0,128.0,129.0,151.0,152.0,337.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,300,300,300,TCCTGTGCAGCCTCTGGATTCACTTTCAGTGTTTATGGCATGTCTT...,300,0,0,False
97871,52A_029_H-1406143-R-OVAGC-C11_H_E04,AGCCTGGAGGGTCCCTGAAACTCTCCTGTGTTGCCTCTGGATTCAC...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGTTGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGTTGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGTTGCCTCT,SCVAS,GGATTCACTTTCAGTACCTATGGC,GFTFSTYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,GTAGTTATGGTGTTGGTTACACC,SSYGVGYT,CTCTATTCAGACAATGTGAAGGGGCGGTTCACCATCTCCAGAGACA...,LYSDNVKGRFTISRDNAKNTLYLQMSGLKSEDSAIYYCSRRGWYGE...,,,,,,,,,394.480,,,23S300M1S,,,3.005000e-114,,,92.000,,,24.0,323.0,1.0,300.0,,,,,,,,,24.0,38.0,39.0,62.0,63.0,114.0,115.0,137.0,138.0,323.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,300,300,300,TCCTGTGTTGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,300,0,0,False


In [40]:
grouping=dfs.groupby(by=['status', 'mouse', 'dataset', 'chain', 'sub_dataset'])

for grouped, df in grouping:
    suffix='_'.join(grouped)
    df.reset_index(drop=True, inplace=True)
    print(suffix, len(df))

EarlyGC_B18-383_APC_VH_- 179
EarlyGC_B18-383_APC_VL_- 228
EarlyGC_B18-383_CGG_VH_- 80
EarlyGC_B18-383_CGG_VL_- 74
EarlyGC_B18-383_OVA_VH_- 102
EarlyGC_B18-383_OVA_VL_- 133
EarlyGC_HA-uMT_APC_VH_0-1 92
EarlyGC_HA-uMT_APC_VL_0-1 123
EarlyGC_HA-uMT_CGG_VH_0-1 50
EarlyGC_HA-uMT_CGG_VL_0-1 168
EarlyGC_HA-uMT_OVA_VH_0-1 51
EarlyGC_HA-uMT_OVA_VL_0-1 81
LateGC_B18-383_APC_VH_- 95
LateGC_B18-383_APC_VL_- 112
LateGC_B18-383_CGG_VH_- 48
LateGC_B18-383_CGG_VL_- 78
LateGC_B18-383_OVA_VH_- 145
LateGC_B18-383_OVA_VL_- 215
LateGC_B18-383_OVA-CTLA4_VH_- 125
LateGC_B18-383_OVA-CTLA4_VL_- 179
LateGC_B18-383_OVA-Isotype_VH_- 156
LateGC_B18-383_OVA-Isotype_VL_- 286
LateGC_HA-WT_APC_VH_1-1 30
LateGC_HA-WT_APC_VH_1-100 79
LateGC_HA-WT_APC_VH_1-1000 27
LateGC_HA-WT_APC_VL_1-1 62
LateGC_HA-WT_APC_VL_1-100 155
LateGC_HA-WT_APC_VL_1-1000 57
LateGC_HA-WT_CGG_VH_1-1 48
LateGC_HA-WT_CGG_VH_1-100 66
LateGC_HA-WT_CGG_VH_1-1000 64
LateGC_HA-WT_CGG_VL_1-1 93
LateGC_HA-WT_CGG_VL_1-100 103
LateGC_HA-WT_CGG_VL_1-1000 130


In [41]:
grouping = dfs.groupby(['frameshift', 'label'], dropna=False)[['sequence_id']]
grouping.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sequence_id
frameshift,label,Unnamed: 2_level_1
False,EarlyGC_B18-383_APC_VH_-,133
False,EarlyGC_B18-383_APC_VL_-,93
False,EarlyGC_B18-383_CGG_VH_-,54
False,EarlyGC_B18-383_CGG_VL_-,55
False,EarlyGC_B18-383_OVA_VH_-,67
False,EarlyGC_B18-383_OVA_VL_-,84
False,EarlyGC_HA-uMT_APC_VH_0-1,84
False,EarlyGC_HA-uMT_APC_VL_0-1,115
False,EarlyGC_HA-uMT_CGG_VH_0-1,36
False,EarlyGC_HA-uMT_CGG_VL_0-1,64


In [42]:
update_stats_seq(dfs[dfs['frameshift']==True], 'frameshift_c')
df_stats_seq.sum()

total                   101706.0
no_alignment               192.0
no_locus                     7.0
no_locus_consensus           7.0
stopcodons                2439.0
10X_chain_filtering        978.0
10X_Vgene_filtering        205.0
10X_Jgene_filtering          5.0
no_alignment_c              42.0
no_locus_consensus_c         1.0
stopcodon_c                 38.0
frameshift_c               781.0
dtype: float64

In [43]:
df_stats_seq

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,total,no_alignment,no_locus,no_locus_consensus,stopcodons,10X_chain_filtering,10X_Vgene_filtering,10X_Jgene_filtering,no_alignment_c,no_locus_consensus_c,stopcodon_c,frameshift_c
status,mouse,dataset,chain,sub_dataset,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
EarlyGC,B18-383,APC,VH,-,312,27.0,3.0,,95.0,0.0,0.0,0.0,8.0,,,46.0
EarlyGC,B18-383,APC,VL,-,255,4.0,,,23.0,0.0,0.0,0.0,,,,135.0
EarlyGC,B18-383,CGG,VH,-,128,4.0,,,39.0,0.0,0.0,0.0,5.0,,,26.0
EarlyGC,B18-383,CGG,VL,-,134,32.0,,,18.0,0.0,0.0,0.0,10.0,,,19.0
EarlyGC,B18-383,OVA,VH,-,158,,1.0,,55.0,0.0,0.0,0.0,,,,35.0
EarlyGC,B18-383,OVA,VL,-,157,,,,22.0,0.0,0.0,0.0,2.0,,,49.0
EarlyGC,HA-uMT,APC,VH,0-1,160,57.0,,,10.0,0.0,0.0,0.0,1.0,,,8.0
EarlyGC,HA-uMT,APC,VL,0-1,159,10.0,,,25.0,0.0,0.0,0.0,1.0,,,8.0
EarlyGC,HA-uMT,CGG,VH,0-1,65,1.0,,,11.0,0.0,0.0,0.0,3.0,,,14.0
EarlyGC,HA-uMT,CGG,VL,0-1,206,1.0,,,37.0,0.0,0.0,0.0,,,,104.0


In [44]:
grouping=dfs[dfs['frameshift']==True].groupby(by=['status', 'mouse', 'dataset', 'chain', 'sub_dataset'])

for grouped, df in grouping:
    suffix='_'.join(grouped)
    df.reset_index(drop=True, inplace=True)
    print(suffix, len(df))

EarlyGC_B18-383_APC_VH_- 46
EarlyGC_B18-383_APC_VL_- 135
EarlyGC_B18-383_CGG_VH_- 26
EarlyGC_B18-383_CGG_VL_- 19
EarlyGC_B18-383_OVA_VH_- 35
EarlyGC_B18-383_OVA_VL_- 49
EarlyGC_HA-uMT_APC_VH_0-1 8
EarlyGC_HA-uMT_APC_VL_0-1 8
EarlyGC_HA-uMT_CGG_VH_0-1 14
EarlyGC_HA-uMT_CGG_VL_0-1 104
EarlyGC_HA-uMT_OVA_VH_0-1 3
EarlyGC_HA-uMT_OVA_VL_0-1 8
LateGC_B18-383_APC_VL_- 1
LateGC_B18-383_CGG_VL_- 1
LateGC_B18-383_OVA_VH_- 4
LateGC_B18-383_OVA_VL_- 1
LateGC_B18-383_OVA-CTLA4_VH_- 3
LateGC_B18-383_OVA-CTLA4_VL_- 5
LateGC_B18-383_OVA-Isotype_VH_- 1
LateGC_B18-383_OVA-Isotype_VL_- 3
LateGC_HA-WT_APC_VH_1-1 1
LateGC_HA-WT_APC_VL_1-100 1
LateGC_HA-WT_APC_VL_1-1000 1
LateGC_HA-WT_CGG_VL_1-100 2
LateGC_HA-WT_CGG_VL_1-1000 2
LateGC_HA-WT_CGG-CTLA4_VL_1-1000 1
LateGC_HA-WT_OVA_VH_1-100 1
LateGC_HA-WT_OVA_VL_1-100 2
LateGC_HA-uMT_APC_VL_0-1 3
LateGC_HA-uMT_CGG_VH_0-1 1
LateGC_HA-uMT_OVA_VL_0-1 12
Published_B18_Passenger_VH_- 280


In [45]:
dfs=dfs[dfs['frameshift']!=True].copy()
dfs

Unnamed: 0,sequence_id,sequence,sequence_aa,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,d_frame,v_call,d_call,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,status,mouse,dataset,chain,sub_dataset,mouse_DB,sequencing,label,locus_v_call,locus_consensus,stopcodon,seq_reference,len_reference,len_sequence_alignment,len_germline_alignment,new_sequence,len_new_sequence,insertions,deletions,frameshift
0,AAACCATTCGGGCTTG-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,0,0,False
1,AAACCATTCGTCCTGC-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,0,0,False
2,AAACCCCAGAAACGGT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,0,0,False
3,AAACCCCAGCCAGCAT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,0,0,False
4,AAACCCCAGCGTAGCT-1_contig_1,CAGGTCCAACTGCAGCAGCCTGGGGCTGAGCTTGTGAAGCCTGGGG...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,IGH,F,,,,F,F,,IGHV_B18,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,1.0,318.0,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,GASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKY...,,,,,,,,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCT,GASVKLSCKAS,GGCTACACCTTCACCAGCTACTGG,GYTFTSYW,ATGCACTGGGTGAAGCAGAGGCCTGGACGAGGCCTTGAGTGGATTG...,MHWVKQRPGRGLEWIGR,ATTGATCCTAATAGTGGTGGTACT,IDPNSGGT,AAGTACAATGAGAAGTTCAAGAGCAAGGCCACACTGACTGTAGACA...,KYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGS...,,,,,,,,,497.316,,,42S318M1S,,,3.819000e-145,,,100.000,,,43.0,360.0,1.0,318.0,,,,,,,,,43.0,75.0,76.0,99.0,100.0,150.0,151.0,174.0,175.0,360.0,,,,,,,,,Unimmunized,B18-383,SPL,VH,rep2,B18,sc-vdj,Unimmunized_B18-383_SPL_VH_rep2,IGH,IGH,,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,318,318,GGGGCTTCAGTGAAGCTGTCCTGCAAGGCTTCTGGCTACACCTTCA...,318,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97868,52A_006_H-1406143-R-OVAGC-A9_H_F01,GGAGAGACTTAGTGAAGCCTGGAGGGTCCCTGAAACTCTCCTGTGC...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGCAGCCTCTGGATTTTTTTTCTTTACCTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGCAGCCTCTGGATTTTTTTTCTTTACCTATGGCATGTCTT...,SCAASGFFFFTYGMSWVRQTPDKRLEWVATISYGGGYIYYSDSVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGCAGCCTCT,SCAAS,GGATTTTTTTTCTTTACCTATGGC,GFFFFTYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,TTAGTTATGGTGGTGGTTACATT,ISYGGGYI,TACTATTCAGACAGTGTCAAGGGACGATTCACCATCTCCAGAGACA...,YYSDSVKGRFTISRDNAKNTLYLRMSSLKSEDSAMYYCARRERYDE...,,,,,,,,,422.526,,,38S300M1S,,,1.130000e-122,,,95.000,,,39.0,338.0,1.0,300.0,,,,,,,,,39.0,53.0,54.0,77.0,78.0,129.0,130.0,152.0,153.0,338.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,300,300,300,TCCTGTGCAGCCTCTGGATTTTTTTTCTTTACCTATGGCATGTCTT...,300,0,0,False
97869,52A_024_H-1406143-L-OVAGC-C7_H_H03,GAGGATCCCTCCAACTCTCCTGTGTAGCCTCTGGATTCACTTTCAG...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGTAGCCTCTGGATTCACTTTCAGTAGCTATGGCTTGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGTAGCCTCTGGATTCACTTTCAGTAGCTATGGCTTGTCTT...,SCVASGFTFSSYGLSWVRQTPDKRPEWVATISYGVGYTFYLDSVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGTAGCCTCT,SCVAS,GGATTCACTTTCAGTAGCTATGGC,GFTFSSYG,TTGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCCGGAATGGGTCG...,LSWVRQTPDKRPEWVAT,TTAGTTATGGTGTTGGTTATACT,ISYGVGYT,TTCTATTTAGACAGTGTGAAGGGGCGATTCACCATCTCCAGAGACA...,FYLDSVKGRFTISRDNAKNTLYLQMSSLKSEDSAMYYCTRREKYDE...,,,,,,,,,419.410,,,17S300M1S,,,9.219000e-122,,,94.667,,,18.0,317.0,1.0,300.0,,,,,,,,,18.0,32.0,33.0,56.0,57.0,108.0,109.0,131.0,132.0,317.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,300,300,300,TCCTGTGTAGCCTCTGGATTCACTTTCAGTAGCTATGGCTTGTCTT...,300,0,0,False
97870,53A_026_H-1443862-L-OVAGC-C8_H_B04,GGGAGTCCTAGTGCAGCCTGGAGGCTCCCTGTCCCTCTCCTGTGCA...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTGTTTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTGTTTATGGCATGTCTT...,SCAASGFTFSVYGMSWVRQTPDKRLEWVATISYGVGYTFYADNLKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGCAGCCTCT,SCAAS,GGATTCACTTTCAGTGTTTATGGC,GFTFSVYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,TTAGTTATGGTGTTGGTTACACC,ISYGVGYT,TTCTATGCAGACAATTTGAAGGGGCGTTTCACCATCTCCAGAGACA...,FYADNLKGRFTISRDNAKNTLYLQMNSLKSEDSAMYYCTRRERFNE...,,,,,,,,,419.410,,,37S300M1S,,,9.765000e-122,,,94.667,,,38.0,337.0,1.0,300.0,,,,,,,,,38.0,52.0,53.0,76.0,77.0,128.0,129.0,151.0,152.0,337.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,300,300,300,TCCTGTGCAGCCTCTGGATTCACTTTCAGTGTTTATGGCATGTCTT...,300,0,0,False
97871,52A_029_H-1406143-R-OVAGC-C11_H_E04,AGCCTGGAGGGTCCCTGAAACTCTCCTGTGTTGCCTCTGGATTCAC...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,IGH,F,,,,F,F,,IGHV_HA,,,TCCTGTGTTGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,1.0,300.0,,,,,TCCTGTGTTGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,SCVASGFTFSTYGMSWVRQTPDKRLEWVATSSYGVGYTLYSDNVKG...,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,SCAASGFTFSSYGMSWVRQTPDKRLEWVATISNGGGYTYYPDSVKG...,,,,,,,,,TCCTGTGTTGCCTCT,SCVAS,GGATTCACTTTCAGTACCTATGGC,GFTFSTYG,ATGTCTTGGGTTCGCCAGACTCCAGACAAGAGGCTGGAGTGGGTCG...,MSWVRQTPDKRLEWVAT,GTAGTTATGGTGTTGGTTACACC,SSYGVGYT,CTCTATTCAGACAATGTGAAGGGGCGGTTCACCATCTCCAGAGACA...,LYSDNVKGRFTISRDNAKNTLYLQMSGLKSEDSAIYYCSRRGWYGE...,,,,,,,,,394.480,,,23S300M1S,,,3.005000e-114,,,92.000,,,24.0,323.0,1.0,300.0,,,,,,,,,24.0,38.0,39.0,62.0,63.0,114.0,115.0,137.0,138.0,323.0,,,,,,,,,LateGC,HA-uMT,OVA,VH,0-1,HA,sanger,LateGC_HA-uMT_OVA_VH_0-1,IGH,IGH,,TCCTGTGCAGCCTCTGGATTCACTTTCAGTAGCTATGGCATGTCTT...,300,300,300,TCCTGTGTTGCCTCTGGATTCACTTTCAGTACCTATGGCATGTCTT...,300,0,0,False


In [46]:
grouping=dfs.groupby(by=['status', 'mouse', 'dataset', 'chain', 'sub_dataset'])

for grouped, df in grouping:
    suffix='_'.join(grouped)
    df.reset_index(drop=True, inplace=True)
    print(suffix, len(df))

EarlyGC_B18-383_APC_VH_- 133
EarlyGC_B18-383_APC_VL_- 93
EarlyGC_B18-383_CGG_VH_- 54
EarlyGC_B18-383_CGG_VL_- 55
EarlyGC_B18-383_OVA_VH_- 67
EarlyGC_B18-383_OVA_VL_- 84
EarlyGC_HA-uMT_APC_VH_0-1 84
EarlyGC_HA-uMT_APC_VL_0-1 115
EarlyGC_HA-uMT_CGG_VH_0-1 36
EarlyGC_HA-uMT_CGG_VL_0-1 64
EarlyGC_HA-uMT_OVA_VH_0-1 48
EarlyGC_HA-uMT_OVA_VL_0-1 73
LateGC_B18-383_APC_VH_- 95
LateGC_B18-383_APC_VL_- 111
LateGC_B18-383_CGG_VH_- 48
LateGC_B18-383_CGG_VL_- 77
LateGC_B18-383_OVA_VH_- 141
LateGC_B18-383_OVA_VL_- 214
LateGC_B18-383_OVA-CTLA4_VH_- 122
LateGC_B18-383_OVA-CTLA4_VL_- 174
LateGC_B18-383_OVA-Isotype_VH_- 155
LateGC_B18-383_OVA-Isotype_VL_- 283
LateGC_HA-WT_APC_VH_1-1 29
LateGC_HA-WT_APC_VH_1-100 79
LateGC_HA-WT_APC_VH_1-1000 27
LateGC_HA-WT_APC_VL_1-1 62
LateGC_HA-WT_APC_VL_1-100 154
LateGC_HA-WT_APC_VL_1-1000 56
LateGC_HA-WT_CGG_VH_1-1 48
LateGC_HA-WT_CGG_VH_1-100 66
LateGC_HA-WT_CGG_VH_1-1000 64
LateGC_HA-WT_CGG_VL_1-1 93
LateGC_HA-WT_CGG_VL_1-100 101
LateGC_HA-WT_CGG_VL_1-1000 128
Late

In [47]:
set(dfs['sequencing'])

{'deep', 'sanger', 'sc-vdj'}

In [48]:
dfs[dfs['frameshift']!=False]

Unnamed: 0,sequence_id,sequence,sequence_aa,locus,stop_codon,vj_in_frame,v_frameshift,productive,rev_comp,complete_vdj,d_frame,v_call,d_call,j_call,sequence_alignment,germline_alignment,sequence_alignment_aa,germline_alignment_aa,v_alignment_start,v_alignment_end,d_alignment_start,d_alignment_end,j_alignment_start,j_alignment_end,v_sequence_alignment,v_sequence_alignment_aa,v_germline_alignment,v_germline_alignment_aa,d_sequence_alignment,d_sequence_alignment_aa,d_germline_alignment,d_germline_alignment_aa,j_sequence_alignment,j_sequence_alignment_aa,j_germline_alignment,j_germline_alignment_aa,fwr1,fwr1_aa,cdr1,cdr1_aa,fwr2,fwr2_aa,cdr2,cdr2_aa,fwr3,fwr3_aa,fwr4,fwr4_aa,cdr3,cdr3_aa,junction,junction_length,junction_aa,junction_aa_length,v_score,d_score,j_score,v_cigar,d_cigar,j_cigar,v_support,d_support,j_support,v_identity,d_identity,j_identity,v_sequence_start,v_sequence_end,v_germline_start,v_germline_end,d_sequence_start,d_sequence_end,d_germline_start,d_germline_end,j_sequence_start,j_sequence_end,j_germline_start,j_germline_end,fwr1_start,fwr1_end,cdr1_start,cdr1_end,fwr2_start,fwr2_end,cdr2_start,cdr2_end,fwr3_start,fwr3_end,fwr4_start,fwr4_end,cdr3_start,cdr3_end,np1,np1_length,np2,np2_length,status,mouse,dataset,chain,sub_dataset,mouse_DB,sequencing,label,locus_v_call,locus_consensus,stopcodon,seq_reference,len_reference,len_sequence_alignment,len_germline_alignment,new_sequence,len_new_sequence,insertions,deletions,frameshift


### Filter based on length

In [49]:
len(dfs.loc[dfs['len_sequence_alignment'] <= dfs['len_reference'] * 0.7])

331

In [50]:
update_stats_seq(dfs.loc[dfs['len_sequence_alignment'] <= dfs['len_reference'] * 0.7], 'length_0.7_ref_c')
df_stats_seq

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,total,no_alignment,no_locus,no_locus_consensus,stopcodons,10X_chain_filtering,10X_Vgene_filtering,10X_Jgene_filtering,no_alignment_c,no_locus_consensus_c,stopcodon_c,frameshift_c,length_0.7_ref_c
status,mouse,dataset,chain,sub_dataset,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
EarlyGC,B18-383,APC,VH,-,312,27.0,3.0,,95.0,0.0,0.0,0.0,8.0,,,46.0,25.0
EarlyGC,B18-383,APC,VL,-,255,4.0,,,23.0,0.0,0.0,0.0,,,,135.0,4.0
EarlyGC,B18-383,CGG,VH,-,128,4.0,,,39.0,0.0,0.0,0.0,5.0,,,26.0,8.0
EarlyGC,B18-383,CGG,VL,-,134,32.0,,,18.0,0.0,0.0,0.0,10.0,,,19.0,14.0
EarlyGC,B18-383,OVA,VH,-,158,,1.0,,55.0,0.0,0.0,0.0,,,,35.0,29.0
EarlyGC,B18-383,OVA,VL,-,157,,,,22.0,0.0,0.0,0.0,2.0,,,49.0,7.0
EarlyGC,HA-uMT,APC,VH,0-1,160,57.0,,,10.0,0.0,0.0,0.0,1.0,,,8.0,11.0
EarlyGC,HA-uMT,APC,VL,0-1,159,10.0,,,25.0,0.0,0.0,0.0,1.0,,,8.0,18.0
EarlyGC,HA-uMT,CGG,VH,0-1,65,1.0,,,11.0,0.0,0.0,0.0,3.0,,,14.0,3.0
EarlyGC,HA-uMT,CGG,VL,0-1,206,1.0,,,37.0,0.0,0.0,0.0,,,,104.0,6.0


In [51]:
dfs = dfs.loc[dfs['len_sequence_alignment'] > dfs['len_reference'] * 0.7].copy()
len(dfs)

96680

In [52]:
update_stats_seq(dfs, 'final_c')
df_stats_seq

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,total,no_alignment,no_locus,no_locus_consensus,stopcodons,10X_chain_filtering,10X_Vgene_filtering,10X_Jgene_filtering,no_alignment_c,no_locus_consensus_c,stopcodon_c,frameshift_c,length_0.7_ref_c,final_c
status,mouse,dataset,chain,sub_dataset,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
EarlyGC,B18-383,APC,VH,-,312,27.0,3.0,,95.0,0.0,0.0,0.0,8.0,,,46.0,25.0,108.0
EarlyGC,B18-383,APC,VL,-,255,4.0,,,23.0,0.0,0.0,0.0,,,,135.0,4.0,89.0
EarlyGC,B18-383,CGG,VH,-,128,4.0,,,39.0,0.0,0.0,0.0,5.0,,,26.0,8.0,46.0
EarlyGC,B18-383,CGG,VL,-,134,32.0,,,18.0,0.0,0.0,0.0,10.0,,,19.0,14.0,41.0
EarlyGC,B18-383,OVA,VH,-,158,,1.0,,55.0,0.0,0.0,0.0,,,,35.0,29.0,38.0
EarlyGC,B18-383,OVA,VL,-,157,,,,22.0,0.0,0.0,0.0,2.0,,,49.0,7.0,77.0
EarlyGC,HA-uMT,APC,VH,0-1,160,57.0,,,10.0,0.0,0.0,0.0,1.0,,,8.0,11.0,73.0
EarlyGC,HA-uMT,APC,VL,0-1,159,10.0,,,25.0,0.0,0.0,0.0,1.0,,,8.0,18.0,97.0
EarlyGC,HA-uMT,CGG,VH,0-1,65,1.0,,,11.0,0.0,0.0,0.0,3.0,,,14.0,3.0,33.0
EarlyGC,HA-uMT,CGG,VL,0-1,206,1.0,,,37.0,0.0,0.0,0.0,,,,104.0,6.0,58.0


In [53]:
df_stats_seq.sum()

total                   101706.0
no_alignment               192.0
no_locus                     7.0
no_locus_consensus           7.0
stopcodons                2439.0
10X_chain_filtering        978.0
10X_Vgene_filtering        205.0
10X_Jgene_filtering          5.0
no_alignment_c              42.0
no_locus_consensus_c         1.0
stopcodon_c                 38.0
frameshift_c               781.0
length_0.7_ref_c           331.0
final_c                  96680.0
dtype: float64

In [54]:
df_stats_seq.reset_index().to_csv(f"{sect_out_folder}/df_stats_seq.tsv", sep = '\t', header=True, index=False)


In [55]:
dfs.to_csv(f"{sect_out_folder}/dfs.tsv", sep = '\t', header=True, index=False)


In [56]:
df_stats_seq

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,total,no_alignment,no_locus,no_locus_consensus,stopcodons,10X_chain_filtering,10X_Vgene_filtering,10X_Jgene_filtering,no_alignment_c,no_locus_consensus_c,stopcodon_c,frameshift_c,length_0.7_ref_c,final_c
status,mouse,dataset,chain,sub_dataset,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
EarlyGC,B18-383,APC,VH,-,312,27.0,3.0,,95.0,0.0,0.0,0.0,8.0,,,46.0,25.0,108.0
EarlyGC,B18-383,APC,VL,-,255,4.0,,,23.0,0.0,0.0,0.0,,,,135.0,4.0,89.0
EarlyGC,B18-383,CGG,VH,-,128,4.0,,,39.0,0.0,0.0,0.0,5.0,,,26.0,8.0,46.0
EarlyGC,B18-383,CGG,VL,-,134,32.0,,,18.0,0.0,0.0,0.0,10.0,,,19.0,14.0,41.0
EarlyGC,B18-383,OVA,VH,-,158,,1.0,,55.0,0.0,0.0,0.0,,,,35.0,29.0,38.0
EarlyGC,B18-383,OVA,VL,-,157,,,,22.0,0.0,0.0,0.0,2.0,,,49.0,7.0,77.0
EarlyGC,HA-uMT,APC,VH,0-1,160,57.0,,,10.0,0.0,0.0,0.0,1.0,,,8.0,11.0,73.0
EarlyGC,HA-uMT,APC,VL,0-1,159,10.0,,,25.0,0.0,0.0,0.0,1.0,,,8.0,18.0,97.0
EarlyGC,HA-uMT,CGG,VH,0-1,65,1.0,,,11.0,0.0,0.0,0.0,3.0,,,14.0,3.0,33.0
EarlyGC,HA-uMT,CGG,VL,0-1,206,1.0,,,37.0,0.0,0.0,0.0,,,,104.0,6.0,58.0


In [57]:
grouping=dfs.groupby(by=['status', 'mouse', 'dataset', 'chain', 'sub_dataset'])

for grouped, df in grouping:
    suffix='_'.join(grouped)
    df.reset_index(drop=True, inplace=True)
    print(suffix, len(df))

EarlyGC_B18-383_APC_VH_- 108
EarlyGC_B18-383_APC_VL_- 89
EarlyGC_B18-383_CGG_VH_- 46
EarlyGC_B18-383_CGG_VL_- 41
EarlyGC_B18-383_OVA_VH_- 38
EarlyGC_B18-383_OVA_VL_- 77
EarlyGC_HA-uMT_APC_VH_0-1 73
EarlyGC_HA-uMT_APC_VL_0-1 97
EarlyGC_HA-uMT_CGG_VH_0-1 33
EarlyGC_HA-uMT_CGG_VL_0-1 58
EarlyGC_HA-uMT_OVA_VH_0-1 44
EarlyGC_HA-uMT_OVA_VL_0-1 67
LateGC_B18-383_APC_VH_- 95
LateGC_B18-383_APC_VL_- 111
LateGC_B18-383_CGG_VH_- 48
LateGC_B18-383_CGG_VL_- 77
LateGC_B18-383_OVA_VH_- 141
LateGC_B18-383_OVA_VL_- 214
LateGC_B18-383_OVA-CTLA4_VH_- 122
LateGC_B18-383_OVA-CTLA4_VL_- 174
LateGC_B18-383_OVA-Isotype_VH_- 155
LateGC_B18-383_OVA-Isotype_VL_- 283
LateGC_HA-WT_APC_VH_1-1 29
LateGC_HA-WT_APC_VH_1-100 79
LateGC_HA-WT_APC_VH_1-1000 27
LateGC_HA-WT_APC_VL_1-1 62
LateGC_HA-WT_APC_VL_1-100 154
LateGC_HA-WT_APC_VL_1-1000 56
LateGC_HA-WT_CGG_VH_1-1 48
LateGC_HA-WT_CGG_VH_1-100 66
LateGC_HA-WT_CGG_VH_1-1000 63
LateGC_HA-WT_CGG_VL_1-1 93
LateGC_HA-WT_CGG_VL_1-100 101
LateGC_HA-WT_CGG_VL_1-1000 128
LateG