# Notebook for processing length polymorphic libraries

A key point is that this notebook relies on the prior assignments of barcodes to junction identities from Mendes and Russell 2021, as well as the plasmid counts from those same libraries, because the same libraries were used here. Those seeking to repeat these analyses with their own libraries should refer to the original code to generate their own barcode assignments.

In [1]:
import pandas as pd
import sys

# Segment names of interest (presumably the two segments the libraries were built from — TODO confirm)
segmentsToTarget = ['HA_vRNA','PB1_vRNA']
# Directory containing project-local helper modules. It has to be appended to
# sys.path BEFORE the `import deletionLibrary` line below, so keep this order.
scripts = 'Scripts'
sys.path.append(scripts)
import deletionLibrary
#an adapter in our mapping, remove from here
adapter = 'CCCTGAGACCAATA'
# NOTE(review): UPadd / DWNadd look like fixed sequences added upstream and
# downstream of the junction — confirm against the library design.
UPadd = 'CGTGTAGGCGATGGC'
DWNadd = 'ACTGCTTGCGATGAT'
# FASTA file of junction sequences (path relative to the notebook's working directory)
junctionAdapter = 'Database/junctionSeqAdapter.fa'
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# Write Type 42 (TrueType) fonts into PDF output instead of matplotlib's default Type 3
plt.rcParams['pdf.fonttype'] = 42
# Global figure text defaults used by every plot in the notebook
plt.rcParams.update({'font.size': 12, 'axes.titlesize':'medium',})
import seaborn as sns
from string import ascii_uppercase
import glob
import numpy as np
import math
import os 
# Every figure in the paper uses one fixed colour per segment, drawn from a
# colorblind-friendly seaborn palette.
colormap = 'colorblind'
colors = sns.color_palette(colormap, 8)
# The segments are listed in a non-standard order on purpose: the 2nd and 4th
# colours of this palette are too similar, so the order decides which segments
# end up with those two colours.
segments = [
    'PB1_vRNA', 'HA_vRNA', 'PB2_vRNA', 'PA_vRNA',
    'NP_vRNA', 'NA_vRNA', 'M_vRNA', 'NS_vRNA',
]
# Pair each segment with the palette colour at the same position.
fluColors = dict(zip(segments, colors))

# Two-colour palette: second-to-last 'colorblind' colour and last 'dark' colour.
twoColorPalette = [
    sns.color_palette('colorblind', 8)[-2],
    sns.color_palette('dark', 8)[-1],
]
#for some light parallel processing
from multiprocessing import Pool
# Length associated with each segment, keyed by segment name
# (presumably the full-length vRNA size in nucleotides — TODO confirm)
lengths = {'PB1_vRNA':2341, 'HA_vRNA':1775}

# Number of worker processes passed to multiprocessing.Pool when libraries are processed in parallel
numProcessors = 4

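## Extract barcodes from sequencing

For every library in `Sequencing/LengthPolymorph/`, the paired R1/R2 reads are decompressed, barcodes are counted with `deletionLibrary.barcodeCount`, and the temporary uncompressed files are removed. Libraries that already have a `library_*_barcodes.tsv` file are skipped.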

In [3]:
def barcode(pandas):
    """Count barcodes for a single library from its paired-end gzipped FASTQ files.

    Parameters
    ----------
    pandas : pandas.DataFrame
        The rows describing one library. It must provide the columns ``read``
        (one row equal to 'R1' and one equal to 'R2'), ``file`` (path to the
        gzipped read file, ending in '.gz') and ``library`` (library name).
        The parameter name is kept for compatibility with existing callers.

    Side effects
    ------------
    Decompresses both read files next to the originals, prints the output
    path, calls ``deletionLibrary.barcodeCount`` to write
    ``<directory>library_<library>_barcodes.tsv`` and finally deletes the
    decompressed copies. Relies on the module-level ``directory`` variable.

    Notes
    -----
    Decompression is done in Python rather than through ``!gunzip`` / ``!rm``
    so that paths with spaces or shell metacharacters are handled safely, a
    corrupt archive raises instead of being silently ignored, and the
    temporary files are removed even when barcode counting fails.
    """
    # Imported here so the function carries everything it needs on its own.
    import gzip
    import shutil

    read1 = pandas[pandas.read == 'R1'].file.iloc[0]
    read2 = pandas[pandas.read == 'R2'].file.iloc[0]
    # Drop the trailing '.gz' to obtain the name of the temporary uncompressed file.
    read1Unzip = read1[:-3]
    read2Unzip = read2[:-3]
    outFile = directory + 'library_' + str(pandas.library.iloc[0]) + '_barcodes.tsv'
    try:
        for zipped, unzipped in ((read1, read1Unzip), (read2, read2Unzip)):
            with gzip.open(zipped, 'rb') as source, open(unzipped, 'wb') as target:
                shutil.copyfileobj(source, target)
        print(outFile)
        deletionLibrary.barcodeCount(Read1=read1Unzip, Read2 = read2Unzip, adjSeq = 'GTGCTCTTCCGGCCATCGCCTACACGACGCTTC',
                    barcodeLen=12,  outfile=outFile)
    finally:
        # Always clean up the large uncompressed copies, even after an error.
        for unzipped in (read1Unzip, read2Unzip):
            if os.path.exists(unzipped):
                os.remove(unzipped)



# Directory holding the gzipped read files; barcode tables are written alongside them.
directory = 'Sequencing/LengthPolymorph/'
# Column order of the resulting table (one row per read file still to be processed).
fileColumns = ['read', 'NEP', 'file', 'segment', 'replicate', 'library']
fileRecords = []
for file in sorted(glob.glob(directory + '*.gz')):
    # The file name is split on '_': the second-to-last field is the read
    # ('R1'/'R2') and the third-to-last field is a '-'-separated sample name.
    seg = file.split('/')[-1].split('_')
    read = seg[-2]
    sample = seg[-3].split('-')
    # Last '-' field is the NEP condition; the second field is a two-character
    # code: first character = segment, second character = replicate.
    NEP = sample[-1]
    segment = sample[1]
    library = '_'.join([segment[0], segment[1], NEP])
    outFile = directory + 'library_' + library + '_barcodes.tsv'
    # Skip libraries whose barcode table already exists so re-runs are cheap.
    if not os.path.isfile(outFile):
        fileRecords.append({'read': read, 'NEP': NEP, 'file': file, 'segment': segment[0],
                            'replicate': segment[1], 'library': library})
# Building the frame from records (instead of pd.concat on a list of one-row
# frames) also works when nothing is left to process: pd.concat([]) raises
# ValueError, whereas this yields an empty frame that still has every column.
filesToProcess = pd.DataFrame(fileRecords, columns=fileColumns)

# Libraries still to be processed, in a deterministic order.
popList = sorted(set(filesToProcess.library))
# One sub-table per library (its R1 and R2 rows), which is what barcode() expects.
libraryFrames = [filesToProcess[filesToProcess.library == libraryName] for libraryName in popList]
# A single pool handles every library: as soon as a worker finishes one library
# it picks up the next, instead of building a new pool per batch of
# numProcessors libraries and waiting for the slowest member of each batch.
# chunksize=1 hands out one library at a time, since each task is long-running.
if libraryFrames:
    with Pool(processes = numProcessors) as p:
        p.map(barcode, libraryFrames, chunksize=1)



Sequencing/LengthPolymorph/library_H_1_withNEP_barcodes.tsv
Sequencing/LengthPolymorph/library_H_3_withNEP_barcodes.tsv
Sequencing/LengthPolymorph/library_P_3_withNEP_barcodes.tsv
Sequencing/LengthPolymorph/library_H_1_noNEP_barcodes.tsv
Sequencing/LengthPolymorph/library_H_2_withNEP_barcodes.tsv
Sequencing/LengthPolymorph/library_H_2_noNEP_barcodes.tsv
Sequencing/LengthPolymorph/library_P_1_withNEP_barcodes.tsv
Sequencing/LengthPolymorph/library_P_2_withNEP_barcodes.tsv
Sequencing/LengthPolymorph/library_P_1_noNEP_barcodes.tsv
Sequencing/LengthPolymorph/library_H_3_noNEP_barcodes.tsv
Sequencing/LengthPolymorph/library_P_3_noNEP_barcodes.tsv
Sequencing/LengthPolymorph/library_P_2_noNEP_barcodes.tsv
