## Generating supplementary data

In [1]:
import pandas as pd

## CP70 - AGM primary

In [2]:
# Inputs for the CP0070 section. Comma-separated files use the default
# separator; the tab-delimited count/chip files are read with sep='\t'.
cp70_pdna = pd.read_csv('../data/raw/M-AD81_AACH02_XPR050_G0_CP0070_ScoresSum.csv')
cp70_chip = pd.read_csv('../data/interim/CP0070_Chlorocebus_sabeus_remapped.chip', sep='\t')

# Read counts and condition mapping consumed by the SARS2 cell.
cp70_sars2_reads = pd.read_csv('../data/raw/counts-GPP1498_Wilen_run2.txt', sep='\t')
sars2_condition_map = pd.read_csv('../data/interim/condition_mapping.csv')

# Read counts and condition mapping consumed by the Pan-Cov cell.
cp70_pan_cov_reads = pd.read_csv('../data/raw/counts-JD_GPP1868_Alfajaro_Wilen_CP0070.txt', sep='\t')
pan_cov_condition_map = pd.read_csv('../data/interim/other_coronaviruses_condition_mapping.csv')

### Filtered Chip

In [3]:
# Keep the barcode and gene annotation columns of the chip table, and name the
# barcode column 'Construct Barcode' (the key the read tables are merged on).
cp70_chip_filtered = cp70_chip.loc[:, ['Barcode Sequence', 'Gene', 'Gene ID']].rename(
    {'Barcode Sequence': 'Construct Barcode'}, axis='columns'
)
cp70_chip_filtered

Unnamed: 0,Construct Barcode,Gene,Gene ID
0,AAAAAAAAAAACTCAAAGAT,SET,103239793
1,AAAAAAAAACTGGAATCATG,LOC103248058,103248058
2,AAAAAAACAAAGTGTGGCGT,LOC103243876,103243876
3,AAAAAAACCTCTCGCTCCTG,LOC103243369,103243369
4,AAAAAAAGAGACACTGTTGT,ICOS,103217691
...,...,...,...
88866,TTTGTTTACGGCTTCGGCAA,FAM25A,103215859
88867,TTTGTTTCCACAAACATGTA,HSPB11,103224784
88868,TTTGTTTCCTCTATTCTACC,RPAP2,103224434
88869,TTTGTTTGCGGGTCACTTCG,DEPDC1B,103221934


### pDNA

In [4]:
# Reduce the pDNA table to the barcode and its count, exposing the count
# column under the name 'pDNA'.
cp70_pdna = cp70_pdna.rename({'count': 'pDNA'}, axis='columns').loc[:, ['Construct Barcode', 'pDNA']]
cp70_pdna

Unnamed: 0,Construct Barcode,pDNA
0,TGATAGTAGGATAATAGCGA,67
1,AGGGTTGTAGTAGTCCGTAA,48
2,GCCTTCTATGAGGTCGAAGG,66
3,GATGGTGTAGAGAGTAGTGG,84
4,CCCTCAACAACCTACTATCG,68
...,...,...
84958,ATGGTACGTCGCGAACGATG,64
84959,CGGCGCCAATCGACGTGTCG,44
84960,GCCTCGACCGTGCGACGATA,70
84961,GGCTTAACGCCGCGTACAAG,37


### SARS2

In [5]:
# Lookup from raw condition name to display label.
sars2_condition_map_dict = sars2_condition_map.set_index('condition')['label'].to_dict()

# 1. drop every column whose name matches 'Second' or 'Construct IDs',
# 2. relabel the remaining condition columns,
# 3. attach the pDNA counts, keeping only barcodes present in both tables.
cp70_sars2_reads_filtered = (
    cp70_sars2_reads
    .loc[:, lambda reads: ~reads.columns.str.contains('Second|Construct IDs')]
    .rename(columns=sars2_condition_map_dict)
    .pipe(lambda reads: cp70_pdna.merge(reads, how='inner', on='Construct Barcode'))
)
cp70_sars2_reads_filtered

Unnamed: 0,Construct Barcode,pDNA,Cas9-v1 D10 Mock,Cas9-v1 D10 Lo-MOI,Cas9-v1 D10 Hi-MOI,Cas9-v2 D5 Mock,Cas9-v2 D10 5e6 Hi-MOI,Cas9-v2 D5 5e6 Hi-MOI,Cas9-v2 D2 5e6 Hi-MOI,Cas9-v2 D5 2.5e6 Lo-MOI,Cas9-v2 D5 2.5e6 Hi-MOI
0,TGATAGTAGGATAATAGCGA,67,185,240,87,126,54,12,19,28,149
1,AGGGTTGTAGTAGTCCGTAA,48,189,110,50,137,32,32,13,114,26
2,GCCTTCTATGAGGTCGAAGG,66,311,107,79,241,122,166,186,128,211
3,GATGGTGTAGAGAGTAGTGG,84,403,235,179,437,139,101,96,135,213
4,CCCTCAACAACCTACTATCG,68,316,86,116,414,114,71,243,98,282
...,...,...,...,...,...,...,...,...,...,...,...
84958,ATGGTACGTCGCGAACGATG,64,223,151,158,320,114,133,192,89,267
84959,CGGCGCCAATCGACGTGTCG,44,170,95,51,297,57,103,35,131,182
84960,GCCTCGACCGTGCGACGATA,70,399,28,115,392,114,123,77,113,122
84961,GGCTTAACGCCGCGTACAAG,37,168,36,17,265,70,112,142,133,203


### Pan-Cov

In [6]:
# Lookup from raw condition name to display label.
pan_cov_condition_map_dict = pan_cov_condition_map.set_index('condition')['label'].to_dict()

# 1. drop every column whose name matches 'CP1560' or 'Construct IDs',
# 2. relabel the remaining condition columns,
# 3. attach the pDNA counts, keeping only barcodes present in both tables.
cp70_pan_cov_reads_filtered = (
    cp70_pan_cov_reads
    .loc[:, lambda reads: ~reads.columns.str.contains('CP1560|Construct IDs')]
    .rename(columns=pan_cov_condition_map_dict)
    .pipe(lambda reads: cp70_pdna.merge(reads, how='inner', on='Construct Barcode'))
)
cp70_pan_cov_reads_filtered

Unnamed: 0,Construct Barcode,pDNA,Mock,SARS1-Bat#1,MERS-WT #1,MERS-T1015 #1,VSV-SARS2#1,SARS1-Bat#2,MERS-WT #2,MERS-T1015 #2,VSV-SARS2#2
0,TGATAGTAGGATAATAGCGA,67,244,231,109,194,95,111,151,354,238
1,AGGGTTGTAGTAGTCCGTAA,48,231,153,133,122,109,60,73,29,106
2,GCCTTCTATGAGGTCGAAGG,66,1027,576,258,155,277,162,1,42,299
3,GATGGTGTAGAGAGTAGTGG,84,523,355,297,182,42,259,232,111,48
4,CCCTCAACAACCTACTATCG,68,489,282,307,394,135,66,285,538,233
...,...,...,...,...,...,...,...,...,...,...,...
84958,ATGGTACGTCGCGAACGATG,64,494,246,307,368,232,40,676,304,608
84959,CGGCGCCAATCGACGTGTCG,44,190,182,223,101,56,52,129,0,90
84960,GCCTCGACCGTGCGACGATA,70,659,213,368,598,166,192,531,282,139
84961,GGCTTAACGCCGCGTACAAG,37,728,374,278,331,264,288,427,273,565


## CP1564

In [7]:
# Inputs for the CP1564 section; all three files are tab-delimited.
cp1564_chip = pd.read_csv('../data/raw/CP1564_GRCh38_NCBI_strict_gene_20200612.chip', sep='\t')
cp1564_pdna = pd.read_csv('../data/raw/scores-BF20200608_B03_AAGB03_RDA120_G1_CP1564_M-AK46.txt', sep='\t')
cp1564_reads = pd.read_csv('../data/raw/counts-JD_GPP1883_Renata_CP1564.txt', sep='\t')

### Filtered Chip

In [8]:
# All chip columns are kept; only the barcode column is renamed to
# 'Construct Barcode' (the key the read tables are merged on).
cp1564_chip_filtered = cp1564_chip.rename({'Barcode Sequence': 'Construct Barcode'}, axis='columns')
cp1564_chip_filtered

Unnamed: 0,Construct Barcode,Gene Symbol,Gene ID
0,AAAAAAAGGGAGGACCACTG,WDR55,54853
1,AAAAAAGTCTGAAGTATGCA,OR2A12,346525
2,AAAAACTGCCAACCAACTGA,RAD50,10111
3,AAAAAGGGTTGCCCAAGTAT,ATG10,83734
4,AAAAAGTACCAAATCAAGAA,GLMN,11146
...,...,...,...
6619,TTTTTCGTGGATCTGCACCA,INACTIVE_5T_165,INACTIVE_5T_165
6620,TTTTTCTTCAGGGTATGACA,FH,2271
6621,TTTTTCTTCAGGGTATGACA,INACTIVE_5T_166,INACTIVE_5T_166
6622,TTTTTCTTCTAGACTACTCG,ELL,8178


### pDNA

In [9]:
# Reduce the pDNA table to the barcode and its count, exposing the sample
# column under the name 'pDNA'.
cp1564_pdna = (
    cp1564_pdna
    .rename({'B03_AAGB03_RDA120_G1_CP1564_M-AK46': 'pDNA'}, axis='columns')
    .loc[:, ['Construct Barcode', 'pDNA']]
)
cp1564_pdna

Unnamed: 0,Construct Barcode,pDNA
0,AAAAAAAGGGAGGACCACTG,51
1,AAAAAAGTCTGAAGTATGCA,53
2,AAAAACTGCCAACCAACTGA,66
3,AAAAAGGGTTGCCCAAGTAT,56
4,AAAAAGTACCAAATCAAGAA,48
...,...,...
6203,TTTTTCCTAACGGACCTCAC,41
6204,TTTTTCCTTGACAAGCTGGG,82
6205,TTTTTCGTGGATCTGCACCA,61
6206,TTTTTCTTCAGGGTATGACA,54


### CP1564 reads

In [10]:
# Display labels for the condition names that remain after the '_' suffix of
# each column header has been stripped. Names not listed here are kept as-is.
cp1564_condition_map_dict = {
    'SARS1-bat #1': 'HKU5-SARS-CoV-1-S #1',
    'SARS1-bat #2': 'HKU5-SARS-CoV-1-S #2',
    'SARS2 #1': 'SARS-CoV-2 #1',
    'SARS2 #2': 'SARS-CoV-2 #2',
    'rcVSV-SARS2-S #1': 'VSV-SARS-CoV-2-S #1',
    'rcVSV-SARS2-S #2': 'VSV-SARS-CoV-2-S #2',
    'MERS-WT #1': 'MERS-CoV #1',
    'MERS-WT #2': 'MERS-CoV #2',
    'MERS-T1015 #1': 'MERS-CoV T1015N #1',
    'MERS-T1015 #2': 'MERS-CoV T1015N #2',
}

# 1. drop every column whose name matches one of the excluded patterns,
# 2. keep only the part of each column name before the first '_',
# 3. relabel the condition columns,
# 4. attach the pDNA counts, keeping only barcodes present in both tables.
cp1564_reads_filtered = (
    cp1564_reads
    .loc[:, lambda reads: ~reads.columns.str.contains('CP1561|CP0041|Construct IDs|GFP')]
    .rename(columns=lambda name: name.split('_')[0])
    .rename(columns=cp1564_condition_map_dict)
    .pipe(lambda reads: cp1564_pdna.merge(reads, how='inner', on='Construct Barcode'))
)
cp1564_reads_filtered

Unnamed: 0,Construct Barcode,pDNA,Mock #1,Mock #2,IAV-WSN,HKU5-SARS-CoV-1-S #1,HKU5-SARS-CoV-1-S #2,SARS-CoV-2 #1,SARS-CoV-2 #2,VSV-SARS-CoV-2-S #1,VSV-SARS-CoV-2-S #2,MERS-CoV #1,MERS-CoV #2,MERS-CoV T1015N #1,MERS-CoV T1015N #2,EMCV #1,EMCV #2
0,AAAAAAAGGGAGGACCACTG,51,315,449,567,232,230,289,199,394,140,348,107,141,42,14,0
1,AAAAAAGTCTGAAGTATGCA,53,1828,2578,1825,1961,1588,2131,1403,1808,1268,1795,1868,1707,1231,550,758
2,AAAAACTGCCAACCAACTGA,66,1212,1609,2054,871,806,828,1280,1243,1038,789,909,795,436,320,161
3,AAAAAGGGTTGCCCAAGTAT,56,1347,1673,475,476,697,563,728,949,758,265,497,948,872,36,10
4,AAAAAGTACCAAATCAAGAA,48,438,603,357,289,498,671,546,282,282,44,229,187,20,141,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6203,TTTTTCCTAACGGACCTCAC,41,599,634,504,878,523,356,555,715,506,599,269,269,549,332,376
6204,TTTTTCCTTGACAAGCTGGG,82,1551,1670,1070,1060,922,1050,563,805,735,686,1053,1730,961,260,106
6205,TTTTTCGTGGATCTGCACCA,61,1146,1163,1400,1061,924,697,637,1047,886,1056,1112,814,913,895,622
6206,TTTTTCTTCAGGGTATGACA,54,601,670,464,471,425,218,203,330,170,228,535,199,268,124,90


## CP1560

In [11]:
# Inputs for the CP1560 section: the guide mapping is comma-separated, the
# pDNA scores and read counts are tab-delimited.
cp1560_chip = pd.read_csv('../data/raw/cp1560_guide_mapping.csv')
cp1560_pdna = pd.read_csv('../data/raw/scores-BF20200608_B05_AAGB04_RDA208_G1_CP1560_M-AK47.txt', sep='\t')
cp1560_reads = pd.read_csv('../data/raw/counts-JD_GPP1868_Alfajaro_Wilen_CP1560.txt', sep='\t')

### pDNA

In [12]:
# Reduce the pDNA table to the barcode and its count, exposing the sample
# column under the name 'pDNA'.
cp1560_pdna = (
    cp1560_pdna
    .rename({'B05_AAGB04_RDA208_G1_CP1560_M-AK47': 'pDNA'}, axis='columns')
    .loc[:, ['Construct Barcode', 'pDNA']]
)
cp1560_pdna

Unnamed: 0,Construct Barcode,pDNA
0,CCAAAGGCGAGAGATAGTTG,56
1,TGCTGCTCAGTCCACCATTG,73
2,AACATCTTCATGCCTATGTG,67
3,GGTCTTCCTAATATGACTCA,58
4,CTCCCAATATACAAGCCCGA,44
...,...,...
143,ATCTACACAGCCGACACAGA,61
144,GTGGCCGGAATAGATCTGCA,41
145,TCAGGTGCAAGTGAAAGTGT,64
146,TTATGCATCAAATCAATGGG,68


### CP1560 reads

In [13]:
# Display labels for the two condition columns.
cp1560_condition_map_dict = {
    'Calu3-Mock_CP1560_Perturb': 'Mock',
    'Calu3-SARS2_CP1560_Perturb': 'SARS-CoV-2',
}

# 1. drop every column whose name matches 'CP0070' or 'Construct IDs',
# 2. relabel the condition columns,
# 3. attach the pDNA counts, keeping only barcodes present in both tables.
cp1560_reads_filtered = (
    cp1560_reads
    .loc[:, lambda reads: ~reads.columns.str.contains('CP0070|Construct IDs')]
    .rename(columns=cp1560_condition_map_dict)
    .pipe(lambda reads: cp1560_pdna.merge(reads, how='inner', on='Construct Barcode'))
)
cp1560_reads_filtered

Unnamed: 0,Construct Barcode,pDNA,Mock,SARS-CoV-2
0,CCAAAGGCGAGAGATAGTTG,56,56352,59846
1,TGCTGCTCAGTCCACCATTG,73,65969,67139
2,AACATCTTCATGCCTATGTG,67,63470,62851
3,GGTCTTCCTAATATGACTCA,58,56117,26316
4,CTCCCAATATACAAGCCCGA,44,23376,7586
...,...,...,...,...
143,ATCTACACAGCCGACACAGA,61,25496,9705
144,GTGGCCGGAATAGATCTGCA,41,30787,9164
145,TCAGGTGCAAGTGAAAGTGT,64,46671,16116
146,TTATGCATCAAATCAATGGG,68,45636,12887


## Outputs

In [24]:
# Sheet name -> table to write. Insertion order is the sheet order.
dfs = {'VeroE6 SARS-2 genomewide reads': cp70_sars2_reads_filtered, 'VeroE6 pan-cov genomewide reads': cp70_pan_cov_reads_filtered, 
       'C. sabaeus genomewide mapping': cp70_chip_filtered, 'VeroE6 pan-cov secondary reads': cp1564_reads_filtered, 
       'Secondary mapping': cp1564_chip_filtered, 'Calu3 SARS-2 tertiary reads': cp1560_reads_filtered, 
       'Tertiary mapping': cp1560_chip}

# The writer is used as a context manager so the workbook is saved and the
# file handle released on exit, even if writing a sheet raises. The explicit
# `writer.save()` call is not available in pandas >= 2.0.
with pd.ExcelWriter('../data/supplementary data/supplementary_reads_v1.xlsx', engine='xlsxwriter') as writer:
    for sheetname, df in dfs.items():  # one sheet per table
        print(sheetname)
        df.to_excel(writer, sheet_name=sheetname, index=False)
        worksheet = writer.sheets[sheetname]  # worksheet object for formatting
        # `df.items()` yields one (label, Series) pair per column, which also
        # works when column labels are duplicated (`df[label]` would then
        # return a DataFrame instead of a Series).
        for idx, (col, series) in enumerate(df.items()):
            cell_lengths = series.astype(str).map(len)
            # An empty column has no longest cell (max() would be NaN), so
            # fall back to the header width alone.
            widest_cell = int(cell_lengths.max()) if len(cell_lengths) else 0
            max_len = max(widest_cell, len(str(col))) + 1  # a little extra space
            worksheet.set_column(idx, idx, max_len)  # set column width

VeroE6 SARS-2 genomewide reads
VeroE6 pan-cov genomewide reads
C. sabaeus genomewide mapping
VeroE6 pan-cov secondary reads
Secondary mapping
Calu3 SARS-2 tertiary reads
Tertiary mapping
