### Filtering raw data

Author: Ruth Hanna

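This notebook combines pDNA and gDNA sequencing data and applies two filters. The first removes any sgRNAs with an outlier pDNA count. The second removes any sgRNAs with more than 5 off-targets in Match Bin I. The filter flags themselves are not recomputed here; they are read from `../data/interim/lognorms_with_filters.csv`.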

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from math import log, isnan

First, we compute lognorms for the pDNA read counts and merge them with the file containing lognorms for the gDNA conditions.

In [2]:
def calc_lognorm(df, col):
    """Add a log-normalized version of a count column to ``df``.

    Each value in ``col`` is scaled to reads per million (value * 1e6 divided
    by the column total), incremented by one, and log2-transformed. The result
    is stored in a new ``'lognorm'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the counts. It is modified in place.
    col : str
        Name of the column in ``df`` to normalize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``'lognorm'`` column.

    Raises
    ------
    ZeroDivisionError
        If ``df`` has rows but the column total is zero, so reads per million
        is undefined.
    """
    counts = df[col].astype(float)
    total = counts.sum()
    if len(counts) and total == 0:
        raise ZeroDivisionError(
            "cannot compute lognorm: column {!r} sums to zero".format(col)
        )
    reads_per_million = counts * 1e6 / total
    # Assign the raw values so no index alignment is involved.
    df.loc[:, 'lognorm'] = np.log2(reads_per_million + 1).to_numpy()
    return df

In [3]:
# Load the plasmid DNA (pDNA) read counts, log-normalize the 'count' column,
# and label the resulting lognorm column 'pDNA'
pdna_counts = pd.read_csv('../data/raw/M-AD81_AACH02_XPR050_G0_CP0070_ScoresSum.csv')
pdna = calc_lognorm(pdna_counts, 'count').rename(columns={'lognorm': 'pDNA'})
pdna.shape

(84963, 4)

In [4]:
# Load the gDNA lognorms and discard the two Calu3 (CP1560) columns
calu3_columns = ['Calu3-Mock_CP1560_Perturb', 'Calu3-SARS2_CP1560_Perturb']
lognorms = (
    pd.read_table('../data/raw/lognorm-JD_GPP1868_Alfajaro_Wilen_CP0070.txt')
    .drop(columns=calu3_columns)
)
lognorms

Unnamed: 0,Construct Barcode,Construct IDs,Mock_CP0070_Vero-E6-Cas9v2,SARS1-Bat#1_CP0070_Vero-E6-Cas9v2,MERS-WT #1_CP0070_Vero-E6-Cas9v2,MERS-T1015 #1_CP0070_Vero-E6-Cas9v2,VSV-SARS2#1_CP0070_Vero-E6-Cas9v2,SARS1-Bat#2_CP0070_Vero-E6-Cas9v2,MERS-WT #2_CP0070_Vero-E6-Cas9v2,MERS-T1015 #2_CP0070_Vero-E6-Cas9v2,VSV-SARS2#2_CP0070_Vero-E6-Cas9v2
0,AAAAAAAAAAACTCAAAGAT,BRDN0003986161,0.000000,0.173573,0.595734,0.000000,0.573473,0.000000,0.255746,0.000000,0.000000
1,AAAAAAAAACTGGAATCATG,BRDN0004006111,2.592409,3.906831,3.431170,4.280312,3.654484,3.963582,4.538532,5.443169,3.712646
2,AAAAAAACAAAGTGTGGCGT,BRDN0003996241,1.148732,0.768219,0.000000,2.390305,0.687175,0.000000,0.000000,0.000000,1.780735
3,AAAAAAACCTCTCGCTCCTG,BRDN0003994634,3.914452,3.607238,4.093186,4.104811,4.359246,0.000000,4.620444,3.153911,2.536655
4,AAAAAAAGAGACACTGTTGT,BRDN0003931926,2.833198,4.230426,4.598422,4.014567,4.044153,5.632985,4.612840,4.548194,4.523167
...,...,...,...,...,...,...,...,...,...,...,...
84958,TTTGTTTACGGCTTCGGCAA,BRDN0003927931,4.604633,4.118075,5.087953,5.177655,3.411428,1.407604,5.413972,4.076097,3.533470
84959,TTTGTTTCCACAAACATGTA,BRDN0003948722,2.799964,0.655642,2.522869,0.354123,2.220567,0.000000,0.000000,1.121217,1.126448
84960,TTTGTTTCCTCTATTCTACC,BRDN0003947974,1.636820,1.830647,2.171736,2.446611,0.000000,0.000000,2.449142,0.886613,0.000000
84961,TTTGTTTGCGGGTCACTTCG,BRDN0003941657,1.636820,0.595879,1.299062,2.953556,2.398047,0.000000,0.000000,0.985077,1.525672


In [10]:
# Join the pDNA lognorm onto the gDNA condition lognorms by construct barcode
pdna_lognorm = pdna[['Construct Barcode', 'pDNA']]
all_lognorms = pdna_lognorm.merge(lognorms, how='inner', on='Construct Barcode')
all_lognorms.shape

(84963, 12)

In [11]:
# Load the table that already carries the filter flags and keep only the
# construct identifiers plus the filter-related columns
filtered_sars_cov2 = pd.read_csv('../data/interim/lognorms_with_filters.csv')
filter_column_names = [
    'Construct Barcode',
    'Construct IDs',
    'pDNA_filter',
    'Match Bin I sum',
    'off_target_filter',
]
filter_cols = filtered_sars_cov2[filter_column_names]
filter_cols

Unnamed: 0,Construct Barcode,Construct IDs,pDNA_filter,Match Bin I sum,off_target_filter
0,TGATAGTAGGATAATAGCGA,BRDN0003924380,False,1.0,False
1,AGGGTTGTAGTAGTCCGTAA,BRDN0003924381,False,2.0,False
2,GCCTTCTATGAGGTCGAAGG,BRDN0003924382,False,1.0,False
3,GATGGTGTAGAGAGTAGTGG,BRDN0003924383,False,2.0,False
4,CCCTCAACAACCTACTATCG,BRDN0003924384,False,0.0,False
...,...,...,...,...,...
84958,ATGGTACGTCGCGAACGATG,BRDN0004009338,False,0.0,False
84959,CGGCGCCAATCGACGTGTCG,BRDN0004009339,False,0.0,False
84960,GCCTCGACCGTGCGACGATA,BRDN0004009340,False,0.0,False
84961,GGCTTAACGCCGCGTACAAG,BRDN0004009341,False,0.0,False


In [12]:
# Attach the filter columns to the lognorm table.
# This cell rebinds `all_lognorms`, so on a re-run the filter columns would
# already be present and the merge would produce suffixed duplicates
# (e.g. `pDNA_filter_x` / `pDNA_filter_y`), breaking later lookups of
# `pDNA_filter`. Dropping any previously merged filter columns first keeps
# the cell idempotent; on a fresh run nothing is dropped.
merge_keys = ['Construct Barcode', 'Construct IDs']
stale_filter_cols = [c for c in filter_cols.columns
                     if c not in merge_keys and c in all_lognorms.columns]
all_lognorms = (
    all_lognorms
    .drop(stale_filter_cols, axis=1)
    .merge(filter_cols, how='inner', on=merge_keys)
)
all_lognorms

Unnamed: 0,Construct Barcode,pDNA,Construct IDs,Mock_CP0070_Vero-E6-Cas9v2,SARS1-Bat#1_CP0070_Vero-E6-Cas9v2,MERS-WT #1_CP0070_Vero-E6-Cas9v2,MERS-T1015 #1_CP0070_Vero-E6-Cas9v2,VSV-SARS2#1_CP0070_Vero-E6-Cas9v2,SARS1-Bat#2_CP0070_Vero-E6-Cas9v2,MERS-WT #2_CP0070_Vero-E6-Cas9v2,MERS-T1015 #2_CP0070_Vero-E6-Cas9v2,VSV-SARS2#2_CP0070_Vero-E6-Cas9v2,pDNA_filter,Match Bin I sum,off_target_filter
0,TGATAGTAGGATAATAGCGA,3.756240,BRDN0003924380,3.446231,3.978788,3.163620,3.857296,3.654484,3.111858,3.427913,4.591817,4.134622,False,1.0,False
1,AGGGTTGTAGTAGTCCGTAA,3.316768,BRDN0003924381,3.374672,3.430338,3.421369,3.245703,3.838022,2.359635,2.515893,1.532838,3.066543,False,2.0,False
2,GCCTTCTATGAGGTCGAAGG,3.736162,BRDN0003924382,5.415098,5.241089,4.310550,3.558340,5.121060,3.603774,0.090380,1.903954,4.446951,False,1.0,False
3,GATGGTGTAGAGAGTAGTGG,4.060697,BRDN0003924383,4.473766,4.566393,4.504063,3.771727,2.614727,4.235591,3.999911,3.043993,2.117966,False,2.0,False
4,CCCTCAACAACCTACTATCG,3.776043,BRDN0003924384,4.381296,4.249946,4.549766,4.828000,4.127092,2.471356,4.279880,5.175066,4.105751,False,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84958,ATGGTACGTCGCGAACGATG,3.695147,BRDN0004009338,4.395272,4.063963,4.549766,4.733094,4.873317,1.908789,5.482332,4.381946,5.436861,False,0.0,False
84959,CGGCGCCAATCGACGTGTCG,3.204340,BRDN0004009339,3.122485,3.659261,4.111589,3.004460,2.969644,2.195794,3.223417,0.000000,2.860770,False,0.0,False
84960,GCCTCGACCGTGCGACGATA,3.814851,BRDN0004009340,4.793771,3.869461,4.800986,5.412517,4.409814,3.830225,5.142807,4.278958,3.416089,False,0.0,False
84961,GGCTTAACGCCGCGTACAAG,2.983674,BRDN0004009341,4.932493,4.638515,4.413023,4.586271,5.053752,4.380975,4.838242,4.234611,5.333573,False,0.0,False


In [13]:
# Keep only the rows where both filter flags are False
pdna_flag_clear = all_lognorms['pDNA_filter'].eq(False)
off_target_flag_clear = all_lognorms['off_target_filter'].eq(False)
filtered = all_lognorms[pdna_flag_clear & off_target_flag_clear]
filtered.shape

(84208, 15)

In [14]:
# Write out both tables: the rows that passed the filters, and the full
# table with the filter columns attached
for frame, out_path in [
    (filtered, '../data/interim/other_corona_filtered_lognorms.csv'),
    (all_lognorms, '../data/interim/other_corona_lognorms_with_filters.csv'),
]:
    frame.to_csv(out_path)