In [48]:
import pandas as pd
import os
import Levenshtein

# Alternative location for the pre-training gRNA table, left commented out:
#PRETRAIN_GRNAS_PATH = '../data/main_dataframes/pretrain_grnas.csv'
# CSV loaded by get_pretrain_grnas(); its 'sequence' column supplies the reference set.
PRETRAIN_GRNAS_PATH = 'crispr_on_grnas.csv'
# CSV loaded by remove_leakage_leenay(); its '21mer' column is the one checked for leakage.
LEENAY_PATH = 'leenay_full_data.csv'
# Tab-separated table loaded by remove_leakage_U6T7(); its 'seq' column is the one checked for leakage.
U6T7_PATH = '13059_2016_1012_MOESM14_ESM.tsv'




In [49]:
def get_pretrain_grnas(no_GG=False):
    """Load the distinct sequences listed in the pre-training gRNA CSV.

    The file at ``PRETRAIN_GRNAS_PATH`` is read and the values of its
    ``sequence`` column are collected into a set, so duplicates collapse.

    Args:
        no_GG: When true, the final two characters of every sequence are
            cut off before the set is returned (whatever those characters
            are), which may merge sequences that differ only in that suffix.

    Returns:
        A set of sequence strings.
    """
    sequences = set(pd.read_csv(PRETRAIN_GRNAS_PATH)['sequence'].values)

    if not no_GG:
        return sequences

    # Trim the two-character suffix from each sequence.
    return {sequence[:-2] for sequence in sequences}



In [50]:
def is_leaking(seq, grnas_set, max_distance=4):
    """Report whether ``seq`` is too similar to any reference sequence.

    Similarity is measured with ``Levenshtein.hamming``. Scanning stops at
    the first reference sequence that is close enough.

    Args:
        seq: Candidate sequence to check.
        grnas_set: Iterable of reference sequences to compare against.
        max_distance: Exclusive threshold. A Hamming distance strictly
            below this value counts as leakage. Defaults to 4, the value
            that was previously hard-coded.

    Returns:
        True if at least one reference sequence is closer than
        ``max_distance``; False otherwise, including when ``grnas_set``
        is empty.

    NOTE(review): how sequences of different lengths are handled is decided
    by the installed ``Levenshtein.hamming`` (it may pad or raise) - confirm
    against the version in use.
    """
    return any(
        Levenshtein.hamming(seq, grna) < max_distance
        for grna in grnas_set
    )

In [51]:
def remove_leakage_leenay():
    """Write a copy of the Leenay table without the rows flagged as leaking.

    Reads the CSV at ``LEENAY_PATH``, applies ``is_leaking`` to every value
    of the ``21mer`` column against the set returned by
    ``get_pretrain_grnas(no_GG=True)``, and saves the rows that were not
    flagged to ``leenay_no_leakage.csv`` (no index column). The row counts
    before and after filtering are printed.
    """
    source_df = pd.read_csv(LEENAY_PATH)
    print('Number of sequences before removing leakage:', len(source_df))

    # NOTE(review): the trimmed reference set is presumably used because the
    # '21mer' values are two characters shorter than the pre-training
    # sequences - confirm against the data.
    reference_grnas = get_pretrain_grnas(no_GG=True)

    # Attach the flag as a temporary column, keep unflagged rows, then drop it.
    flagged_df = source_df.assign(
        is_leaking=source_df['21mer'].apply(lambda seq: is_leaking(seq, reference_grnas))
    )
    kept_df = flagged_df.loc[flagged_df['is_leaking'].eq(False)].drop(columns=['is_leaking'])

    kept_df.to_csv('leenay_no_leakage.csv', index=False)
    print('Number of sequences after removing leakage:', len(kept_df))


In [52]:
# Filter the Leenay table; writes leenay_no_leakage.csv and prints the row counts before and after.
remove_leakage_leenay()

Number of sequences before removing leakage: 1555
Number of sequences after removing leakage: 1539


In [53]:
def remove_leakage_U6T7():
    """Write a copy of the U6/T7 table without the rows flagged as leaking.

    Reads the tab-separated file at ``U6T7_PATH``, applies ``is_leaking`` to
    every value of the ``seq`` column against the set returned by
    ``get_pretrain_grnas(no_GG=False)``, and saves the rows that were not
    flagged to ``U6T7_no_leakage.tsv`` (tab-separated, no index column).
    The row counts before and after filtering are printed.
    """
    source_df = pd.read_csv(U6T7_PATH, sep='\t')
    print('Number of sequences before removing leakage:', len(source_df))

    # Untrimmed reference sequences are used for this table.
    reference_grnas = get_pretrain_grnas(no_GG=False)

    # Attach the flag as a temporary column, keep unflagged rows, then drop it.
    flagged_df = source_df.assign(
        is_leaking=source_df['seq'].apply(lambda seq: is_leaking(seq, reference_grnas))
    )
    kept_df = flagged_df.loc[flagged_df['is_leaking'].eq(False)].drop(columns=['is_leaking'])

    # Persist in the same tab-separated layout as the input.
    kept_df.to_csv('U6T7_no_leakage.tsv', sep='\t', index=False)
    print('Number of sequences after removing leakage:', len(kept_df))


In [54]:
# Filter the U6/T7 table; writes U6T7_no_leakage.tsv and prints the row counts before and after.
remove_leakage_U6T7()

Number of sequences before removing leakage: 31625
Number of sequences after removing leakage: 25970


In [55]:
# Per-dataset row counts of the U6/T7 table before and after leakage removal.
# Load the original table and the filtered copy written by remove_leakage_U6T7().
df_u6t7 = pd.read_csv(U6T7_PATH, sep='\t')
df_u6t7_no_leakage = pd.read_csv('U6T7_no_leakage.tsv', sep='\t')

# Names of the datasets present in each table.
u6t7_set = set(df_u6t7['dataset'].values)
u6t6_set = set(df_u6t7_no_leakage['dataset'].values)

# Rows per dataset. value_counts() counts every dataset in a single pass and
# returns the counts ordered from largest to smallest, so no separate sort
# is needed.
set_to_size = df_u6t7['dataset'].value_counts().to_dict()
set_to_size_no_leakage = df_u6t7_no_leakage['dataset'].value_counts().to_dict()

# Print "<dataset>: <rows before> -> <rows after>", largest dataset first.
for dataset, size in set_to_size.items():
    # A dataset whose rows were all removed is absent from the filtered
    # table; report 0 for it instead of failing with a KeyError.
    print(f'{dataset}: {size} -> {set_to_size_no_leakage.get(dataset, 0)}')


hart2016-HelaLib1Avg: 4256 -> 4217
hart2016-Hct1162lib1Avg: 4239 -> 4199
hart2016-Rpe1Avg: 4214 -> 4175
hart2016-HelaLib2Avg: 3845 -> 3816
doench2016_hg19: 2333 -> 430
doench2016plx_hg19: 2333 -> 430
xu2015TrainKbm7: 2076 -> 2056
xu2015TrainHl60: 2076 -> 2056
chari2015TrainK562: 1239 -> 1198
chari2015Train293T: 1234 -> 1193
morenoMateos2015: 1020 -> 1017
doench2014-Mm: 951 -> 145
doench2014-Hs: 881 -> 110
liu2016_mm9: 205 -> 205
gagnon2014: 111 -> 111
varshney2015: 102 -> 102
shkumatavaOthers: 84 -> 84
ghandi2016_ci2: 72 -> 72
shkumatavaPerrine: 62 -> 62
farboud2015: 50 -> 50
ren2015: 39 -> 39
xu2015: 35 -> 35
teboulVivo_mm9: 30 -> 30
concordet2-Hs: 26 -> 26
xu2015AAVS1: 20 -> 20
eschstruth: 18 -> 18
concordet2-Mm: 18 -> 18
shkumatavaAngelo: 17 -> 17
schoenigRn: 15 -> 15
xu2015FOX-AR: 15 -> 15
schoenigMm: 6 -> 6
schoenigHs: 3 -> 3
