In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import json
import re

from sklearn.ensemble import IsolationForest


###########################################
#               FUNCTIONS                 #
###########################################


def clean_reference(ref,outliers):
    for i in outliers:
        ref = ref.drop(labels=i,axis=1)

    return ref

def norm_ref(ref):
    med = ref.median(axis=0)
    norm_ref = ref/med
    return norm_ref, med

def create_synthetic(norm_ref,med,N):
    synt = pd.DataFrame(index=norm_ref.index,columns=range(N))
    for j in range(N):
        for i in norm_ref.index:
            synt[j][i] = np.random.choice(norm_ref.loc[norm_ref.index==i].values.flatten())

        synt[j] = synt[j] * np.random.choice(med)
    synt.columns = ["sample_" + str(i) for i in range(synt.shape[1])]
    return synt

def add_features(synt,sample,gene,factor,exon=None):
    if exon is None:
        pattern = gene
        tmp = [str(synt.index[i]).split("_")[0] for i in range(synt.shape[0])]
        synt.loc[[tmp[i]==pattern for i in range(len(tmp))],sample] = synt.loc[[tmp[i]==pattern for i in range(len(tmp))]][sample] * factor
        
    if exon is not None:
        pattern = gene + "_" + exon
        tmp = [str(synt.index[i]).split("_")[0] + "_" + str(synt.index[i]).split("_")[1] for i in range(synt.shape[0])] 
        synt.loc[[tmp[i]==pattern for i in range(len(tmp))],sample] = synt.loc[[tmp[i]==pattern for i in range(len(tmp))]][sample] * factor
    
    res = synt
    return res


def openJson(path,n):
    """
    Opens json files in path to create a reads matrix
    """
    tmp = os.listdir(path)
    tmp = np.array(tmp)[np.array([bool(re.findall("depths.json$",tmp[i])) for i in range(len(tmp))])]
    reads = np.zeros((n,len(tmp)))
    amplicons = ["" for x in range(n)]
    q=0
    for p in tmp:
        with open(path+p) as json_file:
            data = json.load(json_file)
            for i in range(n):
                amplicons[i] = data[i]['name']
                for j in data[i]['depths']:
                    reads[i,q] = int(data[i]['depths'][j]['min'])
        q=q+1
    reads = pd.DataFrame(data = reads,index=amplicons)
    reads.columns = [i.split('_')[0]+'_'+i.split('_')[1] for i in tmp]
    return reads

def sumLibraries(reads):
    samples = np.unique([i.split('_')[0] for i in reads.columns])
    reads_f = np.zeros((reads.shape[0],len(samples)))
    q=0
    for i in samples:
        sub = reads.filter(regex="^"+i)
        reads_f[:,q] = sub.sum(axis=1)
        q=q+1
    reads_f = pd.DataFrame(data = reads_f,index=reads.index)
    reads_f.columns = list(samples)
    return(reads_f)

def correctIndex(reads,correspondance):
    l = ["" for x in range(len(reads.index))]
    q=0
    for i in reads.index:
        l[q] = i[(len(i)-9):len(i)]
        q=q+1
    final = reads[[correspondance["amplicon"][0] in l[x] for x in range(len(l))]]
    for i in correspondance["amplicon"]:
        if i!=correspondance["amplicon"][0]:
            final = pd.concat([final,reads[[i in l[x] for x in range(len(l))]]])
    final.index = correspondance["gene_exon"] + "_" + correspondance["amplicon"]
    return final


def filterReads(reads,N,output_path):
    reads = reads.loc[:,reads.sum(axis=0)>N]
    reads = reads.filter(regex="^(?!MSI)",axis=0)
    reads = reads.filter(regex="^(?!TN)")
    reads = reads.filter(regex="^(?!TP)")
    reads = reads.filter(regex="^(?!HD)")
    reads = reads.filter(regex="^(?!H2)")
    reads.to_csv(output_path, sep="\t",index=True)
    return(reads)


def normalizeReads(reads,output_path,save=False):
    reads_norm=reads/reads.median(axis=0)
    reads = np.log(reads+1)
    if save==True:
        reads_norm.to_csv(output_path, sep="\t",index=True)
    return(reads_norm)


def aberrantSamples(reads,conta='auto'):
    reads = reads/np.sum(reads)
    
    tmp = np.percentile(reads, 99, axis = 0)/np.mean(reads, axis = 0)
    random_data = np.array(tmp).reshape(-1,1)
    clf = IsolationForest(contamination=conta).fit(random_data)
    preds = clf.predict(random_data)
    res_amp = np.array(reads.columns)[preds==-1]
    
    tmp = np.percentile(reads, 1, axis = 0)/np.mean(reads, axis = 0)
    random_data = np.array(tmp).reshape(-1,1)
    clf = IsolationForest(contamination=conta).fit(random_data)
    preds = clf.predict(random_data)
    res_del = np.array(reads.columns)[preds==-1]
    
    res = np.unique(np.concatenate((res_amp,res_del)))
    
    return(res)


def aberrantSamples2(reads):
    read = reads.astype("float")
    tmp = np.percentile(reads, 5, axis = 0)/np.mean(reads, axis = 0)
    random_data = np.array(tmp).reshape(-1,1)

    clf = IsolationForest(contamination=0.1).fit(random_data)
    preds = clf.predict(random_data)
    res = np.array(reads.columns)[preds==-1]
    return(res)


def aberrantAmplicons(reads_norm,abSamples):
    for name in res:
        random_data = np.array(reads_norm[name]).reshape(-1,1)
        clf = IsolationForest(contamination=0.001).fit(np.array(np.mean(reads_norm, axis = 1)).reshape(-1,1))
        preds = clf.predict(random_data)
        print(name)
        print(np.array(reads_norm.index)[preds==-1])

def aberrantAmpliconsPerSample(name,reads_norm,conta='auto',verbose=False):
    random_data = np.array(reads_norm[name]).reshape(-1,1)
    clf = IsolationForest(contamination=conta).fit(np.array(np.mean(reads_norm, axis = 1)).reshape(-1,1))
    preds = clf.predict(random_data)
    if verbose:
        print(name)
        print(np.array(reads_norm.index)[preds==-1])
    return(np.array(reads_norm.index)[preds==-1])


def aberrantAmpliconsPerSample2(name,reads,abSamples,verbose=False):
    ab = [i in abSamples for i in reads.columns]
    normalReads = reads[np.delete(reads.columns,ab)]
    med = np.percentile(normalReads, 99, axis = 1)
    reads = (reads.T/med).T
    random_data = np.array(reads[name]).reshape(-1,1)
    clf = IsolationForest(contamination=0.05).fit(np.array(np.median(reads, axis = 1)).reshape(-1,1))
    preds = clf.predict(random_data)
    if verbose:
        print(name)
        print(np.array(reads.index)[preds==-1])
    return(np.array(reads.index)[preds==-1])



def percentagePerExon(amplified,reads,verbose=False):
    genes = [i.split('_')[0] for i in reads.index]
    exons = [i.split('_')[1] for i in reads.index]
    g_e = [genes[i]+'_'+exons[i] for i in range(len(genes))]
    n_ge = np.array([g_e.count(i) for i in np.unique(g_e)])
    ag = [i.split('_')[0] for i in amplified]
    ae = [i.split('_')[1] for i in amplified]
    age = [ag[i]+'_'+ae[i] for i in range(len(amplified))]
    f = pd.DataFrame(index=np.unique(age),columns=["percentage"])
    f = f.fillna(0)
    for i in range(len(np.unique(age))):
        f['percentage'][i] = 100*float(age.count(''.join(np.unique(age)[i]))/n_ge[np.unique(g_e)==''.join(np.unique(age)[i])])
        if verbose:
            if f['percentage'][i]>50:
                print(np.unique(age)[i] + ": " + str(round(f['percentage'][i]))+'%'+' des amplicons de l\'exon sont aberrants')
    return(f)

def percentagePerGene(amplified,reads,verbose=False):
    genes = [i.split('_')[0] for i in reads.index]
    ag = [i.split('_')[0] for i in amplified]
    n_g = np.array([genes.count(i) for i in np.unique(genes)])
    f = pd.DataFrame(index=np.unique(ag),columns=["percentage"])
    f = f.fillna(0)
    for i in range(len(np.unique(ag))):
        f['percentage'][i] = 100*float(ag.count(''.join(np.unique(ag)[i]))/n_g[np.unique(genes)==''.join(np.unique(ag)[i])])
        if verbose:
            if f['percentage'][i]>50:
                print(np.unique(ag)[i] + ": " + str(round(f['percentage'][i]))+'%'+' des amplicons du gene sont aberrants')
    return(f)

def amplifEvalGene(reads,abSamples,gene,sample):
    reads_m = reads/reads.median(axis=0)
    sub = reads_m
    for i in abSamples:
        sub = sub.drop(labels=i,axis=1)
    reads_m = reads_m.filter(regex="^"+gene,axis=0)
    reads_m = reads_m[sample]   
    val = np.mean(reads_m)/np.mean(sub.mean())
    if val==np.inf:
        val = 100
    return val

def scoreAmplif(k,n,N):
    p = n/N
    x = np.log(1/((p**k)*(1-p)**(n-k)))*(k/n)
    # score = 1/(1+np.exp(-x))
    score = x/390 + 190/390
    
    return x

def aberrantAmpliconsFinal(reads, reads_norm, abSamples,abSamples2,run,threshold):
    f = pd.DataFrame(columns=["run","name","gene","amplif","score"])
    q=0 
    for name in abSamples2:
        #abAmp = aberrantAmpliconsPerSample2(name,reads_norm,abSamples,verbose=False)
        abAmp = aberrantAmpliconsPerSample(name,reads_norm,verbose=False)
        if abAmp.shape!=(0,):
            genes = np.unique([i.split('_')[0] for i in abAmp])
            for gene in genes:
                r = re.compile(gene)
                abEx = list(filter(r.match, abAmp))
                exons1 = [i.split('_')[0]+"_"+i.split('_')[1] for i in abEx]
                tmp = reads.filter(regex="^"+gene,axis=0)
                exons2 = [i.split('_')[0]+"_"+i.split('_')[1] for i in tmp.index]
                
                score = scoreAmplif(len(abEx),tmp.shape[0],reads.shape[0])
                
                amplif = amplifEvalGene(reads, abSamples, gene, name)

                if score>threshold:
                    if amplif>1:
                        f.loc[q] = [run,name,gene,amplif,score]
                        q=q+1
                    #if amplif<1:
                    #    f.loc[q] = [run,name,gene,amplif,score]
                    #    q=q+1

    return(f)


def aberrantAmpliconsFinal2(reads, reads_norm, abSamples,abSamples2,run,threshold):
    f = pd.DataFrame(columns=["run","name","gene","amplif","score"])
        
    q=0 
    for name in abSamples2:
        #abAmp = aberrantAmpliconsPerSample2(name,reads_norm,abSamples,verbose=False)
        abAmp = aberrantAmpliconsPerSample(name,reads_norm,verbose=False)
        if abAmp.shape!=(0,):
            genes = abAmp
            for gene in genes:
                r = re.compile(gene)
                abEx = list(filter(r.match, abAmp))
                #print(abEx)
                tmp = reads.filter(regex="^"+gene,axis=0)                
                score = scoreAmplif(len(abEx),tmp.shape[0],reads.shape[0])
                
                amplif = amplifEvalGene(reads, abSamples, gene, name)

                if score>threshold:
                    if amplif>1:
                        f.loc[q] = [run,name,gene,amplif,score]
                        q=q+1
                    #if amplif<1:
                    #    f.loc[q] = [run,name,gene,amplif,score]
                    #    q=q+1

    return(f)




In [4]:
data = pd.read_csv("/Users/admin/Documents/CNV/article/test_dataset_capture.txt",sep="\t",index_col=0)
run = "test_capture"
output_path = "/Users/admin/Documents/CNV/"
abSamples = pd.DataFrame(aberrantSamples(data))
abSamples.to_csv(output_path+"absamples_capture.txt", sep="\t",index=True)

In [94]:
data = pd.read_csv("/Users/admin/Documents/CNV/article/test_dataset_capture_2.txt",sep="\t",index_col=0)
data/np.sum(data)

Unnamed: 0_level_0,17296,17297,17298,17299,17300,17301,17302,17303,17304,17305,...,17394,17395,17396,17397,17398,17399,17400,17401,17402,17403
targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SDHB_chr1_17345375_17345454,0.001704,0.001663,0.001707,0.001618,0.001666,0.001561,0.001737,0.001700,0.001657,0.001639,...,0.001966,0.001590,0.001483,0.001651,0.001594,0.001631,0.001639,0.001771,0.001708,0.001487
SDHB_chr1_17349102_17349226,0.000887,0.000788,0.000815,0.000835,0.000697,0.000861,0.000779,0.000885,0.000789,0.000780,...,0.000937,0.000899,0.000921,0.000813,0.000930,0.000804,0.000797,0.000750,0.000897,0.000933
SDHB_chr1_17350467_17350570,0.001577,0.001447,0.001363,0.001431,0.001527,0.001669,0.001449,0.001485,0.001550,0.001647,...,0.001789,0.001848,0.001691,0.001659,0.001708,0.001654,0.001847,0.001671,0.001601,0.001790
SDHB_chr1_17354243_17354361,0.001568,0.001234,0.001388,0.001312,0.001381,0.001489,0.001518,0.001455,0.001416,0.001388,...,0.001519,0.001466,0.001469,0.001516,0.001449,0.001510,0.001437,0.001296,0.001663,0.001488
SDHB_chr1_17355094_17355232,0.002091,0.001983,0.001766,0.001918,0.002041,0.002056,0.002110,0.002199,0.002037,0.001928,...,0.002343,0.002247,0.002165,0.002038,0.002048,0.001998,0.002035,0.002001,0.002189,0.002210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NBN_chr8_90996752_90996790,0.000416,0.000444,0.000380,0.000515,0.000496,0.000408,0.000363,0.000253,0.000407,0.000449,...,0.000718,0.000704,0.000610,0.000580,0.000729,0.000686,0.000723,0.000714,0.000552,0.000594
CDKN2A_chr9_21968227_21968242,0.002900,0.002907,0.003176,0.003207,0.003048,0.002855,0.002929,0.003148,0.003263,0.003097,...,0.004053,0.003007,0.002721,0.002884,0.002937,0.002968,0.003311,0.003081,0.002932,0.002811
CDKN2A_chr9_21970900_21971208,0.000987,0.001479,0.001605,0.001692,0.001471,0.001166,0.000918,0.000704,0.001205,0.001608,...,0.002514,0.002113,0.002240,0.002045,0.002168,0.002016,0.002219,0.001982,0.002105,0.002202
CDKN2A_chr9_21974676_21974827,0.000387,0.000499,0.000637,0.000623,0.000578,0.000464,0.000354,0.000294,0.000512,0.000607,...,0.001219,0.000842,0.000883,0.000883,0.000858,0.000849,0.000947,0.000914,0.000858,0.000829


In [4]:
from tqdm.notebook import tqdm
data = pd.read_csv("/Users/admin/Documents/CNV/article/test_dataset_capture_2.txt",sep="\t",index_col=0)
run = "test_capture"
output_path = "/Users/admin/Documents/CNV/article/"
normal = pd.read_csv("/Users/admin/Documents/CNV/article/capture_normal_samples.tsv",sep="\t",index_col=None)
normal = normal["neg"]
normal = [str(i) for i in normal]
anormal = pd.read_csv("/Users/admin/Documents/CNV/article/capture_anormal_samples.tsv",sep="\t",index_col=None)
anormal = anormal["ref"]
anormal = [str(i) for i in anormal]

def amplifEval(reads,abSamples,abAmp,sample):
    reads_m = reads/reads.median(axis=0)
    sub = reads_m
    for i in abSamples:
        sub = sub.drop(labels=i,axis=1)
    reads_m = reads_m.filter(regex="^"+abAmp,axis=0)
    reads_m = reads_m[sample]
    val = reads_m/np.mean(sub.mean())
    if np.mean(sub.mean())==0:
        val = 100
    return float(val)

def normalizeReads(reads,output_path,save=False):
    reads_norm=reads/reads.median(axis=0)
    if save==True:
        reads_norm.to_csv(output_path, sep="\t",index=True)
    return(reads_norm)

def aberrantAmpliconsPerSample(name,reads_norm,conta='auto',verbose=False,normal=None):
    random_data = np.array(reads_norm[name]).reshape(-1,1)
    to_compare = reads_norm.loc[:,np.in1d(reads_norm.columns,normal)]
    clf = IsolationForest(contamination=conta).fit(np.array(np.mean(to_compare, axis = 1)).reshape(-1,1))

    preds = clf.predict(random_data)
    if verbose:
        print(name)
        print(np.array(reads_norm.index)[preds==-1])
    return(np.array(reads_norm.index)[preds==-1])

final_norm = normalizeReads(data,output_path+'/reads_'+run+'_norm_capture.tsv',save=False)
#ff = aberrantAmpliconsFinal(data,final_norm,abSamples,data.columns,run,3)
#ff.to_csv(output_path+"res_capture.tsv", sep="\t",index=True)
abSamples = anormal
f = pd.DataFrame(columns=["run","name","target","amplif"])
q=0
for name in tqdm(anormal):
    #abAmp = aberrantAmpliconsPerSample2(name,reads_norm,abSamples,verbose=False)
    abAmp = aberrantAmpliconsPerSample(str(name),final_norm,verbose=False,normal=normal,conta=0.05)
    for i in abAmp:
        amplif = amplifEval(data,abSamples,i,str(name))
        f.loc[q] = [run,name,i,amplif]
        q=q+1

#f.to_csv(output_path+"res_capture_005.tsv", sep="\t",index=True)


HBox(children=(FloatProgress(value=0.0, max=66.0), HTML(value='')))




In [81]:
from tqdm.notebook import tqdm
import random

def aberrationDetection(reads,conta='auto'):
    reads_norm = reads/np.median(reads)
    reads_norm = reads_norm.T/np.sum(reads_norm.T)
    reads_norm = reads_norm.T
    final = pd.DataFrame(0,index=reads.index,columns=reads.columns)
    for i in reads.index:
        tmp = reads_norm.loc[i]
        random_data = np.array(tmp).reshape(-1,1)
        clf = IsolationForest(contamination=conta).fit(random_data)
        preds = clf.predict(random_data)
        final.loc[i] = 0.5*(1-preds)

    return(final)


data = pd.read_csv("/Users/admin/Documents/CNV/article/test_dataset_capture_3.txt",sep="\t",index_col=0)
run = "test_capture"
output_path = "/Users/admin/Documents/CNV/article/"
normal = pd.read_csv("/Users/admin/Documents/CNV/article/capture_normal_samples.tsv",sep="\t",index_col=None)
normal = normal["neg"]
normal = [str(i) for i in normal]

anormal = pd.read_csv("/Users/admin/Documents/CNV/article/capture_anormal_samples.tsv",sep="\t",index_col=None)
anormal = anormal["ref"]
#anormal = pd.read_csv("/Users/admin/Documents/CNV/article/multi_cnv_samples.txt",sep="\t",index_col=None)
#anormal = anormal["samples"]
anormal = [str(i) for i in anormal]


p=[]
q=[]
r=[]
s=[]
for k in tqdm(range(30)):
    norms = random.sample(normal,19)
    anorms = random.sample(anormal,2)
    select = norms+anorms
    sub = data[select]

    res = aberrationDetection(sub,conta=0.01)
    det = res.columns[np.sum(res)>0]
    ndet = sub.columns[~np.in1d(sub.columns,det)]
    pos = anorms
    p.extend([sum(np.in1d(det,pos))])
    q.extend([sum(np.in1d(det,neg))])
    r.extend([sum(np.in1d(ndet,pos))])
    s.extend([sum(np.in1d(ndet,neg))])

TP = np.mean(p)
FP = np.mean(q)
FN = np.mean(r)
TN = np.mean(s)
print("All: ")
print("TP: "+str(TP))
print("FP: "+str(FP))
print("FN: "+str(FN))
print("TN: "+str(TN))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))


All: 
TP: 1.9333333333333333
FP: 5.8
FN: 0.06666666666666667
TN: 6.766666666666667


In [82]:
output_path = "/Users/admin/Documents/CNV/article/"
pd.DataFrame(p).to_csv(output_path+"tp_capture_all_auto30_2samples_v3.txt", sep="\t",index=False)
pd.DataFrame(q).to_csv(output_path+"fp_capture_all_auto30_2samples_v3.txt", sep="\t",index=False)
pd.DataFrame(r).to_csv(output_path+"fn_capture_all_auto30_2samples_v3.txt", sep="\t",index=False)
pd.DataFrame(s).to_csv(output_path+"tn_capture_all_auto30_2samples_v3.txt", sep="\t",index=False)

In [None]:

data = pd.read_csv("/Users/admin/Documents/CNV/article/test_dataset_capture_2.txt",sep="\t",index_col=0)
run = "test_capture"
output_path = "/Users/admin/Documents/CNV/article/"
normal = pd.read_csv("/Users/admin/Documents/CNV/article/capture_normal_samples.tsv",sep="\t",index_col=None)
normal = normal["neg"]
normal = [str(i) for i in normal]

# All ---------------------
anormal = pd.read_csv("/Users/admin/Documents/CNV/article/capture_anormal_samples.tsv",sep="\t",index_col=None)
anormal = anormal["ref"]
anormal = [str(i) for i in anormal]

p=[]
q=[]
r=[]
s=[]
for k in tqdm(range(2)):
    norms = random.sample(normal,19)
    anorms = random.sample(anormal,2)
    select = norms+anorms
    sub = data[select]
    det = aberrantSamples(sub,conta=1/sub.shape[1])
    ndet = sub.columns[~np.in1d(sub.columns,det)]
    pos = anorms
    neg = norms
    p.extend([sum(np.in1d(det,pos))])
    q.extend([sum(np.in1d(det,neg))])
    r.extend([sum(np.in1d(ndet,pos))])
    s.extend([sum(np.in1d(ndet,neg))])

#pd.DataFrame(p).to_csv(output_path+"tp_capture_all_05_2samples_v2.txt", sep="\t",index=False)
#pd.DataFrame(q).to_csv(output_path+"fp_capture_all_05_2samples_v2.txt", sep="\t",index=False)
#pd.DataFrame(r).to_csv(output_path+"fn_capture_all_05_2samples_v2.txt", sep="\t",index=False)
#pd.DataFrame(s).to_csv(output_path+"tn_capture_all_05_2samples_v2.txt", sep="\t",index=False)
TP = np.mean(p)
FP = np.mean(q)
FN = np.mean(r)
TN = np.mean(s)
print("All: ")
print("TP: "+str(TP))
print("FP: "+str(FP))
print("FN: "+str(FN))
print("TN: "+str(TN))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=565.0), HTML(value='')))




  mask |= (ar1 == a)


HBox(children=(FloatProgress(value=0.0, max=565.0), HTML(value='')))




  mask |= (ar1 == a)


HBox(children=(FloatProgress(value=0.0, max=565.0), HTML(value='')))

In [84]:
import random

data = pd.read_csv("/Users/admin/Documents/CNV/article/test_dataset_capture_2.txt",sep="\t",index_col=0)
run = "test_capture"
output_path = "/Users/admin/Documents/CNV/article/"
normal = pd.read_csv("/Users/admin/Documents/CNV/article/capture_normal_samples.tsv",sep="\t",index_col=None)
normal = normal["neg"]
normal = [str(i) for i in normal]

# All ---------------------
anormal = pd.read_csv("/Users/admin/Documents/CNV/article/capture_anormal_samples.tsv",sep="\t",index_col=None)
anormal = anormal["ref"]
anormal = [str(i) for i in anormal]

p=[]
q=[]
r=[]
s=[]
for k in tqdm(range(1000)):
    norms = random.sample(normal,19)
    anorms = random.sample(anormal,2)
    select = norms+anorms
    sub = data[select]
    det = aberrantSamples(sub,conta=0.5)
    ndet = sub.columns[~np.in1d(sub.columns,det)]
    pos = anorms
    neg = norms
    p.extend([sum(np.in1d(det,pos))])
    q.extend([sum(np.in1d(det,neg))])
    r.extend([sum(np.in1d(ndet,pos))])
    s.extend([sum(np.in1d(ndet,neg))])

pd.DataFrame(p).to_csv(output_path+"tp_capture_all_05_2samples_v2.txt", sep="\t",index=False)
pd.DataFrame(q).to_csv(output_path+"fp_capture_all_05_2samples_v2.txt", sep="\t",index=False)
pd.DataFrame(r).to_csv(output_path+"fn_capture_all_05_2samples_v2.txt", sep="\t",index=False)
pd.DataFrame(s).to_csv(output_path+"tn_capture_all_05_2samples_v2.txt", sep="\t",index=False)
TP = np.mean(p)
FP = np.mean(q)
FN = np.mean(r)
TN = np.mean(s)
print("All: ")
print("TP: "+str(TP))
print("FP: "+str(FP))
print("FN: "+str(FN))
print("TN: "+str(TN))

# Multi ---------------------
anormal = pd.read_csv("/Users/admin/Documents/CNV/article/multi_cnv_samples.txt",sep="\t",index_col=None)
anormal = anormal["samples"]
anormal = [str(i) for i in anormal]

p=[]
q=[]
r=[]
s=[]
for k in tqdm(range(1000)):
    norms = random.sample(normal,19)
    anorms = random.sample(anormal,2)
    select = norms+anorms
    sub = data[select]
    det = aberrantSamples(sub,conta=0.5)
    ndet = sub.columns[~np.in1d(sub.columns,det)]
    pos = anorms
    neg = norms
    p.extend([sum(np.in1d(det,pos))])
    q.extend([sum(np.in1d(det,neg))])
    r.extend([sum(np.in1d(ndet,pos))])
    s.extend([sum(np.in1d(ndet,neg))])

pd.DataFrame(p).to_csv(output_path+"tp_capture_2samples_multi_05_v2.txt", sep="\t",index=False)
pd.DataFrame(q).to_csv(output_path+"fp_capture_2samples_multi_05_v2.txt", sep="\t",index=False)
pd.DataFrame(r).to_csv(output_path+"fn_capture_2samples_multi_05_v2.txt", sep="\t",index=False)
pd.DataFrame(s).to_csv(output_path+"tn_capture_2samples_multi_05_v2.txt", sep="\t",index=False)
TP = np.mean(p)
FP = np.mean(q)
FN = np.mean(r)
TN = np.mean(s)
print("Multi: ")
print("TP: "+str(TP))
print("FP: "+str(FP))
print("FN: "+str(FN))
print("TN: "+str(TN))


# Single ---------------------
anormal = pd.read_csv("/Users/admin/Documents/CNV/article/single_cnv_samples.txt",sep="\t",index_col=None)
anormal = anormal["samples"]
anormal = [str(i) for i in anormal]

p=[]
q=[]
r=[]
s=[]
for k in tqdm(range(1000)):
    norms = random.sample(normal,19)
    anorms = random.sample(anormal,2)
    select = norms+anorms
    sub = data[select]
    det = aberrantSamples(sub,conta=0.5)
    ndet = sub.columns[~np.in1d(sub.columns,det)]
    pos = anorms
    neg = norms
    p.extend([sum(np.in1d(det,pos))])
    q.extend([sum(np.in1d(det,neg))])
    r.extend([sum(np.in1d(ndet,pos))])
    s.extend([sum(np.in1d(ndet,neg))])

pd.DataFrame(p).to_csv(output_path+"tp_capture_single_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(q).to_csv(output_path+"fp_capture_single_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(r).to_csv(output_path+"fn_capture_single_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(s).to_csv(output_path+"tn_capture_single_2samples_05_v2.txt", sep="\t",index=False)
TP = np.mean(p)
FP = np.mean(q)
FN = np.mean(r)
TN = np.mean(s)
print("Single: ")
print("TP: "+str(TP))
print("FP: "+str(FP))
print("FN: "+str(FN))
print("TN: "+str(TN))


# Single del ---------------------
anormal = pd.read_csv("/Users/admin/Documents/CNV/article/singledel_cnv_samples.txt",sep="\t",index_col=None)
anormal = anormal["samples"]
anormal = [str(i) for i in anormal]

p=[]
q=[]
r=[]
s=[]
for k in tqdm(range(1000)):
    norms = random.sample(normal,19)
    anorms = random.sample(anormal,2)
    select = norms+anorms
    sub = data[select]
    det = aberrantSamples(sub,conta=0.5)
    ndet = sub.columns[~np.in1d(sub.columns,det)]
    pos = anorms
    neg = norms
    p.extend([sum(np.in1d(det,pos))])
    q.extend([sum(np.in1d(det,neg))])
    r.extend([sum(np.in1d(ndet,pos))])
    s.extend([sum(np.in1d(ndet,neg))])

pd.DataFrame(p).to_csv(output_path+"tp_capture_singledel_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(q).to_csv(output_path+"fp_capture_singledel_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(r).to_csv(output_path+"fn_capture_singledel_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(s).to_csv(output_path+"tn_capture_singledel_2samples_05_v2.txt", sep="\t",index=False)
TP = np.mean(p)
FP = np.mean(q)
FN = np.mean(r)
TN = np.mean(s)
print("Single del: ")
print("TP: "+str(TP))
print("FP: "+str(FP))
print("FN: "+str(FN))
print("TN: "+str(TN))


# Single dup ---------------------
anormal = pd.read_csv("/Users/admin/Documents/CNV/article/singledup_cnv_samples.txt",sep="\t",index_col=None)
anormal = anormal["samples"]
anormal = [str(i) for i in anormal]

p=[]
q=[]
r=[]
s=[]
for k in tqdm(range(1000)):
    norms = random.sample(normal,19)
    anorms = random.sample(anormal,2)
    select = norms+anorms
    sub = data[select]
    det = aberrantSamples(sub,conta=0.5)
    ndet = sub.columns[~np.in1d(sub.columns,det)]
    pos = anorms
    neg = norms
    p.extend([sum(np.in1d(det,pos))])
    q.extend([sum(np.in1d(det,neg))])
    r.extend([sum(np.in1d(ndet,pos))])
    s.extend([sum(np.in1d(ndet,neg))])

pd.DataFrame(p).to_csv(output_path+"tp_capture_singledup_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(q).to_csv(output_path+"fp_capture_singledup_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(r).to_csv(output_path+"fn_capture_singledup_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(s).to_csv(output_path+"tn_capture_singledup_2samples_05_v2.txt", sep="\t",index=False)
TP = np.mean(p)
FP = np.mean(q)
FN = np.mean(r)
TN = np.mean(s)
print("Single dup: ")
print("TP: "+str(TP))
print("FP: "+str(FP))
print("FN: "+str(FN))
print("TN: "+str(TN))


# Multi del ---------------------
anormal = pd.read_csv("/Users/admin/Documents/CNV/article/multidel_cnv_samples.txt",sep="\t",index_col=None)
anormal = anormal["samples"]
anormal = [str(i) for i in anormal]

p=[]
q=[]
r=[]
s=[]
for k in tqdm(range(1000)):
    norms = random.sample(normal,19)
    anorms = random.sample(anormal,2)
    select = norms+anorms
    sub = data[select]
    det = aberrantSamples(sub,conta=0.5)
    ndet = sub.columns[~np.in1d(sub.columns,det)]
    pos = anorms
    neg = norms
    p.extend([sum(np.in1d(det,pos))])
    q.extend([sum(np.in1d(det,neg))])
    r.extend([sum(np.in1d(ndet,pos))])
    s.extend([sum(np.in1d(ndet,neg))])

pd.DataFrame(p).to_csv(output_path+"tp_capture_multidel_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(q).to_csv(output_path+"fp_capture_multidel_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(r).to_csv(output_path+"fn_capture_multidel_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(s).to_csv(output_path+"tn_capture_multidel_2samples_05_v2.txt", sep="\t",index=False)
TP = np.mean(p)
FP = np.mean(q)
FN = np.mean(r)
TN = np.mean(s)
print("Multi del: ")
print("TP: "+str(TP))
print("FP: "+str(FP))
print("FN: "+str(FN))
print("TN: "+str(TN))



# Multi dup ---------------------
anormal = pd.read_csv("/Users/admin/Documents/CNV/article/multidup_cnv_samples.txt",sep="\t",index_col=None)
anormal = anormal["samples"]
anormal = [str(i) for i in anormal]

p=[]
q=[]
r=[]
s=[]
for k in tqdm(range(1000)):
    norms = random.sample(normal,19)
    anorms = random.sample(anormal,2)
    select = norms+anorms
    sub = data[select]
    det = aberrantSamples(sub,conta=0.5)
    ndet = sub.columns[~np.in1d(sub.columns,det)]
    pos = anorms
    neg = norms
    p.extend([sum(np.in1d(det,pos))])
    q.extend([sum(np.in1d(det,neg))])
    r.extend([sum(np.in1d(ndet,pos))])
    s.extend([sum(np.in1d(ndet,neg))])

pd.DataFrame(p).to_csv(output_path+"tp_capture_multidup_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(q).to_csv(output_path+"fp_capture_multidup_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(r).to_csv(output_path+"fn_capture_multidup_2samples_05_v2.txt", sep="\t",index=False)
pd.DataFrame(s).to_csv(output_path+"tn_capture_multidup_2samples_05_v2.txt", sep="\t",index=False)
TP = np.mean(p)
FP = np.mean(q)
FN = np.mean(r)
TN = np.mean(s)
print("Multi dup: ")
print("TP: "+str(TP))
print("FP: "+str(FP))
print("FN: "+str(FN))
print("TN: "+str(TN))



HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


All: 
TP: 1.424
FP: 13.861
FN: 0.576
TN: 5.139


FileNotFoundError: [Errno 2] No such file or directory: '/Users/admin/Documents/CNV/article/multi_cnv_samples.txt'

In [101]:
reads_norm = data/np.max(data)

final = pd.DataFrame(0,index=reads_norm.index,columns=reads_norm.columns)
for i in reads_norm.index:
    tmp = reads_norm.loc[i]
    print(tmp)

17296    0.464036
17297    0.453032
17298    0.462632
17299    0.442460
17300    0.461669
           ...   
17399    0.478432
17400    0.461271
17401    0.502998
17402    0.485184
17403    0.430301
Name: SDHB_chr1_17345375_17345454, Length: 96, dtype: float64
17296    0.241616
17297    0.214597
17298    0.220750
17299    0.228289
17300    0.193174
           ...   
17399    0.235877
17400    0.224443
17401    0.213165
17402    0.254708
17403    0.269965
Name: SDHB_chr1_17349102_17349226, Length: 96, dtype: float64
17296    0.429568
17297    0.394222
17298    0.369330
17299    0.391546
17300    0.423024
           ...   
17399    0.485437
17400    0.519775
17401    0.474739
17402    0.454875
17403    0.518122
Name: SDHB_chr1_17350467_17350570, Length: 96, dtype: float64
17296    0.427126
17297    0.336066
17298    0.376160
17299    0.358762
17300    0.382735
           ...   
17399    0.443085
17400    0.404360
17401    0.368187
17402    0.472552
17403    0.430788
Name: SDHB_chr1_173542

Name: BRCA2_chr13_32903579_32903630, Length: 96, dtype: float64
17296    0.495908
17297    0.555676
17298    0.632090
17299    0.609973
17300    0.558477
           ...   
17399    0.559972
17400    0.593133
17401    0.536801
17402    0.472110
17403    0.470979
Name: BRCA2_chr13_32905055_32905168, Length: 96, dtype: float64
17296    0.497164
17297    0.497355
17298    0.473302
17299    0.502783
17300    0.520942
           ...   
17399    0.568985
17400    0.554470
17401    0.554315
17402    0.533098
17403    0.525774
Name: BRCA2_chr13_32906408_32907525, Length: 96, dtype: float64
17296    0.556274
17297    0.534796
17298    0.499702
17299    0.553544
17300    0.550512
           ...   
17399    0.607125
17400    0.594275
17401    0.599631
17402    0.566854
17403    0.564823
Name: BRCA2_chr13_32910401_32915334, Length: 96, dtype: float64
17296    0.556370
17297    0.591582
17298    0.591412
17299    0.588619
17300    0.568384
           ...   
17399    0.545497
17400    0.464232
17401 

Name: NF1_chr17_29683477_29683601, Length: 96, dtype: float64
17296    0.661141
17297    0.661965
17298    0.633097
17299    0.654584
17300    0.671945
           ...   
17399    0.706493
17400    0.646817
17401    0.660676
17402    0.628864
17403    0.621070
Name: NF1_chr17_29683977_29684109, Length: 96, dtype: float64
17296    0.307779
17297    0.307510
17298    0.306115
17299    0.272477
17300    0.306066
           ...   
17399    0.351509
17400    0.329502
17401    0.336896
17402    0.360897
17403    0.340870
Name: NF1_chr17_29684286_29684388, Length: 96, dtype: float64
17296    0.358257
17297    0.344438
17298    0.284073
17299    0.318003
17300    0.339982
           ...   
17399    0.367122
17400    0.315719
17401    0.346637
17402    0.335565
17403    0.367424
Name: NF1_chr17_29685497_29685641, Length: 96, dtype: float64
17296    0.435474
17297    0.581551
17298    0.651500
17299    0.537902
17300    0.536840
           ...   
17399    0.539668
17400    0.497658
17401    0.562

Name: MSH2_chr2_47703505_47703711, Length: 96, dtype: float64
17296    0.489956
17297    0.423896
17298    0.430436
17299    0.455892
17300    0.450076
           ...   
17399    0.512762
17400    0.489904
17401    0.492430
17402    0.499866
17403    0.538849
Name: MSH2_chr2_47705410_47705659, Length: 96, dtype: float64
17296    0.551823
17297    0.538145
17298    0.547658
17299    0.565212
17300    0.566220
           ...   
17399    0.554019
17400    0.513746
17401    0.568386
17402    0.587235
17403    0.587646
Name: MSH2_chr2_47707834_47708011, Length: 96, dtype: float64
17296    0.638365
17297    0.668034
17298    0.664411
17299    0.665938
17300    0.638902
           ...   
17399    0.660443
17400    0.581106
17401    0.682805
17402    0.573012
17403    0.595939
Name: MSH2_chr2_47709917_47710089, Length: 96, dtype: float64
17296    0.216023
17297    0.335275
17298    0.400798
17299    0.407361
17300    0.342872
           ...   
17399    0.494903
17400    0.532508
17401    0.543

Name: NSD1_chr5_176618884_176619021, Length: 96, dtype: float64
17296    0.535867
17297    0.536486
17298    0.537824
17299    0.504017
17300    0.543062
           ...   
17399    0.580782
17400    0.664595
17401    0.549837
17402    0.558446
17403    0.596539
Name: NSD1_chr5_176631120_176631294, Length: 96, dtype: float64
17296    0.606535
17297    0.573415
17298    0.521714
17299    0.551492
17300    0.590696
           ...   
17399    0.639475
17400    0.654662
17401    0.618112
17402    0.666388
17403    0.686958
Name: NSD1_chr5_176636636_176639197, Length: 96, dtype: float64
17296    0.539263
17297    0.541005
17298    0.499602
17299    0.517815
17300    0.516093
           ...   
17399    0.534500
17400    0.518455
17401    0.545003
17402    0.507547
17403    0.498599
Name: NSD1_chr5_176662821_176662947, Length: 96, dtype: float64
17296    0.429549
17297    0.474268
17298    0.452423
17299    0.450449
17300    0.464053
           ...   
17399    0.525031
17400    0.517925
17401 

In [82]:
TP = np.mean(p)
FP = np.mean(q)
FN = np.mean(r)
TN = np.mean(s)

print(TP)
print(FP)
print(FN)
print(TN)

0.525
9.472
0.475
9.528


In [90]:

data = pd.read_csv("/Users/admin/Documents/CNV/article/test_dataset_capture_2.txt",sep="\t",index_col=0)
run = "test_capture"
output_path = "/Users/admin/Documents/CNV/article/"
normal = pd.read_csv("/Users/admin/Documents/CNV/article/capture_normal_samples.tsv",sep="\t",index_col=None)
normal = normal["neg"]
normal = [str(i) for i in normal]

# All ---------------------
anormal = pd.read_csv("/Users/admin/Documents/CNV/article/capture_anormal_samples.tsv",sep="\t",index_col=None)
anormal = anormal["ref"]
anormal = [str(i) for i in anormal]

p=[]
q=[]
r=[]
s=[]
for k in tqdm(range(1000)):
    norms = random.sample(normal,19)
    anorms = random.sample(anormal,1)
    select = norms+anorms
    sub = data[select]
    det = random.sample(select,10)
    ndet = sub.columns[~np.in1d(sub.columns,det)]
    pos = anorms
    neg = norms
    p.extend([sum(np.in1d(det,pos))])
    q.extend([sum(np.in1d(det,neg))])
    r.extend([sum(np.in1d(ndet,pos))])
    s.extend([sum(np.in1d(ndet,neg))])

pd.DataFrame(p).to_csv(output_path+"tp_capture_all_rand_1samples.txt", sep="\t",index=False)
pd.DataFrame(q).to_csv(output_path+"fp_capture_all_rand_1samples.txt", sep="\t",index=False)
pd.DataFrame(r).to_csv(output_path+"fn_capture_all_rand_1samples.txt", sep="\t",index=False)
pd.DataFrame(s).to_csv(output_path+"tn_capture_all_rand_1samples.txt", sep="\t",index=False)
TP = np.mean(p)
FP = np.mean(q)
FN = np.mean(r)
TN = np.mean(s)
print("All: ")
print("TP: "+str(TP))
print("FP: "+str(FP))
print("FN: "+str(FN))
print("TN: "+str(TN))

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


All: 
TP: 0.509
FP: 9.491
FN: 0.491
TN: 9.509


In [None]:
from tqdm.notebook import tqdm

synt = pd.read_csv("/home/scabello/Documents/run_juno/CNV/synthetic_raw.tsv",sep="\t",index_col=0)
#synt[synt==0] = 0.000000001

p=[]
q=[]
r=[]
s=[]

for k in tqdm(range(1,round(synt.shape[0]/2))): 
    factor = 10
    col = "sample_"+str(np.random.randint(32))
    row = synt.index[np.random.randint(synt.shape[0],size=k)]
    
    normal = synt.columns[~np.in1d(synt.columns,col)]
    
    for i in row:
        synt.at[i,col] = synt.at[i,col]*factor

    synt_norm = synt/synt.median(axis=0)
    #synt_norm = np.log(synt_norm)
    tmp = synt_norm[col]


    random_data = np.array(tmp).reshape(-1,1)
    clf = IsolationForest(contamination=pow(10,-6)).fit(np.array(np.mean(synt_norm[normal], axis = 1)).reshape(-1,1))
    preds = clf.predict(random_data)

    det = np.array(synt_norm.index)[preds==-1]
    ndet = synt.index[~np.in1d(synt.index,det)]
    pos = row
    neg = synt.index[~np.in1d(synt.index,pos)]
    
    p.extend([sum(np.in1d(det,pos))])
    q.extend([sum(np.in1d(det,neg))])
    r.extend([sum(np.in1d(ndet,pos))])
    s.extend([sum(np.in1d(ndet,neg))])

pd.DataFrame(p).to_csv("/home/scabello/Documents/run_juno/CNV/article/rows/tp_rows_10-6_4.tsv", sep="\t",index=False)
pd.DataFrame(q).to_csv("/home/scabello/Documents/run_juno/CNV/article/rows/fp_rows_10-6_4.tsv", sep="\t",index=False)
pd.DataFrame(r).to_csv("/home/scabello/Documents/run_juno/CNV/article/rows/fn_rows_10-6_4.tsv", sep="\t",index=False)
pd.DataFrame(s).to_csv("/home/scabello/Documents/run_juno/CNV/article/rows/tn_rows_10-6_4.tsv", sep="\t",index=False)
