In [4]:
import os
import json
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import argparse

from sklearn.ensemble import IsolationForest



###########################################
#               FUNCTIONS                 #
###########################################


def clean_reference(ref,outliers):
    for i in outliers:
        ref = ref.drop(labels=i,axis=1)

    return ref

def createReadsMatrix(pathToBam, bedFile, pathToBedtools, output=None, verbose=False):
    cmd = ["ls", pathToBam]
    res = subprocess.check_output(cmd)
    final=pd.DataFrame()

    for i in res.decode('utf-8').split("\n"):
        if i.endswith(".bam"):
            if verbose==True:
                print("Processing sample "+i[:-4]+"...")
            command = [
                pathToBedtools,
                "multicov",
                "-bams", pathToBam+"/"+i,
                "-bed", bedFile]

            res = subprocess.check_output(command)
            data = io.StringIO(res.decode("utf-8"))
            df = pd.read_csv(data, sep='\t',header=None)
            nam = i[:-4]
            final[nam] = df[6]
            if verbose==True:
                print(i[:-4]+" Done")
    final.index = list(df[3])

    if output is not None:
        if verbose==True:
            print("Reads matrix created !")
        final.to_csv(output,sep="\t")

    return(final)



def filterReads(reads,N,regtar=None,regsamp=None):
    if regtar is not None:
        reads = reads.filter(regex=regtar,axis=0)
    if regsamp is not None:
        reads = reads.filter(regex=regsamp)
    reads = reads.filter(regex="^(?!MSI)",axis=0)
    reads = reads.filter(regex="^(?!TN)")
    reads = reads.filter(regex="^(?!TP)")
    reads = reads.filter(regex="^(?!HD)")
    reads = reads.filter(regex="^(?!H2)")
    col = reads.columns
    reads = reads.loc[:,reads.sum(axis=0)>N]
    filtered_samples = col[~np.in1d(col,reads.columns)]
    return(reads, filtered_samples)


def normalizeReads(reads):
    reads_norm=reads/reads.sum(axis=0)
    return(reads_norm)


def aberrantSamples(reads,conta='auto'):    
    tmp = np.percentile(reads, 99, axis = 0)/np.mean(reads, axis = 0)
    random_data = np.array(tmp).reshape(-1,1)
    clf = IsolationForest(contamination=conta).fit(random_data)
    preds = clf.predict(random_data)
    res_amp = np.array(reads.columns)[preds==-1]
    
    tmp = np.percentile(reads, 1, axis = 0)/np.mean(reads, axis = 0)
    random_data = np.array(tmp).reshape(-1,1)
    clf = IsolationForest(contamination=conta).fit(random_data)
    preds = clf.predict(random_data)
    res_del = np.array(reads.columns)[preds==-1]
    
    res = np.unique(np.concatenate((res_amp,res_del)))
    norm = reads.columns[~np.in1d(reads.columns,res)]
    
    return(res, norm)


def aberrantAmpliconsPerSample(name,reads_norm,conta="auto"):
    random_data = np.array(reads_norm[name]).reshape(-1,1)
    clf = IsolationForest(contamination=conta).fit(np.array(np.mean(reads_norm, axis = 1)).reshape(-1,1))
    preds = clf.predict(random_data)
    if verbose:
        print(name)
        print(np.array(reads_norm.index)[preds==-1])
    return(np.array(reads_norm.index)[preds==-1])



def amplifEvalGene(reads,abSamples,gene,sample):
    reads_m = reads/reads.median(axis=0)
    reads_m = reads_m.filter(regex="^"+gene,axis=0)
    sub = reads_m
    for i in abSamples:
        sub = sub.drop(labels=i,axis=1)
    reads_m = reads_m[sample]
    val = np.mean(reads_m)/np.mean(sub.mean())
    if val==np.inf:
        val = 100
    return val


def scoreAmplif(k,n,N,mu):
    p = n/N
    x = np.log(1/((p**k)*(1-p)**(n-k)))*(k/n)
    return x

def aberrantAmpliconsFinal(reads, reads_norm, abSamples,scoreThreshold,ampThreshold,mode="extensive",run="ifCNV"):
    f = pd.DataFrame(columns=["run","name","gene","ratio","score"])
    
    #if mode=="extensive":
        #samples = abSamples[]

    q=0
    for name in samples:
        abAmp = aberrantAmpliconsPerSample(name,reads_norm)
        if abAmp.shape!=(0,):
            genes = np.unique([i.split('_')[0] for i in abAmp])
            for gene in genes:
                r = re.compile(gene)
                abEx = list(filter(r.match, abAmp))
                exons1 = [i.split('_')[0]+"_"+i.split('_')[1] for i in abEx]
                tmp = reads.filter(regex="^"+gene,axis=0)
                exons2 = [i.split('_')[0]+"_"+i.split('_')[1] for i in tmp.index]

                score = scoreAmplif(len(abEx),tmp.shape[0],reads.shape[0],len(abEx)/tmp.shape[0])

                amplif = amplifEvalGene(reads_norm, abSamples, gene, name)

                if score>scoreThreshold and amplif>ampThreshold:
                    f.loc[q] = [run,name,gene,amplif,score]
                    q=q+1


    return(f)



###########################################
#               MAIN                      #
###########################################
parser = argparse.ArgumentParser(description='ifCNV')
parser.add_argument('-i', '--input', type=str, help='Path to the input bam folder')
parser.add_argument('-b', '--bed', type=str, help='Path to the bed file')
parser.add_argument('-t', '--bedtools', type=str, help='Path to bedtools')
parser.add_argument('-m', '--mode', type=str, default='fast' help='fast or extensive')
parser.add_argument('-cs', '--contaSamples', default = "auto", help='Contamination parameter for the AberrantSamples function')
parser.add_argument('-ct', '--contaTargets', default = "auto", help='Contamination parameter for the AberrantTargets function')
parser.add_argument('-sT', '--scoreThreshold', type=int, default=5, help='Threshold on the localisation score')
parser.add_argument('-aT', '--ampThreshold', type=float, default=1.2, help='Threshold on the amplification ratio')
parser.add_argument('-rS', '--regSample', type=str, help='A pattern ')
parser.add_argument('-i', '--input', type=str, help='Path to the input bam folder')
parser.add_argument('-i', '--input', type=str, help='Path to the input bam folder')
parser.add_argument('-v', '--verbose', type=str, help='A boolean, default ')
args = parser.parse_args()

PATH = "/mnt/Bioinfo/BioTS/Results//ADIVaR/FDG_Juno_v3/"
output_path = "/mnt/chu-ngs/Labos/BioTS/SOMAT/DIAG/Juno/MET_amp/"
correspondance = pd.read_csv("/mnt/Bioinfo/BioTS/Projets/CNV/correspondance.txt",sep="\t")
run = args.run

if run is None:
   print("-r has no default value")
   ld = []
else:
   ld = os.listdir(PATH)
   ld = np.array(ld)[np.array([bool(re.findall(run,ld[i])) for i in range(len(ld))])]

N = 1536 #number of amplicons

res = pd.DataFrame(columns=["run","name","gene","ratio","score"])
k = 0

if len(ld)>0:
    path = PATH+ld[0]+'/data/'
    reads = openJson(path,N)
    reads = sumLibraries(reads)
    reads = correctIndex(reads,correspondance)
    final, filtered_samples = filterReads(reads,N*200,output_path=output_path+"reads_"+args.run+".tsv")
    q=0
    #filtered_samples = filtered_samples[[bool(re.search("^P", i)) for i in filtered_samples]]
    if len(filtered_samples)>0:
        for i in filtered_samples:
            tmp = [run,i,"-","Non Analysable","-"]
            res.loc[q] = tmp
            q=q+1
    final_norm = normalizeReads(final)
    abSamples = aberrantSamples(final)
    #allSamples = final.filter(regex="^P").columns

    ff = aberrantAmpliconsFinal(final,final_norm,abSamples,final.columns,run,1)
    if ff.shape[0]>0:
        res = res.append(ff)

    #res = res.loc[np.in1d(res["gene"],"MET"),:]
    res.index=range(res.shape[0])
    #negSamples = allSamples[~np.in1d(allSamples,res["name"])]
    #q = res.shape[0]
    #if len(negSamples)>0:
    #    for i in negSamples:
    #        q=q+1
    #        res.loc[q] = [run,i,"MET","Negatif","-"]

    res.to_csv(output_path+"CNV_Juno_all_"+run+".tsv", sep="\t",index=False)

else:
    print("Erreur. Verifier le nom du run.")


usage: ipykernel_launcher.py [-h] [-r RUN]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/admin/Library/Jupyter/runtime/kernel-ded14f25-2461-46e2-b892-e2e358060b54.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
