Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 131 lines (122 sloc) 6.92 KB
#Main pipeline connects all the scripts together
#Original Programmer:
#Refactored by:
#Usage: python <this_script> -i input.bam -o output_prefix -l match_length -m mismatch_number -c clip_type -r pcr_rm -M fdr_mutation -C fdr_cluster [-s species] [-t control.bam]
#Required packages: pysam, ghmm, pybedtools
import sys
import argparse
import logging
import os
from time import gmtime, strftime
from lib import *
def prepare_argparser():
    """Build the command-line parser for the pipeline.

    Returns:
        argparse.ArgumentParser: parser exposing the options below. After
        parsing, the values are available as ``infile``, ``ctrlfile``,
        ``outfile``, ``matchLength``, ``mismatch``, ``clipType``,
        ``dupRemove``, ``fdrMutation``, ``fdrCluster`` and ``species``.
        ``ctrlfile`` and ``species`` are optional and default to ``None``.
    """
    description = "Find mutations"
    epilog = "For command line options of each command, type %(prog)s COMMAND -h"
    argparser = argparse.ArgumentParser(description=description, epilog=epilog)
    argparser.add_argument("-i", "--input", dest="infile", type=str, required=True,
                           help="input bam file")
    argparser.add_argument("-t", "--control", dest="ctrlfile", type=str, required=False,
                           help="control bam file")
    # The value is used as a prefix for every output file and the option is
    # required, so the former "default is stdout" help text was wrong.
    argparser.add_argument("-o", "--output", dest="outfile", type=str, required=True,
                           help="output prefix")
    argparser.add_argument("-l", "--matchLength", dest="matchLength", type=int, required=True,
                           help="shorted matched segment length")
    argparser.add_argument("-m", "--mismatch", dest="mismatch", type=int, required=True,
                           help="maximum mismatch number")
    argparser.add_argument("-c", "--clipType", dest="clipType", type=int, required=True,
                           help="CLIP type (0)HITS-CLIP; (1)PAR-4SU; (2)PAR-6SG; (3)iCLIP",
                           choices=[0, 1, 2, 3])
    argparser.add_argument("-r", "--rmdup", dest="dupRemove", type=int, required=True,
                           help="Remove PCR duplicate (0)No removal; (1)Remove by read start; (2)Remove by sequence; ",
                           choices=[0, 1, 2])
    argparser.add_argument("-M", "--fdrMutation", dest="fdrMutation", type=float, required=True,
                           help="FDR for reliable mutations")
    argparser.add_argument("-C", "--fdrCluster", dest="fdrCluster", type=float, required=True,
                           help="FDR for enriched clusters")
    argparser.add_argument("-s", "--species", dest="species", type=str,
                           help="Species [\"mm10\",\"hg19\"]", choices=["mm10", "hg19"])
    # The caller immediately invokes parse_args() on the result, so the
    # parser must be returned.
    return argparser
def runPipeClip(infile,control,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,clipType,fdrReliableMutation,species):
    """Run the PIPE-CLIP workflow on one BAM file and write a summary log.

    Steps performed here: validate and read the input, look for enriched
    clusters, look for reliable mutations, print cross-linking sites (or
    enriched clusters as a fallback) and finally write
    ``<outputPrefix>.pipeclip.summary.log``.

    NOTE(review): this body was rebuilt from a copy of the file in which
    indentation, ``else:`` lines and the ``logging.<level>(`` prefixes of
    several calls had been stripped. The nesting below follows the log
    messages, and the stripped calls are assumed to be ``logging.info``.
    Places where statements appear to be missing are marked individually;
    compare against the original file before relying on this function.

    Args:
        infile: path of the input BAM file.
        control: path of a control BAM file, or None to run without control.
        outputPrefix: prefix used for the CLIP objects and the summary log.
        matchLength: minimum matched length (reported in the summary).
        mismatch: maximum mismatch count (reported in the summary).
        rmdup: PCR duplicate removal code (reported in the summary).
        fdrEnrichedCluster: FDR passed to the cluster enrichment step.
        clipType: CLIP type code (not used by any statement visible here).
        fdrReliableMutation: FDR passed to the mutation enrichment step.
        species: genome name; annotation is attempted for mm10, mm9, hg19.

    Raises:
        SystemExit: when the input cannot be used or no (enriched) cluster
            is found. NOTE(review): the exit statements themselves were not
            visible in the extracted source; they are inferred from the
            "Exit program" messages and the exit code 1 is an assumption.
    """
    myClip = CLIP.CLIP(infile, outputPrefix)
    controlClip = None
    controlFlag = False
    if control is not None:
        controlClip = CLIP.CLIP(control, outputPrefix + "Control")
    logging.info("Start to run")

    # Only one "File corruputed" message is visible and its pairing (input
    # test vs. read) is ambiguous, so both failures report it.
    if not myClip.testInput():  # check input
        logging.error("File corruputed, program exit.")
        sys.exit(1)
    logging.info("Input file OK,start to run PIPE-CLIP")
    logging.info("Species info %s", species)

    if controlClip is not None:  # test control file
        if controlClip.testInput():
            logging.info("Control file OK. Use control in mutation enrichment.")
            controlFlag = True
        else:
            logging.info("Control file format error. Continue without control.")

    if not myClip.readfile():
        logging.error("File corruputed, program exit.")
        sys.exit(1)

    # NOTE(review): matchLength, mismatch, clipType and rmdup are not consumed
    # by any visible statement except the summary; the read-filtering step
    # that presumably uses them is missing from the extracted source.
    if controlFlag:
        logging.info("Read in control file")
        # TODO(review): the statements that actually load the control file
        # are not visible in the extracted source; restore them here.

    if myClip.clusterCount > 0:
        logging.info("Get enriched clusters")
        status = Enrich.clusterEnrich_outsource(myClip, fdrEnrichedCluster)
        if status:
            logging.info("Found %d enriched clusters", myClip.sigClusterCount)
        else:
            logging.error("There is no enriched cluster found. Exit program")
            sys.exit(1)
    else:
        logging.error("There is no clusters found. Please check input.Exit program.")
        sys.exit(1)

    if myClip.mutationCount > 0:
        logging.info("Get reliable mutations")
        # TODO(review): the source shows an "if controlFlag: #use control"
        # branch whose body is missing; presumably a control-aware
        # enrichment belongs here. Until it is restored the control file is
        # NOT used and the plain enrichment runs in every case.
        Enrich.mutationEnrich(myClip, fdrReliableMutation)
        logging.info("There are %d reliable mutations", myClip.sigMutationCount)
    else:
        logging.warning("There is no mutation found in this BAM file.")

    # Start to get crosslinking sites
    outfilelist = []
    if myClip.sigClusterCount > 0 and myClip.sigMutationCount > 0:
        logging.info("Get cross-linking sites")
        # NOTE(review): no statement that fills myClip.crosslinking is
        # visible before this check; confirm against the original file.
        if len(myClip.crosslinking.keys()) > 0:
            outfilelist = myClip.printCrosslinkingSites()
        else:
            logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.")
            outfilelist = myClip.printEnrichClusters()
    elif myClip.sigClusterCount <= 0:
        logging.error("There is no enriched clusters for this sample, please check your input file. Exit.")
        sys.exit(1)
    elif myClip.sigMutationCount <= 0:
        logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.")
        outfilelist = myClip.printEnrichClusters()

    # annotation if possible
    if species in ["mm10", "mm9", "hg19"]:
        logging.info("Started to annotate cross-linking sits using HOMER")
        for name in outfilelist:
            #logging.debug("Start to do annotation for %s" % name)
            # TODO(review): the annotation call for each output file is
            # missing from the extracted source; restore it here.
            pass

    # output a status log file; the context manager closes it even if one of
    # the attribute lookups below raises.
    with open(outputPrefix + ".pipeclip.summary.log", "w") as logfile:
        logfile.write("PIPE-CLIP run finished. Parameters are:\n")
        logfile.write("Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f" % (infile, outputPrefix, matchLength, mismatch, rmdup, fdrEnrichedCluster, fdrReliableMutation) + "\n")
        logfile.write("There are %d mapped reads in input BAM file. After filtering,%d reads left" % (myClip.originalMapped, myClip.filteredAlignment) + "\n")
        logfile.write("%d out of %d clusters are enriched." % (myClip.sigClusterCount, len(myClip.clusters)) + "\n")
        logfile.write("%d out of %d mutations are reliable." % (myClip.sigMutationCount, myClip.mutationCount) + "\n")
        logfile.write("%d crosslinking site candidates are found, with %d supporting reliable mutations." % (len(myClip.crosslinking.keys()), len(myClip.crosslinkingMutations)) + "\n")
    logging.info("PIPE-CLIP finished the job, please check your results. :)")
if __name__=="__main__":
    # Script entry point: parse the command line and run the pipeline.
    arg_parser = prepare_argparser()
    args = arg_parser.parse_args()
    infile = args.infile  # Input SAM/BAM file
    control = args.ctrlfile  # Optional control BAM file (None when not given)
    outputPrefix = args.outfile  # Output prefix
    matchLength = args.matchLength  # Shortest matched segment length
    mismatch = args.mismatch  # Maximum mismatch number
    rmcode = args.dupRemove  # PCR duplicate removal code
    fdrEnrichedCluster = args.fdrCluster  # FDR for enriched clusters
    clipType = args.clipType  # CLIP type (0)HITS-CLIP; (1)PAR-4SU; (2)PAR-6SG; (3)iCLIP
    fdrReliableMutation = args.fdrMutation  # FDR for reliable mutations
    species = args.species  # Species ["mm10","hg19"]
    # The parsed values were previously never used; hand them to the pipeline
    # in the order of runPipeClip's parameters.
    # NOTE(review): no logging configuration is visible in this file, so
    # info-level messages may be suppressed unless it is set up elsewhere.
    runPipeClip(infile, control, outputPrefix, matchLength, mismatch, rmcode,
                fdrEnrichedCluster, clipType, fdrReliableMutation, species)
You can’t perform that action at this time.