In [None]:
import os
from pathlib import Path
from Bio import SeqIO
import re

In [None]:
# Directory that is scanned for the input fasta files and from which the
# output locations are derived; replace the placeholder with a real path.
dirPath = "provide the path for saving the filtered new fasta files"

# Suffix shared by the fasta files to process (change it if yours differs).
ext = '_contigs.fasta'

# Name of the file currently being processed; assigned inside the loop that
# walks dirPath.
contigs = ""

In [None]:
def is_fasta(filepath):
    """
    -------------------------------------------------------------------
        is_fasta
        Takes in the path of the file as an argument
        Checks whether the file starts with the fasta header marker '>'
        and prints the outcome of the check.

        Returns True when the first character of the file is '>',
        otherwise False (also when the file cannot be opened or decoded).
    -------------------------------------------------------------------

    """
    # Only the I/O can fail, so only the I/O is guarded; the comparison
    # below cannot raise.
    try:
        with open(filepath) as file:
            one_char = file.read(1)
    except (OSError, UnicodeDecodeError):
        print('Cannot read the file')
        return False

    # checking for the fasta file format identifier
    if one_char == '>':
        print('File is good')
        return True

    # Covers both a wrong first character and an empty file.
    print('WARNING: Does not contain fasta file formatting')
    return False

In [None]:

def fastainfo(fileHandle):
    """
    -------------------------------------------------------------------
    fastainfo
    Process an open fasta file handle.
    Prints the number of sequences and the total number of residues and
    returns them as the list [seqCount, bpCount].

    A line counts as a sequence header only when it starts with '>';
    every other line contributes its stripped length to the residue
    count. The handle is always closed before returning, because the
    callers hand over a freshly opened file and never close it.
    -------------------------------------------------------------------

    """
    seqCount = 0
    bpCount = 0

    try:
        # Scan through file line by line counting sequences and basepairs
        for line in fileHandle:
            if line.startswith('>'):
                seqCount += 1
            else:
                bpCount += len(line.strip())
    finally:
        # Close the file even if reading fails part-way through
        fileHandle.close()

    # Store seqCount and bpCount in seq_info, report them and return them
    seq_info = [seqCount, bpCount]
    print(f'Number of contigs: {seq_info[0]}\nNumber of total base pairs in the file: {seq_info[1]}')
    return seq_info

In [None]:
def _contig_length(seq_record):
    """Return the length stated in the record id, else the real sequence length."""
    fields = seq_record.id.split('_')
    try:
        # Index 3 of the '_'-separated id holds the length in headers shaped
        # like NODE_1_length_1234_cov_5.6 (presumably SPAdes-style; verify).
        return int(fields[3])
    except (IndexError, ValueError):
        # Header does not carry a usable length: measure the sequence itself.
        return len(seq_record.seq)


def bp_length(filepath, len, out_dir=None, out_name=None):
    """
    -------------------------------------------------------------------
    bp_length

    Process a fasta file
    Generates a new fasta file holding only the contigs whose length is
    strictly greater than the user defined base pair length.

    filepath : path of the fasta file to filter
    len      : minimum length threshold (exclusive); the name shadows the
               builtin but is kept so existing callers keep working
    out_dir  : directory for the filtered file; defaults to the
               notebook-level ``save_path`` variable when it exists,
               otherwise to a 'filtered_fasta' folder next to the input
    out_name : base name of the output file; defaults to the stem of
               ``filepath``

    The output is written to <out_dir>/<out_name>_<len>.fasta and replaces
    any previous file of that name, so re-running does not duplicate
    records.
    -------------------------------------------------------------------
    """
    min_length = len

    if out_dir is None:
        # Backward compatibility: the notebook defines save_path globally
        # before calling this function.
        out_dir = globals().get('save_path')
        if out_dir is None:
            out_dir = os.path.join(os.path.dirname(filepath), 'filtered_fasta')
    if out_name is None:
        out_name = Path(filepath).stem

    # Parse everything first so that an unreadable input does not leave an
    # empty output file behind.
    filtered = [
        '>' + seq_record.id + '\n' + str(seq_record.seq) + '\n'
        for seq_record in SeqIO.parse(filepath, 'fasta')
        if _contig_length(seq_record) > min_length
    ]

    # writing the sequences into a new fasta file
    os.makedirs(out_dir, exist_ok=True)
    completeName = os.path.join(out_dir, out_name + '_' + str(min_length) + ".fasta")
    with open(completeName, 'w') as fasta:
        fasta.writelines(filtered)

In [None]:
# Walk dirPath, report on every fasta file whose name ends with ext and write
# a filtered copy (contigs longer than 500 bp) into dirPath/filtered_fasta.

# Listing first makes a wrong dirPath fail here instead of creating folders.
# Sorting gives the same report order on every run.
input_names = sorted(os.listdir(dirPath))

# the path where the filtered files should be stored; os.path.join keeps it
# correct whether or not dirPath ends with a path separator
save_path = os.path.join(dirPath, 'filtered_fasta')
os.makedirs(save_path, exist_ok=True)

for files in input_names:
    if not files.endswith(ext):
        continue

    contigs = files
    # leading part of the file name up to the first '_' (not used further
    # in this cell)
    oldfilename = contigs.split('_')
    filename = oldfilename[0]

    # the path of the fasta file to be filtered
    filepath = os.path.join(dirPath, contigs)

    # base name used for the report and for naming the output file
    name = Path(filepath).stem

    print(f'File name: {name}')
    # checking the fasta file
    is_fasta(filepath)

    # getting the general description of the fasta file; the with-block
    # guarantees the handle is released even if fastainfo fails
    with open(filepath, 'r') as filehandle:
        fastainfo(filehandle)

    # for getting a fasta file with contigs greater than 500 bp
    bp_length(filepath, 500)

    print('\n')

In [None]:
# Path of the directory where the filtered fasta files are stored, built from
# dirPath; it is handed to newfastainfo to summarise those files.
savedDir = f'{dirPath}/filtered_fasta/'

def newfastainfo(savedDir):
    """
    Takes in the path of the folder where the filtered fasta is stored and
    prints the general description of every '.fasta' file in it with the
    fastainfo function.

    savedDir : directory to scan; it may be given with or without a
               trailing path separator.

    Files are reported in sorted name order so the output is the same on
    every run.
    """
    ext = '.fasta'
    for files in sorted(os.listdir(savedDir)):
        if not files.endswith(ext):
            continue

        filepath = os.path.join(savedDir, files)
        name = Path(filepath).stem

        # generating the information about filtered file
        print(f'File name: {name}')
        # the with-block releases the handle even if fastainfo fails
        with open(filepath, 'r') as filehandle:
            fastainfo(filehandle)

        print('\n')

In [None]:
# Print the contig and base-pair summary for every '.fasta' file in savedDir
newfastainfo(savedDir)