In [11]:
#count total number of sequences
import os
from Bio import SeqIO
import pandas as pd

In [None]:
#count sequences in a single FASTA file
def count_sequences_in_fasta(fasta_file):
    """Return the number of records in the FASTA file at ``fasta_file``.

    The file is opened in text mode and streamed through ``SeqIO.parse``,
    so records are tallied one at a time as the parser yields them.
    """
    with open(fasta_file, "r") as handle:
        return sum(1 for _record in SeqIO.parse(handle, "fasta"))


if __name__ == "__main__":
    # Hard-coded path of the single FASTA file whose records are counted.
    fasta_file_path = "G:\\Projects\\HIV_updated\\8th_output_graphs\\freq_logo_input_files\\merged_all_aa.fasta"
    total_sequences = count_sequences_in_fasta(fasta_file_path)
    # Report the record count on stdout.
    print("Total number of sequences:", total_sequences)

In [27]:
#count sequences from yearwise subdirectories
def count_sequences_in_fasta(file_path):
    """Count the FASTA records stored in ``file_path``.

    Opens the file for reading, iterates over the records produced by
    ``SeqIO.parse`` and returns how many were seen.
    """
    n_records = 0
    with open(file_path, "r") as handle:
        for _record in SeqIO.parse(handle, "fasta"):
            n_records += 1
    return n_records

def process_directory(directory_path):
    """Total the FASTA records found in each immediate subdirectory.

    For every subdirectory of ``directory_path``, the records of all regular
    files directly inside it whose names end in ``.fasta`` or ``.fas`` are
    counted with ``count_sequences_in_fasta`` and summed. Entries of
    ``directory_path`` that are not directories are ignored, and nested
    directories are not descended into.

    Args:
        directory_path: Path of the directory whose subdirectories are scanned.

    Returns:
        A list of ``(subdirectory_name, total_sequences)`` tuples, ordered by
        subdirectory name.
    """
    fasta_suffixes = (".fasta", ".fas")
    subdirectory_data = []
    # os.listdir() returns entries in arbitrary order; sort so the resulting
    # rows come out in the same order on every run.
    for subdir in sorted(os.listdir(directory_path)):
        subdir_path = os.path.join(directory_path, subdir)
        if not os.path.isdir(subdir_path):
            continue
        total_sequences = 0
        for file in os.listdir(subdir_path):
            file_path = os.path.join(subdir_path, file)
            # Require a regular file: a directory that merely has a FASTA-like
            # name would otherwise be handed to open() and raise.
            if file.endswith(fasta_suffixes) and os.path.isfile(file_path):
                total_sequences += count_sequences_in_fasta(file_path)
        subdirectory_data.append((subdir, total_sequences))
    return subdirectory_data


def create_excel_file(data, output_path):
    """Write ``data`` to an Excel file at ``output_path``.

    ``data`` is turned into a two-column table headed "Subdirectory" and
    "Total Sequences"; the DataFrame index is not written.
    """
    column_names = ["Subdirectory", "Total Sequences"]
    table = pd.DataFrame(data, columns=column_names)
    table.to_excel(output_path, index=False)

if __name__ == "__main__":
    # Hard-coded input directory (its subdirectories are scanned) and the
    # Excel file the per-subdirectory totals are written to; edit both paths
    # to run this on a different dataset.
    main_directory = 'G:\\Projects\\HIV_updated\\1st_output_nt_sep'
    excel_output_path = 'G:\\Projects\\HIV_updated\\1st_output_nt_sep\\raw_seq.xlsx'

    # Count records per subdirectory, then export the table.
    subdirectory_data = process_directory(main_directory)
    create_excel_file(subdirectory_data, excel_output_path)

In [18]:
#seq count for each subtype
def count_sequences_in_fasta(file_path):
    """Return how many FASTA records ``file_path`` contains.

    The file is read in text mode and parsed lazily with ``SeqIO.parse``;
    a file that yields no records gives 0.
    """
    record_total = 0
    with open(file_path, "r") as handle:
        # enumerate(..., start=1) leaves the running count in record_total.
        for record_total, _record in enumerate(SeqIO.parse(handle, "fasta"), start=1):
            pass
    return record_total

def process_fasta_files(directory_path):
    """Count the records of every FASTA file directly inside a directory.

    Only regular files whose names end in ``.fasta`` or ``.fas`` are
    considered; subdirectories are not descended into.

    Args:
        directory_path: Path of the directory holding the FASTA files.

    Returns:
        A list of ``(file_name, num_sequences)`` tuples, ordered by file name.
    """
    fasta_suffixes = (".fasta", ".fas")
    fasta_data = []
    # os.listdir() returns entries in arbitrary order; sort so the resulting
    # rows come out in the same order on every run.
    for file in sorted(os.listdir(directory_path)):
        if not file.endswith(fasta_suffixes):
            continue
        file_path = os.path.join(directory_path, file)
        # Skip directories that merely carry a FASTA-like name; passing one
        # to open() would raise.
        if not os.path.isfile(file_path):
            continue
        fasta_data.append((file, count_sequences_in_fasta(file_path)))
    return fasta_data

def create_excel_file(data, output_path):
    """Export ``data`` as an Excel table at ``output_path``.

    The table has the columns "Fasta File" and "Sequence Number"; the
    DataFrame index is left out of the file.
    """
    header = ["Fasta File", "Sequence Number"]
    pd.DataFrame(data, columns=header).to_excel(output_path, index=False)

if __name__ == "__main__":
    # Hard-coded directory containing the FASTA files, and the Excel file the
    # per-file counts are written to; edit both paths to run this on a
    # different dataset.
    fasta_directory = 'G:\\Projects\\HIV_updated\\4th_output_nt\\2016'
    excel_output_path = 'G:\\Projects\\HIV_updated\\4th_output_nt\\4seq_count.xlsx'

    # Count records per file, then export the table.
    fasta_data = process_fasta_files(fasta_directory)
    create_excel_file(fasta_data, excel_output_path)

In [12]:
#count sequences for sorted files (pure, recom, hybrid)

def count_sequences_in_fasta(file_path):
    """Return the number of FASTA records in the file at ``file_path``.

    The file is opened in text mode and its records are consumed lazily
    from ``SeqIO.parse`` while being counted.
    """
    with open(file_path, "r") as handle:
        records = SeqIO.parse(handle, "fasta")
        return sum(1 for _record in records)

def count_sequences_in_second_subdirectories(first_subdir_path):
    """Sum the FASTA record counts found one directory level below a path.

    Each subdirectory of ``first_subdir_path`` is listed (without recursing
    further) and every regular file in it whose name ends in ``.fasta`` or
    ``.fas`` is counted with ``count_sequences_in_fasta``. Files lying
    directly in ``first_subdir_path`` are not counted.

    Returns:
        The combined record count as an int.
    """
    fasta_suffixes = (".fasta", ".fas")
    running_total = 0
    for entry in os.listdir(first_subdir_path):
        nested_dir = os.path.join(first_subdir_path, entry)
        if not os.path.isdir(nested_dir):
            continue
        for name in os.listdir(nested_dir):
            if not name.endswith(fasta_suffixes):
                continue
            candidate = os.path.join(nested_dir, name)
            if os.path.isfile(candidate):
                running_total += count_sequences_in_fasta(candidate)
    return running_total

def process_directories(main_directory):
    """Build ``(name, total)`` rows for each subdirectory of ``main_directory``.

    Every entry of ``main_directory`` that is a directory is passed to
    ``count_sequences_in_second_subdirectories``; other entries are ignored.
    Rows follow the order in which ``os.listdir`` reports the entries.

    Returns:
        A list of ``(first_subdir_name, total_sequences)`` tuples.
    """
    named_paths = (
        (name, os.path.join(main_directory, name))
        for name in os.listdir(main_directory)
    )
    return [
        (name, count_sequences_in_second_subdirectories(path))
        for name, path in named_paths
        if os.path.isdir(path)
    ]

def create_excel_file(data, output_path):
    """Save ``data`` as an Excel table at ``output_path``.

    The table has the columns "First Subdirectory" and "Total Sequences";
    the DataFrame index is not written.
    """
    frame = pd.DataFrame(
        data,
        columns=["First Subdirectory", "Total Sequences"],
    )
    frame.to_excel(output_path, index=False)

if __name__ == "__main__":
    # Hard-coded main directory (holding the first-level subdirectories) and
    # the Excel file the totals are written to; edit both paths to run this
    # on a different dataset.
    main_directory = 'G:\\Projects\\HIV_updated\\7th_output_no_con'
    excel_output_path = 'G:\\Projects\\HIV_updated\\7th_output_no_con\\output.xlsx'

    # Total the records under each first-level subdirectory, then export.
    subdirectory_data = process_directories(main_directory)
    create_excel_file(subdirectory_data, excel_output_path)


In [29]:
#count yearwise and subtype wise sequences
import os
from Bio import SeqIO
import pandas as pd

def count_sequences_in_fasta(fasta_file):
    """Return the number of FASTA records read from ``fasta_file``.

    ``fasta_file`` is handed straight to ``SeqIO.parse``; this function does
    not open the file itself.
    """
    n_records = 0
    for _record in SeqIO.parse(fasta_file, 'fasta'):
        n_records += 1
    return n_records

def count_sequences_in_directory(directory_path):
    """Count FASTA records in every FASTA file below ``directory_path``.

    The tree is traversed with ``os.walk``; each file whose name ends in
    ``.fasta`` or ``.fas`` is counted with ``count_sequences_in_fasta``.

    Returns:
        A ``(total, per_file)`` tuple. ``total`` is the sum over all counted
        files. ``per_file`` maps each file name without its extension to that
        file's count; when two files share such a name the later one
        replaces the earlier entry, while ``total`` still includes both.
    """
    grand_total = 0
    per_file = {}
    for current_dir, _subdirs, filenames in os.walk(directory_path):
        for filename in filenames:
            if not filename.endswith(('.fasta', '.fas')):
                continue
            n_records = count_sequences_in_fasta(os.path.join(current_dir, filename))
            grand_total += n_records
            stem, _extension = os.path.splitext(filename)
            per_file[stem] = n_records
    return grand_total, per_file

def main(base_directory,
         output_path='G:\\Projects\\HIV_updated\\2nd_output_nt_no_con_sorted\\seq_count.xlsx'):
    """Write a workbook of sequence counts per first- and second-level directory.

    Each directory directly under ``base_directory`` becomes one row of the
    summary sheet: its name, the total number of sequences found under all of
    its subdirectories, and one column each for the 'pure', 'hybrid' and
    'recom' subdirectories. A further sheet per first-level directory lists
    the per-file counts returned by ``count_sequences_in_directory``.

    Args:
        base_directory: Directory whose immediate subdirectories are scanned.
        output_path: Destination ``.xlsx`` file. Defaults to the location the
            script has always written to.

    Notes:
        * A second-level directory with any other name still contributes to
          the row total and the per-file sheet, but has no column of its own.
        * Files lying directly in a first-level directory are not counted.
    """
    second_subdir_names = ['pure', 'hybrid', 'recom']
    data = []
    first_subdir_counts = {}

    # os.listdir() order is arbitrary; sort so rows and sheets are stable.
    for first_subdir in sorted(os.listdir(base_directory)):
        first_subdir_path = os.path.join(base_directory, first_subdir)
        if not os.path.isdir(first_subdir_path):
            continue

        # Start every row from zeroed counters. Resetting only when nested
        # directories exist would let a directory without any inherit the
        # previous row's numbers.
        second_subdir_counts = dict.fromkeys(second_subdir_names, 0)
        first_subdir_count = 0
        first_subdir_file_counts = {}

        for second_subdir in os.listdir(first_subdir_path):
            second_subdir_path = os.path.join(first_subdir_path, second_subdir)
            if not os.path.isdir(second_subdir_path):
                continue
            second_subdir_count, file_counts = count_sequences_in_directory(second_subdir_path)
            first_subdir_count += second_subdir_count
            first_subdir_file_counts.update(file_counts)
            # Only the known categories have a column; storing any other name
            # would make the row wider than the header and break DataFrame
            # construction.
            if second_subdir in second_subdir_counts:
                second_subdir_counts[second_subdir] = second_subdir_count

        first_subdir_counts[first_subdir] = first_subdir_file_counts
        data.append(
            [first_subdir, first_subdir_count]
            + [second_subdir_counts[name] for name in second_subdir_names]
        )

    columns = ['First Subdirectory', 'Total'] + second_subdir_names
    df = pd.DataFrame(data, columns=columns)

    # One writer produces the summary sheet (default name) followed by one
    # per-file sheet per first-level directory, so the workbook is written
    # once instead of being saved and then reopened in append mode.
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)
        for subdir, file_counts in first_subdir_counts.items():
            file_counts_df = pd.DataFrame(list(file_counts.items()))
            file_counts_df.to_excel(writer, sheet_name=f'{subdir}', index=False, header=False)

if __name__ == "__main__":
    # Hard-coded directory whose first- and second-level subdirectories are
    # summarised by main().
    base_directory = 'G:\\Projects\\HIV_updated\\2nd_output_nt_no_con_sorted'
    main(base_directory)
