# Bacterial Blast Data Handling 

## 1.1 Create Bacterial Master file

In [2]:
import os

# Directory containing blast result files
blast_output_directory = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts"

# Name of the master file
master_filename = "bacterial_master_blaster.txt"

# Path to the master file
master_filepath = os.path.join(blast_output_directory, master_filename)

# Open the master file in append mode
with open(master_filepath, "a") as master_file:
    # Iterate through each file in the blast output directory
    for filename in os.listdir(blast_output_directory):
        # Check if the file is a blast result file
        if filename.endswith("_blastp_results.txt"):
            # Get the full path of the blast result file
            blast_result_filepath = os.path.join(blast_output_directory, filename)
            # Open the blast result file and read its content
            with open(blast_result_filepath, "r") as blast_result_file:
                # Read the content of the blast result file and write it to the master file
                master_file.write(blast_result_file.read())
                # Write a newline character to separate the content of each blast result file
                master_file.write("\n")

print("All blast_results.txt files have been concatenated into the master file:", master_filepath)


All blast_results.txt files have been concatenated into the master file: /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster.txt


### 1.1.1 Make Masterfile Csv

In [3]:
import os

# Directory containing blast result files
blast_output_directory = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts"

# Name of the master file
master_filename = "bacterial_master_blaster.txt"

# Path to the master file
master_filepath = os.path.join(blast_output_directory, master_filename)

# Counter for the number of files processed
file_count = 0

# Open the master file in append mode
with open(master_filepath, "a") as master_file:
    # Iterate through each file in the blast output directory
    for filename in os.listdir(blast_output_directory):
        # Check if the file is a blast result file
        if filename.endswith("_blastp_results.txt"):
            file_count += 1  # Increment the file count
            # Get the full path of the blast result file
            blast_result_filepath = os.path.join(blast_output_directory, filename)
            # Open the blast result file and read its content
            with open(blast_result_filepath, "r") as blast_result_file:
                # Read the content of the blast result file and write it to the master file
                master_file.write(blast_result_file.read())
                # Write a newline character to separate the content of each blast result file
                master_file.write("\n")

print(f"{file_count} blast_results.txt files have been concatenated into the master file:", master_filepath)


44 blast_results.txt files have been concatenated into the master file: /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster.txt


### 1.1.2 Make additions to masterfile to analyse hits in regard to protein type and name

In [63]:
import pandas as pd

# File paths
bacterial_master_blaster_filepath = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster_results.csv"
protein_names_filepath = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/protein_names.csv"
output_csv_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster_proteins_results.csv"

# Read the bacterial master blaster CSV
bacterial_master_blaster_df = pd.read_csv(bacterial_master_blaster_filepath)

# Read the protein names CSV
protein_names_df = pd.read_csv(protein_names_filepath)

# Merge the dataframes on 'Accession ID'
merged_df = pd.merge(bacterial_master_blaster_df, protein_names_df[['Accession ID', 'Protein Names', 'Protein Type']], on='Accession ID', how='left')

# Reorder columns to place 'Protein Names' and 'Protein Type' at the start
cols = ['Protein Names', 'Protein Type'] + [col for col in bacterial_master_blaster_df.columns if col not in ['Protein Names', 'Protein Type']]
merged_df = merged_df[cols]

# Write the merged dataframe to a new CSV file
merged_df.to_csv(output_csv_path, index=False)

# Count the number of unique protein names
unique_protein_names = merged_df['Protein Names'].nunique()
print(f"Number of unique protein names: {unique_protein_names}")

# Breakdown of each protein type
protein_type_counts = merged_df['Protein Type'].value_counts()
print("Breakdown of protein types:")
print(protein_type_counts)

print(f"Merged results saved to '{output_csv_path}'")

Number of unique protein names: 41
Breakdown of protein types:
Antiviral Proteins                            8
Cell Cycle Regulation Proteins                6
Cell Death and Survival Related               5
Structural                                    4
Cardiovascular and Renal Hormones             4
Neuropeptides and Neurotransmitter            4
Immune System - Cytokines                     3
Gastrointestinal                              2
Metabolic Hormones                            2
Growth Factors/ Developmental Proteins        1
Tumour Suppressors and DNA Repair Proteins    1
Hypothalamic-Pituitary Axis Hormones          1
Name: Protein Type, dtype: int64
Merged results saved to '/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster_proteins_results.csv'


## 1.2 Print Total Hits and Accession ID's

In [14]:
from collections import defaultdict
import csv

# Path to the master file
master_filepath = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster.txt"

# Initialize a defaultdict to store query counts and hits
query_hits = defaultdict(list)

# Read the file and extract query IDs and hits
try:
    with open(master_filepath, "r") as master_file:
        current_query_id = None
        for line in master_file:
            if "|" in line:  
                current_query_id = line.split("|")[1].strip()
            elif ">" in line:  
                if current_query_id:
                    query_hits[current_query_id].append(line.split(">")[1].strip())  
except FileNotFoundError:
    print(f"Error: File '{master_filepath}' not found.")
    exit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    exit(1)

# Calculate total hits for each accession ID
total_hits = {query_id: len(hits) for query_id, hits in query_hits.items()}

# Rank and print total hits
print("Ranked total hits for each accession ID:")
for rank, (query_id, hits) in enumerate(sorted(total_hits.items(), key=lambda x: x[1], reverse=True), start=1):
    print(f"{rank}. Accession ID: {query_id}, Total Hits: {hits}")

# Write results to CSV file
output_csv_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster_results.csv"
try:
    with open(output_csv_path, mode='w', newline='') as csv_file:
        fieldnames = ['Query ID', 'Hits']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        
        writer.writeheader()
        for query_id, hits in query_hits.items():
            writer.writerow({'Query ID': query_id, 'Hits': ', '.join(hits)})
    print(f"Results saved to '{output_csv_path}'")
except Exception as e:
    print(f"An error occurred while writing to CSV: {e}")



Ranked total hits for each accession ID:
1. Accession ID: P11802, Total Hits: 250
2. Accession ID: Q01629, Total Hits: 250
3. Accession ID: P09914, Total Hits: 250
4. Accession ID: P60484, Total Hits: 250
5. Accession ID: Q01628, Total Hits: 250
6. Accession ID: Q00535, Total Hits: 250
7. Accession ID: P06493, Total Hits: 250
8. Accession ID: P13164, Total Hits: 250
9. Accession ID: Q00534, Total Hits: 250
10. Accession ID: O14879, Total Hits: 250
11. Accession ID: P55210, Total Hits: 250
12. Accession ID: Q13325, Total Hits: 250
13. Accession ID: Q00526, Total Hits: 250
14. Accession ID: P24941, Total Hits: 250
15. Accession ID: P01019, Total Hits: 250
16. Accession ID: P01966, Total Hits: 247
17. Accession ID: O43745, Total Hits: 133
18. Accession ID: P19883, Total Hits: 104
19. Accession ID: Q99653, Total Hits: 68
20. Accession ID: P42574, Total Hits: 56
21. Accession ID: O43808, Total Hits: 42
22. Accession ID: P47898, Total Hits: 28
23. Accession ID: P32247, Total Hits: 27
24. Acc

In [56]:
from collections import defaultdict
import csv

# Path to the master file
master_filepath = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster.txt"

# Initialize a defaultdict to store query counts and hits
query_hits = defaultdict(list)

# Read the file and extract query IDs and hits
try:
    with open(master_filepath, "r") as master_file:
        current_query_id = None
        for line in master_file:
            if "|" in line:  # Check if the line contains an accession ID
                current_query_id = line.split("|")[1].strip()  # Extract the query ID between "|"
            elif ">" in line:  # Check if the line contains a hit
                if current_query_id:
                    query_hits[current_query_id].append(line.split(">")[1].strip())  # Extract hit and append to list
except FileNotFoundError:
    print(f"Error: File '{master_filepath}' not found.")
    exit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    exit(1)

# Calculate total hits for each accession ID
total_hits = {query_id: len(hits) for query_id, hits in query_hits.items()}

# Write results to CSV file
output_csv_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster_results.csv"
try:
    with open(output_csv_path, mode='w', newline='') as csv_file:
        fieldnames = ['Rank', 'Accession ID', 'Total Hits', 'Hits']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        
        writer.writeheader()
        for rank, (query_id, hits) in enumerate(sorted(query_hits.items(), key=lambda x: len(x[1]), reverse=True), start=1):
            writer.writerow({'Rank': rank, 'Accession ID': query_id, 'Total Hits': total_hits[query_id], 'Hits': ', '.join(hits)})
    print(f"Results saved to '{output_csv_path}'")
except Exception as e:
    print(f"An error occurred while writing to CSV: {e}")


Results saved to '/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster_results.csv'


In [17]:
from collections import defaultdict
import csv

# Path to the master file
master_filepath = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster.txt"

# Initialize a defaultdict to store query counts and hits
query_hits = defaultdict(list)

# Read the file and extract query IDs and hits
try:
    with open(master_filepath, "r") as master_file:
        current_query_id = None
        for line in master_file:
            if "|" in line:  # Check if the line contains an accession ID
                current_query_id = line.split("|")[1].strip()  # Extract the query ID between "|"
            elif ">" in line:  # Check if the line contains a hit
                if current_query_id:
                    query_hits[current_query_id].append(line.split(">")[1].strip())  # Extract hit and append to list
except FileNotFoundError:
    print(f"Error: File '{master_filepath}' not found.")
    exit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    exit(1)

# Calculate total hits for each accession ID
total_hits = {query_id: len(hits) for query_id, hits in query_hits.items()}

# Write results to CSV file
output_csv_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster_results.csv"
try:
    with open(output_csv_path, mode='w', newline='') as csv_file:
        fieldnames = ['Accession ID', 'Total Hits', 'Hits']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        
        writer.writeheader()
        for query_id, hits in sorted(query_hits.items(), key=lambda x: len(x[1]), reverse=True):
            writer.writerow({'Accession ID': query_id, 'Total Hits': total_hits[query_id], 'Hits': ', '.join(hits)})
    print(f"Results saved to '{output_csv_path}'")
except Exception as e:
    print(f"An error occurred while writing to CSV: {e}")

# Print ranked total hits to screen
print("Ranked total hits for each accession ID:")
for rank, (query_id, hits) in enumerate(sorted(total_hits.items(), key=lambda x: x[1], reverse=True), start=1):
    print(f"{rank}. Accession ID: {query_id}, Total Hits: {hits}")


Results saved to '/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster_results.csv'
Ranked total hits for each accession ID:
1. Accession ID: P11802, Total Hits: 250
2. Accession ID: Q01629, Total Hits: 250
3. Accession ID: P09914, Total Hits: 250
4. Accession ID: P60484, Total Hits: 250
5. Accession ID: Q01628, Total Hits: 250
6. Accession ID: Q00535, Total Hits: 250
7. Accession ID: P06493, Total Hits: 250
8. Accession ID: P13164, Total Hits: 250
9. Accession ID: Q00534, Total Hits: 250
10. Accession ID: O14879, Total Hits: 250
11. Accession ID: P55210, Total Hits: 250
12. Accession ID: Q13325, Total Hits: 250
13. Accession ID: Q00526, Total Hits: 250
14. Accession ID: P24941, Total Hits: 250
15. Accession ID: P01019, Total Hits: 250
16. Accession ID: P01966, Total Hits: 247
17. Accession ID: O43745, Total Hits: 133
18. Accession ID: P19883, Total Hits: 104
19. Accession ID: Q99653, Total Hits: 68
20. Accession ID: P42574, Total Hits: 56
21. Accession I

In [30]:
import csv

# Path to the input text file
input_file_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster.txt"
# Path to the output CSV file
output_csv_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacteria_results.csv"

# Initialize lists to store accession IDs and hit names
accession_ids = []
hit_names = []

# Read the input file and extract accession IDs and hit names
try:
    with open(input_file_path, "r") as input_file:
        current_accession_id = None
        for line in input_file:
            if "|" in line:
                current_accession_id = line.split("|")[1].strip()
            elif line.startswith(">"):
                if current_accession_id:
                    accession_ids.append(current_accession_id)
                    hit_name = line.strip()[1:] 
                    hit_names.append(hit_name)
except FileNotFoundError:
    print(f"Error: File '{input_file_path}' not found.")
    exit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    exit(1)

# Write accession IDs and hit names to CSV file
try:
    with open(output_csv_path, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Accession ID", "Hit Name"])  
        for accession_id, hit_name in zip(accession_ids, hit_names):
            writer.writerow([accession_id, hit_name])
    print(f"Accession IDs and hit names saved to '{output_csv_path}'")
except Exception as e:
    print(f"An error occurred while writing to CSV: {e}")


Accession IDs and hit names saved to '/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacteria_results.csv'


## 1.3 Filter for Queries which Produced Hits 

In [33]:
import os
import shutil

# Function to check if the given file contains the phrase "No hits found" (case-insensitive)
def contains_no_hits(file_path):
    with open(file_path, 'r') as f:
        content = f.read()
        return "no hits found" not in content.lower()  

# Source directory containing blast result files
source_dir = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/"

# Destination directory for files with hits
dest_dir = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits"

# Create destination directory if it doesn't exist
if not os.path.exists(dest_dir):
    os.makedirs(dest_dir)

# Loop through all files in the source directory
for file_name in os.listdir(source_dir):
    if file_name.endswith(".txt"):  # Check if it's a text file
        file_path = os.path.join(source_dir, file_name)
        if contains_no_hits(file_path):
            # Move the file to the destination directory if it contains hits
            shutil.move(file_path, os.path.join(dest_dir, file_name))

## 1.4 Create Initial Species CSV file from Master File

In [35]:
import os
import csv

# Source directory containing blast result files with hits
source_dir = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits"

# Output CSV file path
output_csv = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/bacterial_hits_data.csv"

# Function to extract query ID from file name
def extract_query_id(file_name):
    return file_name.replace("_blastp_results.txt", "")

# Function to parse data from blast result file
def parse_blast_results(file_path):
    hits_data = []
    with open(file_path, 'r') as f:
        query_id = extract_query_id(os.path.basename(file_path))
        hit_number = 0  
        hit_id = None  
        species_id = None  
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                hit_number += 1  
                # Extract Hit ID
                hit_id = line.split(">")[1].split(" ")[0]
                # Extract species ID if present
                species_split = line.split("[")
                if len(species_split) > 1:
                    species_id = species_split[1].split("]")[0]
                else:
                    species_id = None
                    
                # Manually add species ID for specific hit ID
                if hit_id == "MDO8640940.1":
                    species_id = "Nitrosarchaeum sp."

            elif hit_id and "Length=" in line:
                seq_length = int(line.split("=")[-1].strip())
                hits_data.append([query_id, hit_id, hit_number, seq_length, species_id])
    return hits_data

# List to store parsed data
parsed_data = []

# Loop through all files in the source directory
for file_name in os.listdir(source_dir):
    if file_name.endswith("_blastp_results.txt"):  # Check if it's a blast result file
        file_path = os.path.join(source_dir, file_name)
        parsed_data.extend(parse_blast_results(file_path))

# Write parsed data to CSV file
with open(output_csv, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write header
    writer.writerow(['Query ID', 'Hit ID', 'Hit Number', 'Sequence Length', 'Species ID'])
    # Write parsed data
    writer.writerows(parsed_data)

print("CSV file created successfully")



CSV file created successfully


## 1.5 Create comprehensive CSV table containing blast alignment results

In [None]:
import os
import csv

# Source directory containing blast result files with hits
source_dir = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits"

# Output CSV file path
output_csv = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/comp_bacterial_data.csv"

# Function to extract query ID from file name
def extract_query_id(file_name):
    return file_name.replace("_blastp_results.txt", "")

# Parse out required data from blast reaults
def parse_blast_results(file_path):
    hits_data = []
    with open(file_path, 'r') as f:
        query_id = extract_query_id(os.path.basename(file_path))
        hit_number = 0 
        hit_id = None 
        species_id = None 
        seq_length = None  
        evalue = None 
        score = None  
        identities = None  
        identities_percent = None
        positives = None 
        positives_percent = None  
        gaps = None  
        gaps_percent = None  
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                hit_number += 1 
                hit_id = line.split(">")[1].split(" ")[0]
                species_split = line.split("[")
                if len(species_split) > 1:
                    species_id = species_split[1].split("]")[0]
                else:
                    species_id = None
               

            elif hit_id and "Length=" in line:
                seq_length = int(line.split("=")[-1].strip())
            elif hit_id and "Expect =" in line:
                evalue = line.split("Expect = ")[1].split(",")[0].strip()
                score = line.split("Score = ")[1].split(" bits")[0].strip()
            elif hit_id and "Identities =" in line:
                identities_str = line.split("Identities = ")[1].split(",")[0].strip()
                identities = identities_str.split(" ")[0]
                identities_fraction = identities.split("/")
                identities_percent = (int(identities_fraction[0]) / int(identities_fraction[1])) * 100
                positives_str = line.split("Positives = ")[1].split(",")[0].strip()
                positives = positives_str.split(" ")[0]
                positives_fraction = positives.split("/")
                positives_percent = (int(positives_fraction[0]) / int(positives_fraction[1])) * 100
                gaps_str = line.split("Gaps = ")[1].split(" ")[0].strip()
                gaps = gaps_str
                gaps_fraction = gaps.split("/")
                gaps_percent = (int(gaps_fraction[0]) / int(gaps_fraction[1])) * 100
                # Append all values to hits_data
                hits_data.append([
                    query_id, hit_id, hit_number, seq_length, species_id, evalue, score, 
                    identities, f"{identities_percent:.2f}%", positives, f"{positives_percent:.2f}%", gaps, f"{gaps_percent:.2f}%"
                ])
    return hits_data

# List to store parsed data
parsed_data = []

# Loop through all files in the source directory
for file_name in os.listdir(source_dir):
    if file_name.endswith("_blastp_results.txt"): 
        file_path = os.path.join(source_dir, file_name)
        parsed_data.extend(parse_blast_results(file_path))

# Write parsed data to CSV file
with open(output_csv, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write header
    writer.writerow([
        'Accession ID', 'Hit ID', 'Hit Number', 'Sequence Length', 'Species ID', 'Evalue', 'Score', 
        'Identities', 'Identities (%)', 'Positives', 'Positives (%)', 'Gaps', 'Gaps (%)'
    ])
    # Write parsed data
    writer.writerows(parsed_data)

print("CSV file created successfully")


In [51]:
import os
import pandas as pd
import shutil

# Define file paths and directories
csv_file_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/bac_vs_vir_hits.csv"
source_directory = "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq"
destination_directory = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs"

# Ensure the destination directory exists
os.makedirs(destination_directory, exist_ok=True)

# Read the CSV file to get the Accession IDs from the "bacterial hits" column
df = pd.read_csv(csv_file_path)
bacterial_hits = df['bacterial hits'].dropna().unique()

# Function to check if a file has a matching Accession ID
def has_matching_accession_id(file_name, accession_ids):
    # Extract the Accession ID from the file name
    accession_id = os.path.splitext(file_name)[0]
    return accession_id in accession_ids

# Counter to keep track of the number of files copied
num_files_copied = 0

# Scan the source directory for matching fasta files and copy them
for file_name in os.listdir(source_directory):
    if file_name.endswith(".fasta") and has_matching_accession_id(file_name, bacterial_hits):
        source_file_path = os.path.join(source_directory, file_name)
        destination_file_path = os.path.join(destination_directory, file_name)
        shutil.copy2(source_file_path, destination_file_path)
        num_files_copied += 1
        print(f"Copied: {file_name}")

print(f"Total files copied: {num_files_copied}")


Copied: Q07812.fasta
Copied: Q8NFJ6.fasta
Copied: P46663.fasta
Copied: P32247.fasta
Copied: P60484.fasta
Copied: O14879.fasta
Copied: Q00534.fasta
Copied: P06493.fasta
Copied: P24941.fasta
Copied: P01966.fasta
Copied: P47898.fasta
Copied: P11802.fasta
Copied: Q00535.fasta
Copied: Q00526.fasta
Copied: Q16611.fasta
Copied: O43808.fasta
Copied: P01019.fasta
Copied: P25445.fasta
Copied: P55210.fasta
Total files copied: 19


## 2.0 Upload bacterial Peptides

In [60]:
#Command to create a fasta file for each Accession ID. 
# The command denotes queries of different amino acid sequences under the same accession ID using "_{number}" sequentially. 
# Command also parses out fasta information out and adds it to the newly made fasta's. 

file_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_bacterial_seqs.txt"
output_dir = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits"


import os

def process_fasta(file_path, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    accession_count = {}
    sequence_data = {}
    
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    current_header = None
    current_sequence = []
    
    for line in lines:
        if line.startswith('>'):
            if current_header:
                accession_id = current_header.split('|')[1]
                accession_id_with_suffix = f"{accession_id}_{accession_count[accession_id]}"
                sequence_data[accession_id_with_suffix] = (current_header, ''.join(current_sequence))
            
            parts = line.split('|')
            accession_id = parts[1]
            
            if accession_id not in accession_count:
                accession_count[accession_id] = 1
            else:
                accession_count[accession_id] += 1
            
            current_header = line.strip()
            current_sequence = []
        else:
            current_sequence.append(line.strip())
    
    if current_header:
        accession_id = current_header.split('|')[1]
        accession_id_with_suffix = f"{accession_id}_{accession_count[accession_id]}"
        sequence_data[accession_id_with_suffix] = (current_header, ''.join(current_sequence))
    
    for accession_id_with_suffix, (header, sequence) in sequence_data.items():
        print(accession_id_with_suffix)
        fasta_filename = os.path.join(output_dir, f"{accession_id_with_suffix}.fasta")
        with open(fasta_filename, 'w') as fasta_file:
            fasta_file.write(f"{header}\n{sequence}\n")
            
process_fasta(file_path, output_dir)

O43745_1
O75381_1
P01210_1
P01210_2
P01210_3
P01210_4
P01210_5
P01210_6
P01210_7
P01210_8
P01210_9
P01210_10
P01210_11
P01213_1
P01213_2
P01213_3
P01213_4
P01213_5
P01213_6
P01213_7
P01213_8
P01213_9
P01213_10
P01213_11
P01241_1
P01568_1
P01574_1
P01579_1
P09912_1
P09914_1
P13164_1
P19883_1
Q01453_1
Q01628_1
Q01629_1
Q10589_1
Q13325_1
Q96BS2_1
Q99653_1
Q9Y5Q6_1
Q9Y5Q6_2
Q9Y5Y5_1


In [22]:
import os
import shutil

# Source directory containing FASTA files
source_dir = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/"

# Destination directory to move the files
destination_dir = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/"

try:
    # Create the destination directory if it doesn't exist
    os.makedirs(destination_dir, exist_ok=True)

    # Iterate over all files in the source directory
    for filename in os.listdir(source_dir):
        if filename.endswith(".fasta") or filename.endswith(".fa") or filename.endswith(".txt"):
            # Construct full paths to source and destination files
            source_file = os.path.join(source_dir, filename)
            destination_file = os.path.join(destination_dir, filename)
            
            # Move the file from source to destination
            shutil.move(source_file, destination_file)
            print(f"Moved {filename} to {destination_dir}")
        else:
            print(f"Skipped {filename} (not a FASTA file)")

    print("All files moved successfully.")

except FileNotFoundError:
    print(f"Error: Source directory '{source_dir}' not found.")
except Exception as e:
    print(f"An error occurred: {e}")


Moved Q02388_blastp_results.txt to /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/
Moved P46663_blastp_results.txt to /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/
Moved Q12965_blastp_results.txt to /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/
Moved P55265_blastp_results.txt to /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/
Moved P42224_blastp_results.txt to /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/
Moved P19883_1.fasta to /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/
Skipped bacterial_hits_data.csv (not a FASTA file)
Moved Q01629_1.fasta to /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/
Moved P01210_4.fasta to /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hit

## 3.0 Perform Blast on Bacterial Hits

In [4]:
!mkdir home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results

mkdir: cannot create directory ‘home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results’: File exists


In [6]:
import os

# Define the directory path
directory = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results"

# Create the directory
os.makedirs(directory, exist_ok=True)

# Verify that the directory has been created
if os.path.exists(directory):
    print(f"The directory {directory} has been created.")
else:
    print(f"Failed to create the directory {directory}.")

The directory /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results has been created.


In [None]:
import subprocess
import os
import time
from datetime import datetime, timezone

# Path to the directory containing fasta files
fasta_directory = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/"

# Directory to store blast results
blast_output_directory = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results"

# Create the blast output directory if it doesn't exist
os.makedirs(blast_output_directory, exist_ok=True)

# Record the start time of the entire process in GMT
start_time = datetime.now(timezone.utc)
print(f"Process started at: {start_time.strftime('%Y-%m-%d %H:%M:%S')} GMT")

# Iterate through each fasta file in the directory
for filename in os.listdir(fasta_directory):
    if filename.endswith(".fasta"):
        fasta_file = os.path.join(fasta_directory, filename)
        output_file = os.path.join(blast_output_directory, f"{os.path.splitext(filename)[0]}_blastp_results.txt")
        
        # Skip if output file already exists
        if os.path.exists(output_file):
            print(f"Skipping {filename} as output file already exists.")
            continue
        
        # Construct the blastp command
        blastp_command = [
            "blastp",
            "-db",
            "nr",
            "-query",
            fasta_file,
            "-out",
            output_file,
            "-evalue",
            "0.06",
            "-remote",
            "-entrez_query",
            "bacteria[orgn]"
        ]
        
        # Record start time for this file
        file_start_time = time.time()
        
        # Execute the blastp command
        print(f"Running blast search for {filename}...")
        try:
            result = subprocess.run(blastp_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
            result.check_returncode()  # This will raise an error if the command failed
            print(f"Blast search for {filename} completed successfully.")
        except subprocess.CalledProcessError as e:
            print(f"Blast search for {filename} failed with return code {e.returncode}.")
            print(f"Error: {e.stderr}")
        
        # Calculate elapsed time for this file
        file_elapsed_time = time.time() - file_start_time
        
        # Print filename and elapsed time
        print(f"Blast search for {filename} completed in {file_elapsed_time:.2f} seconds.")
        
        # Check if 30 minutes have elapsed since the start of the process
        if (datetime.now(timezone.utc) - start_time).seconds >= 1800:
            print(f"Process has been running for 30 minutes. Current time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} GMT")
            start_time = datetime.now(timezone.utc)  

print("Blastp searches completed for all fasta files.")


Process started at: 2024-07-03 12:29:24 GMT
Skipping Q07812.fasta as output file already exists.
Skipping Q8NFJ6.fasta as output file already exists.
Running blast search for P19883_1.fasta...
Blast search for P19883_1.fasta completed successfully.
Blast search for P19883_1.fasta completed in 1895.24 seconds.
Process has been running for 30 minutes. Current time: 2024-07-03 13:00:59 GMT
Running blast search for Q01629_1.fasta...
Blast search for Q01629_1.fasta completed successfully.
Blast search for Q01629_1.fasta completed in 996.34 seconds.
Running blast search for P01210_4.fasta...
Blast search for P01210_4.fasta completed successfully.
Blast search for P01210_4.fasta completed in 988.44 seconds.
Process has been running for 30 minutes. Current time: 2024-07-03 13:34:04 GMT
Running blast search for Q01453_1.fasta...
Blast search for Q01453_1.fasta completed successfully.
Blast search for Q01453_1.fasta completed in 578.41 seconds.
Running blast search for P01568_1.fasta...
Blast se

# 4.0 Process Bacterial Processed Results

## 4.1 Create Master file for processed hits

In [14]:
import os

# Directory containing blast result files
blast_output_directory = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results"

# Name of the master file
master_filename = "bacterial_processed_master_blaster.txt"

# Path to the master file
master_filepath = os.path.join(blast_output_directory, master_filename)

# Open the master file in append mode
with open(master_filepath, "a") as master_file:
    # Iterate through each file in the blast output directory
    for filename in os.listdir(blast_output_directory):
        # Check if the file is a blast result file
        if filename.endswith("_blastp_results.txt"):
            # Get the full path of the blast result file
            blast_result_filepath = os.path.join(blast_output_directory, filename)
            # Open the blast result file and read its content
            with open(blast_result_filepath, "r") as blast_result_file:
                # Read the content of the blast result file and write it to the master file
                master_file.write(blast_result_file.read())
                # Write a newline character to separate the content of each blast result file
                master_file.write("\n")

print("All blast_results.txt files have been concatenated into the master file:", master_filepath)

All blast_results.txt files have been concatenated into the master file: /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/bacterial_processed_master_blaster.txt


### 4.2 Count hits

In [20]:
from collections import defaultdict
import csv

# Path to the master file
master_filepath = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/bacterial_processed_master_blaster.txt"

# Initialize a defaultdict to store query counts and hits
query_hits = defaultdict(list)
total_files = 0
files_with_hits = 0
files_without_hits = 0

# Read the file and extract query IDs and hits
try:
    with open(master_filepath, "r") as master_file:
        current_query_id = None
        for line in master_file:
            if "|" in line:  
                current_query_id = line.split("|")[1].strip()
                total_files += 1  # Increment total files counter
            elif ">" in line:  
                if current_query_id:
                    query_hits[current_query_id].append(line.split(">")[1].strip())  

    # Count files with and without hits
    for query_id, hits in query_hits.items():
        if hits:
            files_with_hits += 1
        else:
            files_without_hits += 1
except FileNotFoundError:
    print(f"Error: File '{master_filepath}' not found.")
    exit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    exit(1)

# Calculate total hits for each accession ID
total_hits = {query_id: len(hits) for query_id, hits in query_hits.items()}

# Rank and print total hits
print("Ranked total hits for each accession ID:")
for rank, (query_id, hits) in enumerate(sorted(total_hits.items(), key=lambda x: x[1], reverse=True), start=1):
    print(f"{rank}. Accession ID: {query_id}, Total Hits: {hits}")

print(f"\nTotal files processed: {total_files}")
print(f"Files with hits: {files_with_hits}")
print(f"Files without hits: {files_without_hits}")

# Write results to CSV file
output_csv_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/bacterial_processed_master_blaster.csv"
try:
    with open(output_csv_path, mode='w', newline='') as csv_file:
        fieldnames = ['Query ID', 'Hits']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        
        writer.writeheader()
        for query_id, hits in query_hits.items():
            writer.writerow({'Query ID': query_id, 'Hits': ', '.join(hits)})
    print(f"Results saved to '{output_csv_path}'")
except Exception as e:
    print(f"An error occurred while writing to CSV: {e}")


Ranked total hits for each accession ID:
1. Accession ID: P60484, Total Hits: 250
2. Accession ID: Q00535, Total Hits: 250
3. Accession ID: P06493, Total Hits: 250
4. Accession ID: Q00534, Total Hits: 250
5. Accession ID: Q00526, Total Hits: 250
6. Accession ID: P24941, Total Hits: 250
7. Accession ID: P01019, Total Hits: 250
8. Accession ID: P01966, Total Hits: 249
9. Accession ID: O43808, Total Hits: 41
10. Accession ID: P32247, Total Hits: 27
11. Accession ID: P46663, Total Hits: 15
12. Accession ID: Q8NFJ6, Total Hits: 9
13. Accession ID: P55210, Total Hits: 4
14. Accession ID: Q16611, Total Hits: 3
15. Accession ID: Q07812, Total Hits: 2
16. Accession ID: P25445, Total Hits: 1

Total files processed: 25
Files with hits: 16
Files without hits: 0
Results saved to '/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/bacterial_processed_master_blaster.csv'


In [None]:
from collections import defaultdict
import csv

# Path to the master file
master_filepath = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/bacterial_processed_master_blaster.txt"

# Initialize a defaultdict to store query counts and hits
query_hits = defaultdict(list)
total_files = 0
files_with_hits = 0
files_without_hits = 0

# Read the file and extract query IDs and hits
try:
    with open(master_filepath, "r") as master_file:
        current_query_id = None
        for line in master_file:
            if "|" in line:  
                current_query_id = line.split("|")[1].strip()
                total_files += 1  # Increment total files counter
            elif ">" in line:  
                if current_query_id:
                    query_hits[current_query_id].append(line.split(">")[1].strip())  

    # Count files with and without hits
    for query_id, hits in query_hits.items():
        if hits:
            files_with_hits += 1
        else:
            files_without_hits += 1
except FileNotFoundError:
    print(f"Error: File '{master_filepath}' not found.")
    exit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    exit(1)

# Calculate total hits for each accession ID
total_hits = {query_id: len(hits) for query_id, hits in query_hits.items()}

# Rank and print total hits
print("Ranked total hits for each accession ID:")
for rank, (query_id, hits) in enumerate(sorted(total_hits.items(), key=lambda x: x[1], reverse=True), start=1):
    print(f"{rank}. Accession ID: {query_id}, Total Hits: {hits}")

print(f"\nTotal files processed: {total_files}")
print(f"Files with hits: {files_with_hits}")
print(f"Files without hits: {files_without_hits}")

# Write results to CSV file
output_csv_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/bacterial_processed_master_blaster.csv"
try:
    with open(output_csv_path, mode='w', newline='') as csv_file:
        fieldnames = ['Query ID', 'Hits']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        
        writer.writeheader()
        for query_id, hits in query_hits.items():
            writer.writerow({'Query ID': query_id, 'Hits': ', '.join(hits)})
    print(f"Results saved to '{output_csv_path}'")
except Exception as e:
    print(f"An error occurred while writing to CSV: {e}")

### 4.3 Create comprehensive processed csv

In [9]:
import os
import csv
import re

# Source directory containing blast result files with hits
source_dir = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results"

# Output CSV file path
output_csv = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/comp_bacterial_processed_results.csv"

# Function to extract query ID from file name and remove suffixes
def extract_query_id(file_name):
    return re.sub(r'_[0-9]+.*$', '', file_name.replace("_blastp_results.txt", ""))

# Parse out required data from blast results
def parse_blast_results(file_path):
    hits_data = []
    with open(file_path, 'r') as f:
        accession_id = extract_query_id(os.path.basename(file_path))
        hit_number = 0 
        hit_id = None 
        species_id = None 
        seq_length = None  
        evalue = None 
        score = None  
        identities = None  
        positives = None  
        gaps = None  
        query_definition = None  
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                hit_number += 1 
                hit_id = line.split(">")[1].split(" ")[0]
                species_split = line.split("[")
                if len(species_split) > 1:
                    species_id = species_split[1].split("]")[0]
                else:
                    species_id = None
                query_definition = line  # Assuming entire line is query definition

            elif hit_id and "Length=" in line:
                seq_length = int(line.split("=")[-1].strip())
            elif hit_id and "Expect =" in line:
                evalue = line.split("Expect = ")[1].split(",")[0].strip()
                score = line.split("Score = ")[1].split(" bits")[0].strip()
            elif hit_id and "Identities =" in line:
                identities_str = line.split("Identities = ")[1].split(",")[0].strip()
                identities = round(float(identities_str.split(" ")[0].split("/")[0]) / float(identities_str.split(" ")[0].split("/")[1]) * 100, 2)
                positives_str = line.split("Positives = ")[1].split(",")[0].strip()
                positives = positives_str.split(" ")[0]
                gaps_str = line.split("Gaps = ")[1].split(" ")[0].strip()
                gaps = gaps_str
                hits_data.append([
                    os.path.basename(file_path), accession_id, hit_number, hit_id, seq_length, species_id, 
                    query_definition, "N/A", score, evalue, identities, positives, gaps, "Query", "Subject", "Bacteria"
                ])
    return hits_data

# List to store parsed data
parsed_data = []

# List to store excluded file names
excluded_files = []

# Loop through all files in the source directory
for file_name in os.listdir(source_dir):
    if file_name.endswith("_blastp_results.txt"): 
        file_path = os.path.join(source_dir, file_name)
        parsed_data.extend(parse_blast_results(file_path))
    else:
        excluded_files.append(file_name)  

# Write parsed data to CSV file
with open(output_csv, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write header
    writer.writerow([
        "File Name", "Accession ID", "Hit Number", "Hit ID", "Hit Length", "Species", 
        "Query Definition", "Hsp Bit Score", "Hsp Score", "Hsp E-value", 
        "Identity", "Positives", "Gaps", "Hsp Qseq", "Hsp Hseq", "Database"
    ])
    # Write parsed data
    writer.writerows(parsed_data)

print(f"CSV file created successfully with {len(parsed_data)} entries.")

# Print excluded file names
if excluded_files:
    print("\nExcluded files:")
    for file_name in excluded_files:
        print(file_name)
else:
    print("\nNo files were excluded.")


CSV file created successfully with 3587 entries.

Excluded files:
comp_bacterial_processed_results.csv
merged_bacterial_processed_results.csv
species
bacterial_processed_master_blaster.txt
Bacterial_peptide_names.csv
.ipynb_checkpoints
final_bacterial_processed_results.csv
bacterial_processed_results.csv
bacterial_processed_master_blaster.csv
Bacterial_peptide_names.txt


### 4.4 Add Protein Names

In [10]:
import csv

# Input and output file paths
input_file = "/home/osheakes/Research_Project_MMM/Fasta/Archaeal_Blasts/Archaeal_Hits/protein_names.txt"
output_file = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/protein_names.csv"

# Initialize a dictionary to track seen combinations of protein type and accession ID
seen_combinations = {}

# Process the lines to split data and prepare for CSV writing
data = []
with open(input_file, 'r', encoding='utf-8') as txt_file:
    try:
        reader = csv.reader(txt_file, delimiter='\t')
        for line_data in reader:
            # Skip empty lines
            if not line_data or all(cell.strip() == '' for cell in line_data):
                continue
            
            # Ensure each line has enough cells to contain at least Protein Type and Accession ID
            if len(line_data) >= 2:
                protein_type = line_data[0].strip()
                accession_id = line_data[1].strip()
                if accession_id: 
                   
                    key = (protein_type, accession_id)
                    if key not in seen_combinations:
                        seen_combinations[key] = True
                        data.append(line_data)
    except UnicodeDecodeError:
        print(f"Error decoding line in file: {input_file}")
        

# Write the processed data to CSV file
with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(data)

print(f"Converted {input_file} to CSV format at {output_file}")

Converted /home/osheakes/Research_Project_MMM/Fasta/Archaeal_Blasts/Archaeal_Hits/protein_names.txt to CSV format at /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/protein_names.csv


In [14]:
import csv

# Input file paths
comp_archaeal_file = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/comp_bacterial_processed_results.csv"
protein_names_file = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/protein_names.csv"
output_file = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/bacterial_processed_results.csv"


# Step 1: Load data from protein_names.csv into a dictionary
accession_info = {}
with open(protein_names_file, 'r', encoding='utf-8') as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        accession_id = row['Accession ID'].strip()
        protein_type = row['Protein Type'].strip()
        protein_names = row['Protein Names'].strip()
        accession_info[accession_id] = {
            'ProteinType': protein_type,
            'ProteinNames': protein_names
        }

# Step 2: Merge data with comp_archaeal_processed_results.csv and add new columns
merged_data = []
with open(comp_archaeal_file, 'r', encoding='utf-8') as csv_file:
    reader = csv.reader(csv_file)
    header = next(reader)
    header.insert(1, 'Protein Type')
    header.insert(2, 'Protein Names')
    merged_data.append(header)

    for row in reader:
        accession_id = row[1].strip()

        if accession_id in accession_info:
            protein_type = accession_info[accession_id]['ProteinType']
            protein_names = accession_info[accession_id]['ProteinNames']
        else:
            protein_type = ''
            protein_names = ''

        row.insert(1, protein_type)
        row.insert(2, protein_names)
        merged_data.append(row)

# Step 3: Write merged data to final_archaeal_processed_results.csv
with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(merged_data)

print(f"Merged data from {comp_archaeal_file} and {protein_names_file} into {output_file}")


Merged data from /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/comp_bacterial_processed_results.csv and /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/protein_names.csv into /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/bacterial_processed_results.csv


## 5.0 Extract some key metrics from the Processed Search

### 5.1 Print Queries which Produced hits

In [1]:
import csv

# Path to the CSV file
csv_file_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/merged_bacterial_processed_results.csv"
# Initialize a set to store unique Accession IDs
unique_accession_ids = set()

try:
    # Open and read the CSV file
    with open(csv_file_path, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        
        # Loop through each row in the CSV file
        for row in csv_reader:
            # Extract the Accession ID and add it to the set
            accession_id = row['Accession ID']  # Replace 'Accession ID' with the actual column name if different
            unique_accession_ids.add(accession_id)
    
    # Print each unique Accession ID
    for accession_id in unique_accession_ids:
        print(accession_id)
    
    # Print the total number of unique Accession IDs
    print(f"Total number of unique Accession IDs: {len(unique_accession_ids)}")

except FileNotFoundError:
    print(f"Error: File '{csv_file_path}' not found.")
except KeyError:
    print("Error: 'Accession ID' column not found in the CSV file. Please check the column name.")
except Exception as e:
    print(f"An error occurred: {e}")


Q99653_1
O43745_1
Q00534
P06493
Q10589_1
Q07812
Q01628_1
Q00526
O43808
P19883_1
Q8NFJ6
P13164_1
P09914_1
P09912_1
P46663
Q9Y5Q6_1
P01574_1
Q9Y5Y5_1
P25445
Q9Y5Q6_2
Q16611
Q01453_1
P55210
P01966
P01019
P60484
Q00535
Q01629_1
P01241_1
P24941
P32247
Total number of unique Accession IDs: 31


### 5.2 Print the number of input files

In [2]:
import os

# Path to the directory
directory_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/"

# Initialize a counter for FASTA files
fasta_file_count = 0

# Iterate through the files in the directory
for file_name in os.listdir(directory_path):
    if file_name.endswith(".fasta") or file_name.endswith(".fa"): 
        fasta_file_count += 1

# Print the total number of FASTA files
print(f"Total number of FASTA files: {fasta_file_count}")

Total number of FASTA files: 69


# 

## 6.0 Perform species analysis

### 6.1 Create directory

In [3]:
!mkdir  "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/species"

### 6.2 Create Unique list of species

In [None]:
!awk -F "," '{ print $8 }' /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/bacterial_processed_results.csv | sort | uniq > /home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/species/unique_bacteria.txt

### 6.3 Run Taxonkit on Unique Species

In [15]:
import subprocess

# Path to the unique-species.txt file
file_path =  "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacterial_results/species/unique_bacteria.txt"
# Path to the taxonkit binary
taxonkit_path = '/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit'

# Read the species names from the file
with open(file_path, 'r') as file:
    species_names = [line.strip() for line in file.readlines()]

# Define a function to apply the command to each species name
def process_species_name(species_name):
    # Construct the command
    cmd = f'echo "{species_name}" | {taxonkit_path} name2taxid | {taxonkit_path} reformat -I 2'
    # Run the command
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    # Debug: Print command and output
    print(f"Command: {cmd}")
    print(f"Stdout: {result.stdout.strip()}")
    print(f"Stderr: {result.stderr.strip()}")
    # Return the output
    return result.stdout.strip()

# Process each species name and store the results
results = {species: process_species_name(species) for species in species_names}

# Print the results
for species, output in results.items():
    print(f'{species}:\n{output}\n')

# Optionally, save the results to a new file
output_file_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacteria_results/species/processed_species.txt"
with open(output_file_path, 'w') as output_file:
    for species, output in results.items():
        output_file.write(f'{species}:\n{output}\n\n')

Command: echo "" | /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit name2taxid | /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit reformat -I 2
Stdout: 
Stderr: 
Command: echo "Acaryochloris sp. SU_5_25" | /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit name2taxid | /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit reformat -I 2
Stdout: Acaryochloris sp. SU_5_25	2720481	Bacteria;Cyanobacteriota;Cyanophyceae;Acaryochloridales;Acaryochloridaceae;Acaryochloris;Acaryochloris sp. SU_5_25
Stderr: 
Command: echo "Acetobacter estunensis" | /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit name2taxid | /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit reformat -I 2
Stdout: Acetobacter estunensis	104097	Bacteria;Pseudomonadota;Alphaproteobacter

FileNotFoundError: [Errno 2] No such file or directory: '/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/Bacterial_Hits/processed_seqs/processed_bacteria_results/species/processed_species.txt'