## Create Multiple Fasta's from the One Input File

In [3]:
def split_fasta(input_file):
    with open(input_file, 'r') as file:
        content = file.read()
        
    # Split the content based on '>'
    fasta_entries = content.split('>')
    
    # The first entry is empty if the file starts with '>'
    if fasta_entries[0] == '':
        fasta_entries = fasta_entries[1:]
    
    for entry in fasta_entries:
        # Split the entry into lines
        lines = entry.strip().split('\n')
        # The first line is the header
        header = lines[0]
        # The rest is the sequence
        sequence = '\n'.join(lines[1:])
        # Get the file name from the header
        file_name = header.split()[0]
        
        # Write the entry to a new file
        with open(f"{file_name}.fasta", 'w') as output_file:
            output_file.write(f">{header}\n{sequence}\n")

# Usage
input_file = "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/safety_net.txt"
split_fasta(input_file)


## 1.2 Perform Viral Genome Search on Fasta's 

In [11]:
import os
import subprocess
import time
from datetime import datetime, timezone, timedelta

# Function to convert GMT to Dublin local time
def gmt_to_dublin(gmt_time):
    dublin_offset = timedelta(hours=1)
    return gmt_time + dublin_offset

# Paths
fasta_directory = "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/"
blast_output_directory = "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results"

# Create the blast output directory if it doesn't exist
os.makedirs(blast_output_directory, exist_ok=True)

# Record the start time of the entire process in GMT
start_time = datetime.now(timezone.utc)
start_time_dublin = gmt_to_dublin(start_time)
print(f"Process started at: {start_time_dublin.strftime('%Y-%m-%d %H:%M:%S')} Dublin local time")

def get_accession_id(fasta_file):
    with open(fasta_file, 'r') as file:
        for line in file:
            if line.startswith('>'):
                return line.split()[0][1:]
    return None

# Iterate through each fasta file in the directory
for filename in os.listdir(fasta_directory):
    if filename.endswith(".fasta"):
        fasta_file = os.path.join(fasta_directory, filename)
        accession_id = get_accession_id(fasta_file)
        if not accession_id:
            print(f"No accession ID found in {filename}, skipping...")
            continue
        
        output_file = os.path.join(blast_output_directory, f"{filename.split('.')[0]}_blastp_results.txt")
        
        # Skip if output file already exists
        if os.path.exists(output_file):
            print(f"Skipping {filename} as output file {output_file} already exists.")
            continue
        
        # Construct the blastp command for remote search
        blastp_command = [
            "blastp",
            "-query", fasta_file,
            "-db", "genomic/Viruses/Viral_Protein_Sequences",
            "-out", output_file,
            "-evalue", "0.06",
            "-entrez_query", "viruses[orgn]",
            "-remote"
        ]
        
        # Record start time for this file
        file_start_time = time.time()
        
        # Execute the blastp command
        print(f"Running blast search for {filename} (Accession ID: {accession_id})...")
        try:
            result = subprocess.run(blastp_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if result.returncode == 0:
                print(f"BLAST command executed successfully for {filename}.")
            else:
                print(f"BLAST command failed for {filename} with return code {result.returncode}.")
        except subprocess.CalledProcessError as e:
            print(f"BLAST command failed for {filename} with error: {e.stderr.decode()}")
        
        # Calculate elapsed time for this file
        file_elapsed_time = time.time() - file_start_time
        
        # Print filename and elapsed time
        print(f"Blast search for {filename} completed in {file_elapsed_time:.2f} seconds.")
        
        # Check if 30 minutes have elapsed since the start of the process
        if (datetime.now(timezone.utc) - start_time).seconds >= 1800:
            print(f"Process has been running for 30 minutes. Current time: {gmt_to_dublin(datetime.now(timezone.utc)).strftime('%Y-%m-%d %H:%M:%S')} Dublin local time")
            start_time = datetime.now(timezone.utc)

print("Blastp searches completed for all fasta files.")


Process started at: 2024-06-18 14:27:55 Dublin local time
Skipping hor006381.fasta as output file /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hor006381_blastp_results.txt already exists.
Skipping hor003462.fasta as output file /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hor003462_blastp_results.txt already exists.
Skipping hor005582.fasta as output file /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hor005582_blastp_results.txt already exists.
Skipping hor004014.fasta as output file /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hor004014_blastp_results.txt already exists.
Skipping hor003229.fasta as output file /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hor003229_blastp_results.txt already exists.
Skipping hor001506.fasta as output file /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hor001506_blastp_results.txt alre

## Run blast on processed sequences...


# 

In [5]:
import os

# Directory containing blast result files
blast_output_directory = "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results"

# Name of the master file
master_filename = "sf_master_blaster.txt"

# Path to the master file
master_filepath = os.path.join(blast_output_directory, master_filename)

# Open the master file in append mode
with open(master_filepath, "a") as master_file:
    # Iterate through each file in the blast output directory
    for filename in os.listdir(blast_output_directory):
        # Check if the file is a blast result file
        if filename.endswith("_blastp_results.txt"):
            # Get the full path of the blast result file
            blast_result_filepath = os.path.join(blast_output_directory, filename)
            # Open the blast result file and read its content
            with open(blast_result_filepath, "r") as blast_result_file:
                # Read the content of the blast result file and write it to the master file
                master_file.write(blast_result_file.read())
                # Write a newline character to separate the content of each blast result file
                master_file.write("\n")

print("All blast_results.txt files have been concatenated into the master file:", master_filepath)


All blast_results.txt files have been concatenated into the master file: /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/sf_master_blaster.txt


In [6]:
from collections import defaultdict
import csv

# Path to the master file
master_filepath =  "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/sf_master_blaster.txt"

# Initialize a defaultdict to store query counts and hits
query_hits = defaultdict(list)

# Read the file and extract query IDs and hits
try:
    with open(master_filepath, "r") as master_file:
        current_query_id = None
        for line in master_file:
            if "|" in line:  # Check if the line contains an accession ID
                current_query_id = line.split("|")[1].strip()  # Extract the query ID between "|"
            elif ">" in line:  # Check if the line contains a hit
                if current_query_id:
                    query_hits[current_query_id].append(line.split(">")[1].strip())  
except FileNotFoundError:
    print(f"Error: File '{master_filepath}' not found.")
    exit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    exit(1)

# Calculate total hits for each accession ID
total_hits = {query_id: len(hits) for query_id, hits in query_hits.items()}

# Write results to CSV file
output_csv_path = "/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster_results.csv"
try:
    with open(output_csv_path, mode='w', newline='') as csv_file:
        fieldnames = ['Accession ID', 'Total Hits', 'Hits']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        
        writer.writeheader()
        for query_id, hits in sorted(query_hits.items(), key=lambda x: len(x[1]), reverse=True):
            writer.writerow({'Accession ID': query_id, 'Total Hits': total_hits[query_id], 'Hits': ', '.join(hits)})
    print(f"Results saved to '{output_csv_path}'")
except Exception as e:
    print(f"An error occurred while writing to CSV: {e}")

# Print ranked total hits to screen
print("Ranked total hits for each accession ID:")
for rank, (query_id, hits) in enumerate(sorted(total_hits.items(), key=lambda x: x[1], reverse=True), start=1):
    print(f"{rank}. Accession ID: {query_id}, Total Hits: {hits}")

Results saved to '/home/osheakes/Research_Project_MMM/Fasta/Bacterial_Blasts/bacterial_master_blaster_results.csv'
Ranked total hits for each accession ID:


In [7]:
import os
import shutil

# Function to check if the given file contains the phrase "No hits found" (case-insensitive)
def contains_no_hits(file_path):
    with open(file_path, 'r') as f:
        content = f.read()
        return "no hits found" not in content.lower()  

# Source directory containing blast result files
source_dir = "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/"

# Destination directory for files with hits
dest_dir = "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hits"

# Create destination directory if it doesn't exist
if not os.path.exists(dest_dir):
    os.makedirs(dest_dir)

# Loop through all files in the source directory
for file_name in os.listdir(source_dir):
    if file_name.endswith(".txt"):  # Check if it's a text file
        file_path = os.path.join(source_dir, file_name)
        if contains_no_hits(file_path):
            # Move the file to the destination directory if it contains hits
            shutil.move(file_path, os.path.join(dest_dir, file_name))

In [9]:
import os
import csv

# Source directory containing blast result files with hits
source_dir = "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hits"

# Output CSV file path
output_csv = "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hits/sf_hits.csv"
# Function to extract query ID from file name
def extract_query_id(file_name):
    return file_name.replace("_blastp_results.txt", "")

# Function to parse data from blast result file
def parse_blast_results(file_path):
    hits_data = []
    with open(file_path, 'r') as f:
        query_id = extract_query_id(os.path.basename(file_path))
        hit_number = 0  # Initialize hit_number
        hit_id = None  # Initialize hit_id variable
        species_id = None  # Initialize species_id variable
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                hit_number += 1  # Increment hit_number for each new hit
                # Extract Hit ID
                hit_id = line.split(">")[1].split(" ")[0]
                # Extract species ID if present
                species_split = line.split("[")
                if len(species_split) > 1:
                    species_id = species_split[1].split("]")[0]
                else:
                    species_id = None
                # Manually add species ID for specific hit ID
                if hit_id == "MDO8640940.1":
                    species_id = "Nitrosarchaeum sp."

            elif hit_id and "Length=" in line:
                seq_length = int(line.split("=")[-1].strip())
                hits_data.append([query_id, hit_id, hit_number, seq_length, species_id])
    return hits_data

# List to store parsed data
parsed_data = []

# Loop through all files in the source directory
for file_name in os.listdir(source_dir):
    if file_name.endswith("_blastp_results.txt"):  # Check if it's a blast result file
        file_path = os.path.join(source_dir, file_name)
        parsed_data.extend(parse_blast_results(file_path))

# Write parsed data to CSV file
with open(output_csv, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write header
    writer.writerow(['Query ID', 'Hit ID', 'Hit Number', 'Sequence Length', 'Species ID'])
    # Write parsed data
    writer.writerows(parsed_data)

print("CSV file created successfully")


CSV file created successfully


In [11]:
import os
import csv

# Source directory containing blast result files with hits
source_dir = "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hits/"

# Output CSV file path
output_csv =  "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hits/comp_sf_hits.csv"

# Function to extract query ID from file name
def extract_query_id(file_name):
    return file_name.replace("_blastp_results.txt", "")

# Parse out required data from blast reaults
def parse_blast_results(file_path):
    hits_data = []
    with open(file_path, 'r') as f:
        query_id = extract_query_id(os.path.basename(file_path))
        hit_number = 0 
        hit_id = None 
        species_id = None 
        seq_length = None  
        evalue = None 
        score = None  
        identities = None  
        identities_percent = None
        positives = None 
        positives_percent = None  
        gaps = None  
        gaps_percent = None  
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                hit_number += 1 
                hit_id = line.split(">")[1].split(" ")[0]
                species_split = line.split("[")
                if len(species_split) > 1:
                    species_id = species_split[1].split("]")[0]
                else:
                    species_id = None
               

            elif hit_id and "Length=" in line:
                seq_length = int(line.split("=")[-1].strip())
            elif hit_id and "Expect =" in line:
                evalue = line.split("Expect = ")[1].split(",")[0].strip()
                score = line.split("Score = ")[1].split(" bits")[0].strip()
            elif hit_id and "Identities =" in line:
                identities_str = line.split("Identities = ")[1].split(",")[0].strip()
                identities = identities_str.split(" ")[0]
                identities_fraction = identities.split("/")
                identities_percent = (int(identities_fraction[0]) / int(identities_fraction[1])) * 100
                positives_str = line.split("Positives = ")[1].split(",")[0].strip()
                positives = positives_str.split(" ")[0]
                positives_fraction = positives.split("/")
                positives_percent = (int(positives_fraction[0]) / int(positives_fraction[1])) * 100
                gaps_str = line.split("Gaps = ")[1].split(" ")[0].strip()
                gaps = gaps_str
                gaps_fraction = gaps.split("/")
                gaps_percent = (int(gaps_fraction[0]) / int(gaps_fraction[1])) * 100
                # Append all values to hits_data
                hits_data.append([
                    query_id, hit_id, hit_number, seq_length, species_id, evalue, score, 
                    identities, f"{identities_percent:.2f}%", positives, f"{positives_percent:.2f}%", gaps, f"{gaps_percent:.2f}%"
                ])
    return hits_data

# List to store parsed data
parsed_data = []

# Loop through all files in the source directory
for file_name in os.listdir(source_dir):
    if file_name.endswith("_blastp_results.txt"): 
        file_path = os.path.join(source_dir, file_name)
        parsed_data.extend(parse_blast_results(file_path))

# Write parsed data to CSV file
with open(output_csv, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write header
    writer.writerow([
        'Query ID', 'Hit ID', 'Hit Number', 'Sequence Length', 'Species ID', 'Evalue', 'Score', 
        'Identities', 'Identities (%)', 'Positives', 'Positives (%)', 'Gaps', 'Gaps (%)'
    ])
    # Write parsed data
    writer.writerows(parsed_data)

print("CSV file created successfully")


CSV file created successfully


## Convert Identity, positives and gaps

In [12]:
import pandas as pd

# Path to the input CSV file
input_csv_path = "/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hits/comp_sf_hits.csv"

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(input_csv_path)

# Function to convert fraction to percentage without the % symbol
def convert_fraction_to_percentage(fraction_str):
    try:
        numerator, denominator = map(int, fraction_str.split('/'))
        percentage = (numerator / denominator) * 100
        return f"{percentage:.2f}"
    except ZeroDivisionError:
        return "Error: Division by zero"
    except ValueError:
        return fraction_str 

# Convert fractions to percentages in the entire DataFrame
for col in df.columns:
    df[col] = df[col].apply(lambda x: convert_fraction_to_percentage(x) if isinstance(x, str) and '/' in x else x)

# Save the modified DataFrame back to the original file name
df.to_csv(input_csv_path, index=False)

print(f"\nConverted fractions to percentages and saved to: {input_csv_path}")



Converted fractions to percentages and saved to: /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/safety_net/results/hits/comp_sf_hits.csv
