# Fungal Processed Search

## 1.1 Create Fungal Master file

In [40]:
import os

# Directory containing blast result files
blast_output_directory = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts"

# Name of the master file
master_filename = "fungal_master_blaster.txt"


# Path to the master file
master_filepath = os.path.join(blast_output_directory, master_filename)

# Open the master file in append mode
with open(master_filepath, "a") as master_file:
    # Iterate through each file in the blast output directory
    for filename in os.listdir(blast_output_directory):
        # Check if the file is a blast result file
        if filename.endswith("_blastp_results.txt"):
            # Get the full path of the blast result file
            blast_result_filepath = os.path.join(blast_output_directory, filename)
            # Open the blast result file and read its content
            with open(blast_result_filepath, "r") as blast_result_file:
                # Read the content of the blast result file and write it to the master file
                master_file.write(blast_result_file.read())
                # Write a newline character to separate the content of each blast result file
                master_file.write("\n")

print("All blast_results.txt files have been concatenated into the master file:", master_filepath)

All blast_results.txt files have been concatenated into the master file: /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/fungal_master_blaster.txt


## 1.2 Print Total Hits and Accession ID's

In [41]:
from collections import defaultdict
import csv

# Path to the master file
master_filepath = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/fungal_master_blaster.txt"

# Initialize a defaultdict to store query counts and hits
query_hits = defaultdict(list)

# Read the file and extract accession IDs and hits
try:
    with open(master_filepath, "r") as master_file:
        current_accession_id = None
        for line in master_file:
            if "|" in line:  
                current_accession_id = line.split("|")[1].strip()
            elif ">" in line:  
                if current_accession_id:
                    query_hits[current_accession_id].append(line.split(">")[1].strip())  
except FileNotFoundError:
    print(f"Error: File '{master_filepath}' not found.")
    exit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    exit(1)

# Calculate total hits for each accession ID
total_hits = {accession_id: len(hits) for accession_id, hits in query_hits.items()}

# Rank and print total hits
print("Ranked total hits for each accession ID:")
for rank, (accession_id, hits) in enumerate(sorted(total_hits.items(), key=lambda x: x[1], reverse=True), start=1):
    print(f"{rank}. Accession ID: {accession_id}, Total Hits: {hits}")

# Write results to CSV file
output_csv_path = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/fungal_master_blaster.csv"

try:
    with open(output_csv_path, mode='w', newline='') as csv_file:
        fieldnames = ['Accession ID', 'Hits']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        
        writer.writeheader()
        for accession_id, hits in query_hits.items():
            writer.writerow({'Accession ID': accession_id, 'Hits': ', '.join(hits)})
    print(f"Results saved to '{output_csv_path}'")
except Exception as e:
    print(f"An error occurred while writing to CSV: {e}")


Ranked total hits for each accession ID:
1. Accession ID: Q12965, Total Hits: 1000
2. Accession ID: E...  102     7e-20, Total Hits: 1000
3. Accession ID: P11802, Total Hits: 1000
4. Accession ID: P51587, Total Hits: 1000
5. Accession ID: O75381, Total Hits: 1000
6. Accession ID: P09914, Total Hits: 1000
7. Accession ID: P60484, Total Hits: 1000
8. Accession ID: E...  58.9    3e-05, Total Hits: 1000
9. Accession ID: Q00535, Total Hits: 1000
10. Accession ID: P06493, Total Hits: 1000
11. Accession ID: P43490, Total Hits: 1000
12. Accession ID: Q9BTW9, Total Hits: 1000
13. Accession ID: Q99653, Total Hits: 1000
14. Accession ID: Q9Y3Z3, Total Hits: 1000
15. Accession ID: O96011, Total Hits: 1000
16. Accession ID: Q15303, Total Hits: 1000
17. Accession ID: O43745, Total Hits: 1000
18. Accession ID: Q96BS2, Total Hits: 1000
19. Accession ID: P19525, Total Hits: 1000
20. Accession ID: O75192, Total Hits: 1000
21. Accession ID: P19838, Total Hits: 1000
22. Accession ID: P06396, Total Hits: 1

### 1.2.1 Make additions to masterfile to analyse hits in regard to protein type and name

In [45]:
import pandas as pd

# File paths
bacterial_master_blaster_filepath = '/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/fungal_master_blaster.csv'
protein_names_filepath = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/protein_names.csv"
output_csv_path = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/fungal_master_blaster_protein_results.csv"

# Read the bacterial master blaster CSV
bacterial_master_blaster_df = pd.read_csv(bacterial_master_blaster_filepath)

# Read the protein names CSV
protein_names_df = pd.read_csv(protein_names_filepath)

# Merge the dataframes on 'Accession ID'
merged_df = pd.merge(bacterial_master_blaster_df, protein_names_df[['Accession ID', 'Protein Names', 'Protein Type']], on='Accession ID', how='left')

# Reorder columns to place 'Protein Names' and 'Protein Type' at the start
cols = ['Protein Names', 'Protein Type'] + [col for col in bacterial_master_blaster_df.columns if col not in ['Protein Names', 'Protein Type']]
merged_df = merged_df[cols]

# Write the merged dataframe to a new CSV file
merged_df.to_csv(output_csv_path, index=False)

# Count the number of unique protein names
unique_protein_names = merged_df['Protein Names'].nunique()
print(f"Number of unique protein names: {unique_protein_names}")

# Breakdown of each protein type
protein_type_counts = merged_df['Protein Type'].value_counts()
print("Breakdown of protein types:")
print(protein_type_counts)

print(f"Merged results saved to '{output_csv_path}'")


Number of unique protein names: 59
Breakdown of protein types:
Structural                                    15
Antiviral Proteins                            11
Cell Cycle Regulation Proteins                 6
Transcription Factors                          6
Immune System - Cytokines                      5
Metabolic Hormones                             5
Cardiovascular and Renal Hormones              4
Tumour Suppressors and DNA Repair Proteins     3
Gastrointestinal                               2
Neuropeptides and Neurotransmitter             2
Name: Protein Type, dtype: int64
Merged results saved to '/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/fungal_master_blaster_protein_results.csv'


# 

# 

## 2.0 Run Blast on Fungal Processed Samples

### 2.1 Create Working Directories

In [23]:
!mkdir  /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/

In [24]:
!mkdir  /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs

mkdir: cannot create directory ‘/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs’: File exists


### 2.2 Pull Fasta's from Fungal processed sequences Text file

In [26]:
import os

# Input file path
file_path = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_fungal_blasts.txt"
# Output directory path
output_dir = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs"

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Function to extract accession ID from header line
def extract_accession_id(header):
    parts = header.split('|')
    if len(parts) > 1:
        return parts[1].strip()
    return None

# Initialize file count
file_count = 0

# Read the input file and process
with open(file_path, 'r') as infile:
    current_file = None
    current_file_path = None
    
    for line in infile:
        if line.startswith('>'):
            # Close the previous file if it was open
            if current_file:
                current_file.close()
            
            # Extract accession ID and create a new file
            accession_id = extract_accession_id(line)
            if accession_id:
                current_file_path = os.path.join(output_dir, f"{accession_id}.fasta")
                current_file = open(current_file_path, 'w')
                file_count += 1  
        
        # Write the current line to the current file
        if current_file:
            current_file.write(line)
    
    # Close the last file if it was open
    if current_file:
        current_file.close()

print(f"FASTA files have been successfully created. Number of files created: {file_count}")

FASTA files have been successfully created. Number of files created: 65


## 2.3 Run Blast on Fungal Samples

In [27]:
!mkdir  /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results

In [None]:
import subprocess
import os
import time
from datetime import datetime, timezone

# Path to the directory containing fasta files
fasta_directory = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs"

# Directory to store blast results
blast_output_directory =  "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results"

# Create the blast output directory if it doesn't exist
os.makedirs(blast_output_directory, exist_ok=True)

# Record the start time of the entire process in GMT
start_time = datetime.now(timezone.utc)
print(f"Process started at: {start_time.strftime('%Y-%m-%d %H:%M:%S')} GMT")

# Iterate through each fasta file in the directory
for filename in os.listdir(fasta_directory):
    if filename.endswith(".fasta"):
        fasta_file = os.path.join(fasta_directory, filename)
        output_file = os.path.join(blast_output_directory, f"{os.path.splitext(filename)[0]}_blastp_results.txt")
        
        # Skip if output file already exists
        if os.path.exists(output_file):
            print(f"Skipping {filename} as output file already exists.")
            continue
        
        # Construct the blastp command
        blastp_command = [
            "blastp",
            "-db",
            "nr",
            "-query",
            fasta_file,
            "-out",
            output_file,
            "-evalue",
            "0.06",
            "-remote",
            "-entrez_query",
            "fungi[orgn]"
        ]
        
        # Record start time for this file
        file_start_time = time.time()
        
        # Execute the blastp command
        print(f"Running blast search for {filename}...")
        try:
            result = subprocess.run(blastp_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
            result.check_returncode()  
            print(f"Blast search for {filename} completed successfully.")
        except subprocess.CalledProcessError as e:
            print(f"Blast search for {filename} failed with return code {e.returncode}.")
            print(f"Error: {e.stderr}")
        
        # Calculate elapsed time for this file
        file_elapsed_time = time.time() - file_start_time
        
        # Print filename and elapsed time
        print(f"Blast search for {filename} completed in {file_elapsed_time:.2f} seconds.")
        
        # Check if 30 minutes have elapsed since the start of the process
        if (datetime.now(timezone.utc) - start_time).seconds >= 1800:
            print(f"Process has been running for 30 minutes. Current time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} GMT")
            start_time = datetime.now(timezone.utc)  

print("Blastp searches completed for all fasta files.")

Process started at: 2024-07-07 12:41:05 GMT
Running blast search for O96011.fasta...
Blast search for O96011.fasta completed successfully.
Blast search for O96011.fasta completed in 8827.62 seconds.
Process has been running for 30 minutes. Current time: 2024-07-07 15:08:13 GMT
Running blast search for Q8NFJ6.fasta...
Blast search for Q8NFJ6.fasta completed successfully.
Blast search for Q8NFJ6.fasta completed in 2797.03 seconds.
Process has been running for 30 minutes. Current time: 2024-07-07 15:54:50 GMT
Running blast search for Q15303.fasta...
Blast search for Q15303.fasta completed successfully.
Blast search for Q15303.fasta completed in 4010.21 seconds.
Process has been running for 30 minutes. Current time: 2024-07-07 17:01:40 GMT
Running blast search for P46663.fasta...


## 3.0 Process Fungal Blast Results

### 3.1 Collate hits from Processed Results

In [47]:
import os

# Directory containing blast result files
blast_output_directory = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results"

# Name of the master file
master_filename = "fungal_master_blaster.txt"

# Path to the master file
master_filepath = os.path.join(blast_output_directory, master_filename)

# Open the master file in append mode
with open(master_filepath, "a") as master_file:
    # Iterate through each file in the blast output directory
    for filename in os.listdir(blast_output_directory):
        # Check if the file is a blast result file
        if filename.endswith("_blastp_results.txt"):
            # Get the full path of the blast result file
            blast_result_filepath = os.path.join(blast_output_directory, filename)
            # Open the blast result file and read its content
            with open(blast_result_filepath, "r") as blast_result_file:
                # Read the content of the blast result file and write it to the master file
                master_file.write(blast_result_file.read())
                # Write a newline character to separate the content of each blast result file
                master_file.write("\n")

print("All blast_results.txt files have been concatenated into the master file:", master_filepath)

All blast_results.txt files have been concatenated into the master file: /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/fungal_master_blaster.txt


### 3.2 Print Total Hits and Accession ID's

In [48]:
from collections import defaultdict
import csv

# Path to the master file
master_filepath = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/fungal_master_blaster.txt"

# Initialize a defaultdict to store query counts and hits
query_hits = defaultdict(list)

# Read the file and extract query IDs and hits
try:
    with open(master_filepath, "r") as master_file:
        current_query_id = None
        for line in master_file:
            if "|" in line:  
                current_query_id = line.split("|")[1].strip()
            elif ">" in line:  
                if current_query_id:
                    query_hits[current_query_id].append(line.split(">")[1].strip())  
except FileNotFoundError:
    print(f"Error: File '{master_filepath}' not found.")
    exit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    exit(1)

# Calculate total hits for each accession ID
total_hits = {query_id: len(hits) for query_id, hits in query_hits.items()}

# Rank and print total hits
print("Ranked total hits for each accession ID:")
for rank, (query_id, hits) in enumerate(sorted(total_hits.items(), key=lambda x: x[1], reverse=True), start=1):
    print(f"{rank}. Accession ID: {query_id}, Total Hits: {hits}")

# Write results to CSV file
output_csv_path = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/fungal_master_blaster.csv"
try:
    with open(output_csv_path, mode='w', newline='') as csv_file:
        fieldnames = ['Accession ID', 'Hits']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        
        writer.writeheader()
        for query_id, hits in query_hits.items():
            writer.writerow({'Accession ID': query_id, 'Hits': ', '.join(hits)})
    print(f"Results saved to '{output_csv_path}'")
except Exception as e:
    print(f"An error occurred while writing to CSV: {e}")


Ranked total hits for each accession ID:
1. Accession ID: Q12965, Total Hits: 250
2. Accession ID: P11802, Total Hits: 250
3. Accession ID: P51587, Total Hits: 250
4. Accession ID: O75381, Total Hits: 250
5. Accession ID: P09914, Total Hits: 250
6. Accession ID: P60484, Total Hits: 250
7. Accession ID: Q00535, Total Hits: 250
8. Accession ID: P06493, Total Hits: 250
9. Accession ID: P43490, Total Hits: 250
10. Accession ID: Q9BTW9, Total Hits: 250
11. Accession ID: Q99653, Total Hits: 250
12. Accession ID: Q9Y3Z3, Total Hits: 250
13. Accession ID: O96011, Total Hits: 250
14. Accession ID: Q15303, Total Hits: 250
15. Accession ID: O43745, Total Hits: 250
16. Accession ID: Q96BS2, Total Hits: 250
17. Accession ID: P19525, Total Hits: 250
18. Accession ID: O75192, Total Hits: 250
19. Accession ID: P19838, Total Hits: 250
20. Accession ID: P06396, Total Hits: 250
21. Accession ID: Q92968, Total Hits: 250
22. Accession ID: Q9Y5Y5, Total Hits: 250
23. Accession ID: O43808, Total Hits: 250
24

### 3.3 Create comprehensive processed csv

In [52]:
import os
import csv

# Source directory containing blast result files with hits
source_dir = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results"

# Output CSV file path
output_csv = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/comp_fungal_processed_results.csv"

# Function to extract query ID from file name
def extract_query_id(file_name):
    return file_name.replace("_blastp_results.txt", "")

# Parse out required data from blast results
def parse_blast_results(file_path):
    hits_data = []
    with open(file_path, 'r') as f:
        accession_id = extract_query_id(os.path.basename(file_path))
        hit_number = 0 
        hit_id = None 
        species_id = None 
        seq_length = None  
        evalue = None 
        score = None  
        identities = None  
        positives = None  
        gaps = None  
        query_definition = None  
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                hit_number += 1 
                hit_id = line.split(">")[1].split(" ")[0]
                species_split = line.split("[")
                if len(species_split) > 1:
                    species_id = species_split[1].split("]")[0]
                else:
                    species_id = None
                query_definition = line  # Assuming entire line is query definition

            elif hit_id and "Length=" in line:
                seq_length = int(line.split("=")[-1].strip())
            elif hit_id and "Expect =" in line:
                evalue = line.split("Expect = ")[1].split(",")[0].strip()
                score = line.split("Score = ")[1].split(" bits")[0].strip()
            elif hit_id and "Identities =" in line:
                identities_str = line.split("Identities = ")[1].split(",")[0].strip()
                identities = round(float(identities_str.split(" ")[0].split("/")[0]) / float(identities_str.split(" ")[0].split("/")[1]) * 100, 2)
                positives_str = line.split("Positives = ")[1].split(",")[0].strip()
                positives = positives_str.split(" ")[0]
                gaps_str = line.split("Gaps = ")[1].split(" ")[0].strip()
                gaps = gaps_str
                hits_data.append([
                    os.path.basename(file_path), accession_id, hit_number, hit_id, seq_length, species_id, 
                    query_definition, "N/A", score, evalue, identities, positives, gaps, "Query", "Subject", "Fungal"
                ])
    return hits_data

# List to store parsed data
parsed_data = []

# List to store excluded file names
excluded_files = []

# Loop through all files in the source directory
for file_name in os.listdir(source_dir):
    if file_name.endswith("_blastp_results.txt"): 
        file_path = os.path.join(source_dir, file_name)
        parsed_data.extend(parse_blast_results(file_path))
    else:
        excluded_files.append(file_name)  

# Write parsed data to CSV file
with open(output_csv, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write header
    writer.writerow([
        "File Name", "Accession ID", "Hit Number", "Hit ID", "Hit Length", "Species", 
        "Query Definition", "Hsp Bit Score", "Hsp Score", "Hsp E-value", 
        "Identity", "Positives", "Gaps", "Hsp Qseq", "Hsp Hseq", "Database"
    ])
    # Write parsed data
    writer.writerows(parsed_data)

print(f"CSV file created successfully with {len(parsed_data)} entries.")

# Print excluded file names
if excluded_files:
    print("\nExcluded files:")
    for file_name in excluded_files:
        print(file_name)
else:
    print("\nNo files were excluded.")

CSV file created successfully with 12885 entries.

Excluded files:
.ipynb_checkpoints
fungal_master_blaster.csv
fungal_master_blaster.txt


### 4.4 Add Protein Names

In [43]:
import csv

# Input and output file paths
input_file = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/protein_names.txt"
output_file = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/protein_names.csv"

# Initialize a dictionary to track seen combinations of protein type and accession ID
seen_combinations = {}

# Process the lines to split data and prepare for CSV writing
data = []
with open(input_file, 'r', encoding='utf-8') as txt_file:
    try:
        reader = csv.reader(txt_file, delimiter='\t')
        for line_data in reader:
            # Skip empty lines
            if not line_data or all(cell.strip() == '' for cell in line_data):
                continue
            
            # Ensure each line has enough cells to contain at least Protein Type and Accession ID
            if len(line_data) >= 2:
                protein_type = line_data[0].strip()
                accession_id = line_data[1].strip()
                if accession_id: 
                   
                    key = (protein_type, accession_id)
                    if key not in seen_combinations:
                        seen_combinations[key] = True
                        data.append(line_data)
    except UnicodeDecodeError:
        print(f"Error decoding line in file: {input_file}")
        

# Write the processed data to CSV file
with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(data)

print(f"Converted {input_file} to CSV format at {output_file}")

Converted /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/protein_names.txt to CSV format at /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/protein_names.csv


In [53]:
import csv

# Input file paths
comp_archaeal_file = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/comp_fungal_processed_results.csv"
protein_names_file = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/protein_names.csv"
output_file = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/final_fungal_processed_results.csv"

# Step 1: Load data from protein_names.csv into a dictionary
accession_info = {}
with open(protein_names_file, 'r', encoding='utf-8') as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        accession_id = row['Accession ID'].strip()
        protein_type = row['Protein Type'].strip()
        protein_names = row['Protein Names'].strip()
        accession_info[accession_id] = {
            'ProteinType': protein_type,
            'ProteinNames': protein_names
        }

# Step 2: Merge data with comp_archaeal_processed_results.csv and add new columns
merged_data = []
with open(comp_archaeal_file, 'r', encoding='utf-8') as csv_file:
    reader = csv.reader(csv_file)
    header = next(reader)
    header.insert(1, 'Protein Type')
    header.insert(2, 'Protein Names')
    merged_data.append(header)

    for row in reader:
        accession_id = row[1].strip()

        if accession_id in accession_info:
            protein_type = accession_info[accession_id]['ProteinType']
            protein_names = accession_info[accession_id]['ProteinNames']
        else:
            protein_type = ''
            protein_names = ''

        row.insert(1, protein_type)
        row.insert(2, protein_names)
        merged_data.append(row)

# Step 3: Write merged data to final_archaeal_processed_results.csv
with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(merged_data)

print(f"Merged data from {comp_archaeal_file} and {protein_names_file} into {output_file}")


Merged data from /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/comp_fungal_processed_results.csv and /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/protein_names.csv into /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/final_fungal_processed_results.csv


### 4.5 Reorder Columns

In [55]:
import csv

input_file = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/final_fungal_processed_results.csv"
output_file = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/reordered_comp_fungal_processed_results.csv"
# Define the desired column order
desired_order = ['Protein Type', 'Protein Names', 'File Name', 'Accession ID', 'Hit Number', 'Hit ID', 'Hit Length', 'Species', 'Query Definition', 'Hsp Bit Score', 'Hsp Score', 'Hsp E-value', 'Identity', 'Positives', 'Gaps', 'Hsp Qseq', 'Hsp Hseq', 'Database']

# Read the original CSV file and reorder columns
with open(input_file, 'r', newline='', encoding='utf-8') as csv_file:
    reader = csv.reader(csv_file)
    header = next(reader)
    
    # Create a dictionary to map original indices to desired indices
    header_mapping = {header.index(col): idx for idx, col in enumerate(desired_order)}
    reordered_header = [header[idx] for idx in sorted(header_mapping.keys(), key=lambda x: header_mapping[x])]
    
    # Read and reorder rows
    rows = []
    for row in reader:
        reordered_row = [row[idx] for idx in sorted(header_mapping.keys(), key=lambda x: header_mapping[x])]
        rows.append(reordered_row)

# Write reordered data to a new CSV file
with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(reordered_header)
    writer.writerows(rows)

print(f"Reordered columns in {input_file} and saved to {output_file}")

Reordered columns in /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/final_fungal_processed_results.csv and saved to /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/reordered_comp_fungal_processed_results.csv


## 5.0 Extract some key metrics from the Processed Search

## 5.1 Print Queries which Produced hits

In [1]:
import csv

# Path to the CSV file
csv_file_path = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/reordered_comp_fungal_processed_results.csv"
# Initialize a set to store unique Accession IDs
unique_accession_ids = set()

try:
    # Open and read the CSV file
    with open(csv_file_path, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        
        # Loop through each row in the CSV file
        for row in csv_reader:
            # Extract the Accession ID and add it to the set
            accession_id = row['Accession ID']  
            unique_accession_ids.add(accession_id)
    
    # Print each unique Accession ID
    for accession_id in unique_accession_ids:
        print(accession_id)
    
    # Print the total number of unique Accession IDs
    print(f"Total number of unique Accession IDs: {len(unique_accession_ids)}")

except FileNotFoundError:
    print(f"Error: File '{csv_file_path}' not found.")
except KeyError:
    print("Error: 'Accession ID' column not found in the CSV file. Please check the column name.")
except Exception as e:
    print(f"An error occurred: {e}")

Q00526
P06396
Q9P0W0
Q15303
P42224
P19838
O75381
O43808
P56199
Q10589
P02533
O96011
Q14765
P19525
P06400
O43745
Q96BS2
P60484
P06493
P09912
Q9BTW9
O75192
P32247
Q99653
P09914
P46663
P19883
P42226
P47898
P24941
P01566
P40763
O14879
Q7Z2W4
Q03252
Q9Y5Y5
P43490
O95786
P01574
P23142
P01568
Q8NFJ6
Q9Y3Z3
P20592
P01570
Q00535
P11802
Q12965
P52630
Q13325
Q02388
P20591
P51587
Q92968
Total number of unique Accession IDs: 54


### 5.2 Print the number of input files

In [2]:
import os

# Path to the directory
directory_path = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/"

# Initialize a counter for FASTA files
fasta_file_count = 0

# Iterate through the files in the directory
for file_name in os.listdir(directory_path):
    if file_name.endswith(".fasta") or file_name.endswith(".fa"):  
        fasta_file_count += 1

# Print the total number of FASTA files
print(f"Total number of FASTA files: {fasta_file_count}")

Total number of FASTA files: 65


# 

## 6.0 Perform species analysis

### 6.1 Create directory

In [3]:
!mkdir  "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/species"

### 6.2 Create Unique list of species

In [None]:
! awk -F "," '{ print $8 }' /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/reordered_comp_fungal_processed_results.csv | sort | uniq > /home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/species/unique_fungi.txt

### 6.3 Run Taxonkit on Unique Species

In [4]:
import subprocess

# Path to the unique-species.txt file
file_path = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/species/unique_fungi.txt"
# Path to the taxonkit binary
taxonkit_path = '/home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit'

# Read the species names from the file
with open(file_path, 'r') as file:
    species_names = [line.strip() for line in file.readlines()]

# Define a function to apply the command to each species name
def process_species_name(species_name):
    # Construct the command
    cmd = f'echo "{species_name}" | {taxonkit_path} name2taxid | {taxonkit_path} reformat -I 2'
    # Run the command
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    # Debug: Print command and output
    print(f"Command: {cmd}")
    print(f"Stdout: {result.stdout.strip()}")
    print(f"Stderr: {result.stderr.strip()}")
    # Return the output
    return result.stdout.strip()

# Process each species name and store the results
results = {species: process_species_name(species) for species in species_names}

# Print the results
for species, output in results.items():
    print(f'{species}:\n{output}\n')

# Optionally, save the results to a new file
output_file_path = "/home/osheakes/Research_Project_MMM/Fasta/Fungal_Blasts/Fungal_Hits/processed_seqs/processed_fungal_results/species/processed_species.txt"
with open(output_file_path, 'w') as output_file:
    for species, output in results.items():
        output_file.write(f'{species}:\n{output}\n\n')

Command: echo "" | /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit name2taxid | /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit reformat -I 2
Stdout: 
Stderr: 
Command: echo "Abortiporus biennis" | /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit name2taxid | /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit reformat -I 2
Stdout: Abortiporus biennis	137743	Eukaryota;Basidiomycota;Agaricomycetes;Polyporales;Podoscyphaceae;Abortiporus;Abortiporus biennis
Stderr: 
Command: echo "Absidia" | /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit name2taxid | /home/osheakes/Research_Project_MMM/Fasta/Viral_Blasts/processed_viral_seq/species/taxonkit reformat -I 2
Stdout: Absidia	4828	Eukaryota;Mucoromycota;Mucoromycetes;Mucorales;Cunninghamellaceae;Absidia;
Stderr: 
Command: echo "Absi