<a href="https://colab.research.google.com/github/Ruochenge/learning_test/blob/main/Novor_FASTA_SampleType.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>






 Description: Combine Novor FASTA by Sample type

This Python code is designed to combine multiple FASTA files generated from NovorCloud results into a single non-redundant FASTA file. The code performs the following tasks:

1. **Load and Parse FASTA Files**: The code can load FASTA files from a specified directory and parse the headers to extract information such as gene ID, protein name, species name, database, and protein number. This information is stored in a pandas DataFrame.

2. **Compare FASTA Files**: The code can compare two or more FASTA files and identify common gene IDs across the files. It also allows finding occurrences of a specific gene ID across multiple files.

3. **Identify Redundant Sequences**: The code can identify sequences that are redundant within a single FASTA file, meaning multiple gene IDs have the same sequence.

4. **Combine FASTA Files**: The main functionality of the code is to combine multiple FASTA files into a single non-redundant FASTA file. The combined file contains unique gene IDs, and for each entry, it includes additional information such as whether the gene ID is unique or non-unique, and the origin file(s) where the gene ID was found.

5. **Output Combined Non-redundant FASTA**: The code generates a new FASTA file with a modified header format that includes the protein name, species within brackets, a unique protein number, and additional information about the uniqueness and origin of each entry.

The code leverages the BioPython library for parsing and writing FASTA files, pandas for data manipulation, and other Python libraries for data processing and visualization tasks.

Overall, this code provides a convenient way to consolidate and analyze FASTA files from NovorCloud results, allowing researchers to work with a single non-redundant file and gain insights into the distribution and uniqueness of sequences across different samples or studies.

# Setup
## Combine all the .proteins.fasta file from NovorCloud results in a study
### study

Requirements
Mount google drive, parse the .proteins.fasta files



In [None]:
!pip install biopython
# !pip install venn



In [None]:
import pandas as pd
import numpy as np
import re


# from Bio.Seq import Seq
# from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

import json
import os
from pathlib import Path

#import pyvenn

# Google Colab specific for mounting Google Drive:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Novor Fasta

##Functions

In [None]:
def preprocess_header(header):
    """
    Strip annotation noise from a raw FASTA header.

    Two substitutions are applied in order: first everything from the first
    '|' up to the end of that line is dropped, then every '{...}' group
    (braces included, shortest match) is removed. Whitespace left behind by
    the removals is not collapsed.

    Parameters:
    - header: The original header string.

    Returns:
    - The cleaned header string.
    """
    without_pipe_suffix = re.sub(r'\|.*', '', header)
    return re.sub(r'\{.*?\}', '', without_pipe_suffix)

def parse_header(header):
    """
    Parse the preprocessed header to extract GeneID, protein name, and species name.

    The expected layout is ``GeneID protein name [Species name]``:
    - GeneID is the text before the first space, with any '>' stripped
      from both ends.
    - Protein is the text after the first space up to the first '['.
    - Species is the content of the last '[...]' group. When the header has
      no '[' at all, the species is returned as an empty string instead of
      echoing the whole header back.

    Parameters:
    - header: The preprocessed header string.

    Returns:
    - A pandas Series with 'GeneID', 'Protein', and 'Species' extracted
      (in that positional order).
    """
    first_token, _, remainder = header.partition(' ')
    gene_id = first_token.strip('>')
    protein = remainder.split('[')[0].strip()

    if '[' in header:
        species = header.split('[')[-1].split(']')[0]
    else:
        # Without this guard the split below would return the full header.
        species = ''

    return pd.Series([gene_id, protein, species])



def load_and_parse_fasta_files(fasta_dir_path):
    """
    Load FASTA files from a directory, preprocess and parse headers, associate each sequence with its source,
    and organize data into a DataFrame.

    The directory is searched recursively for files whose extension is
    '.fasta' in any letter case. The sample type of a file is the part of
    its stem before the first underscore; it is stored in 'Source_Index'
    for every record read from that file.

    Parameters:
    - fasta_dir_path: The path to the directory containing FASTA files.

    Returns:
    - A DataFrame with columns for 'GeneID', 'Protein', 'Species', 'Header', 'Sequence', and 'Source_Index'.
      It has one row per FASTA record and is empty (columns only) when no file is found.
    """
    data = []
    # Sorting makes the row order reproducible; rglob itself yields files in arbitrary order.
    fasta_files = sorted(Path(fasta_dir_path).rglob('*.[Ff][Aa][Ss][Tt][Aa]'))
    for fasta_file in fasta_files:
        # Extract sample type from file name using regular expression
        match = re.search(r'^(?P<sample_type>[^_]+)', fasta_file.stem)
        # A stem that starts with '_' gives no match; fall back to the whole stem
        # rather than failing on None.
        source_index = match.group('sample_type') if match else fasta_file.stem

        for record in SeqIO.parse(str(fasta_file), 'fasta'):
            preprocessed_header = preprocess_header(record.description)
            gene_id, protein, species = parse_header(preprocessed_header).tolist()
            data.append([gene_id, protein, species, preprocessed_header, str(record.seq), source_index])

    return pd.DataFrame(data, columns=['GeneID', 'Protein', 'Species', 'Header', 'Sequence', 'Source_Index'])

def pivot_dataframe(df):
    """
    Pivot the DataFrame to have sample types (Source_Index) as columns with the presence (1) or absence (0)
    of sequences indicated for each sample type.

    Rows are identified by the combination of 'GeneID', 'Protein', 'Species',
    'Header' and 'Sequence'. The presence flags are computed explicitly from
    group sizes (capped at 1), so the result does not depend on how
    pivot_table treats a frame with no value columns and no Python callback
    is run per group.

    Parameters:
    - df: The DataFrame containing the sequences and their source indices.

    Returns:
    - A pivoted DataFrame with sequences categorized by sample type: the five
      key columns followed by one integer 0/1 column per sample type.
    """
    key_columns = ['GeneID', 'Protein', 'Species', 'Header', 'Sequence']

    presence = (
        df.groupby(key_columns + ['Source_Index'])
        .size()                                   # occurrences per (record, sample type)
        .clip(upper=1)                            # only presence matters, not the count
        .unstack('Source_Index', fill_value=0)    # sample types become columns; missing -> 0
    )

    # Convert the key columns from index levels back to regular columns
    return presence.reset_index()

## Load Directory and Taxonomic Dictionaries

In [None]:

# Directory (on the mounted Google Drive) that is searched recursively for FASTA files
fasta_dir_path = '/content/drive/MyDrive/Colab_Notebooks/Dictionaries/FASTA_dicts/SampleTypes'

# Parse every FASTA file found there into one long-format DataFrame
# (one row per record, tagged with the sample type taken from the file name)
df_unique_sequences = load_and_parse_fasta_files(fasta_dir_path)

# Optional sanity check of the parsed records:
# print(df_unique_sequences.head())


# Reshape to one row per sequence with a presence (1) / absence (0) column per sample type
df_pivoted = pivot_dataframe(df_unique_sequences)

# Optional sanity check of the pivoted table:
#print(df_pivoted.head())

In [None]:
# Load JSON taxonomy dictionaries
genus_species_indexed_file_path = '/content/drive/MyDrive/Colab_Notebooks/Taxonomy/genus_species_indexed_dict.json'
genus_indexed_file_path = '/content/drive/MyDrive/Colab_Notebooks/Taxonomy/genus_indexed_dict.json'

# The encoding is given explicitly so the files are decoded the same way on
# every platform instead of relying on the locale default.

# Load the genus_species_indexed dictionary (looked up by the full species name)
with open(genus_species_indexed_file_path, 'r', encoding='utf-8') as file:
    genus_species_indexed = json.load(file)

# Load the genus_indexed dictionary (fallback, looked up by the first word of the species name)
with open(genus_indexed_file_path, 'r', encoding='utf-8') as file:
    genus_indexed = json.load(file)

In [None]:
# Taxonomy fields that are appended to df_pivoted as extra columns
new_columns = [
    "common_name", "NCBI_Taxon_ID", "Phylum", "SubPhylum", "Class",
    "Subclass", "SuperOrder", "Order", "SubOrder", "Family", "SubFamily",
    "Genus"#, "species"
]

# Create every taxonomy column up front, filled with NaN
for col in new_columns:
    df_pivoted[col] = np.nan


def update_taxonomic_info(row):
    """
    Fill the taxonomy columns of one row from the loaded dictionaries.

    The full species name is looked up in genus_species_indexed first; if it
    is not there, the first word of the species name is looked up in
    genus_indexed. Only the fields listed in new_columns that are present in
    the matched entry are copied. A row with no match is returned unchanged.
    """
    species_name = row['Species']
    genus_name = None
    if pd.notnull(species_name):
        genus_name = species_name.split(' ')[0]

    # Try the most specific dictionary first, then the genus-level fallback
    for lookup, key in ((genus_species_indexed, species_name), (genus_indexed, genus_name)):
        if key in lookup:
            taxonomy = lookup[key]
            break
    else:
        return row

    for field in new_columns:
        if field in taxonomy:
            row[field] = taxonomy[field]
    return row


# Annotate df_pivoted row by row
df_pivoted = df_pivoted.apply(update_taxonomic_info, axis=1)


In [None]:
# ============================= Reporting on df_pivoted ================================

# Show the first rows of the pivoted, taxonomy-annotated table.
# The commented lines below are alternative inspections; uncomment one at a time.
#print (df_pivoted)
df_pivoted.head()
#df_pivoted.head(100)
#df_pivoted.shape
#df_pivoted.shape
#df_pivoted.info()
#df_pivoted.describe()
# df_pivoted['Sequence'].nunique()

Source_Index,GeneID,Protein,Species,Header,Sequence,Ancient,Bee,Blank,Bone,Ceramics,...,Phylum,SubPhylum,Class,Subclass,SuperOrder,Order,SubOrder,Family,SubFamily,Genus
0,10GS_A,"Human Glutathione S-transferase P1-1, Complex ...",Homo sapiens,"10GS_A Human Glutathione S-transferase P1-1, C...",PPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKAS...,0,0,0,1,0,...,Chordata,Vertebrata,Mammalia,Eutheria,Euarchontoglires,Primates,,Haplorrhini,Catarrhini,Homo
1,121P_A,STRUKTUR UND GUANOSINTRIPHOSPHAT-HYDROLYSEMECH...,Homo sapiens,121P_A STRUKTUR UND GUANOSINTRIPHOSPHAT-HYDROL...,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,0,0,0,0,0,...,Chordata,Vertebrata,Mammalia,Eutheria,Euarchontoglires,Primates,,Haplorrhini,Catarrhini,Homo
2,12CA_A,ALTERING THE MOUTH OF A HYDROPHOBIC POCKET. ST...,Homo sapiens,12CA_A ALTERING THE MOUTH OF A HYDROPHOBIC POC...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,0,0,0,0,0,...,Chordata,Vertebrata,Mammalia,Eutheria,Euarchontoglires,Primates,,Haplorrhini,Catarrhini,Homo
3,12E8_L,2e8 Fab Fragment,Mus musculus,12E8_L 2e8 Fab Fragment [Mus musculus],DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKL...,0,0,0,1,0,...,Chordata,Vertebrata,Mammalia,Eutheria,Euarchontoglires,Glires,,Rodentia,Myomorpha,Mus
4,132L_A,STRUCTURAL CONSEQUENCES OF REDUCTIVE METHYLATI...,Gallus gallus,132L_A STRUCTURAL CONSEQUENCES OF REDUCTIVE ME...,XVFGRCELAAAMXRHGLDNYRGYSLGNWVCAAXFESNFNTQATNRN...,0,0,0,1,0,...,Chordata,Vertebrata,Archelosauria,Archosauria,Galloanserae,Saurischia,,Theropoda,Coelurosauria,Gallus


#A quick check of the data

In [None]:
# For each row, count the cells equal to 1 and subtract one, i.e. the number
# of additional sample types the sequence is shared with.
# Note: the comparison runs over every column of df_pivoted, not only the
# sample-type columns.
shared_types = (df_pivoted == 1).sum(axis=1).sub(1)

# Distribution: how many rows have each shared-type count
shared_type_counts = shared_types.value_counts()

# Print the distribution for inspection
print(shared_type_counts)


0    1117456
1     113365
2      34340
3      14222
5      14002
4      12872
6        605
7         95
8         40
Name: count, dtype: int64


In [None]:
# Sample-type columns of df_pivoted to compare against each other
relevant_columns = ['Ancient', 'Bee', 'Blank', 'Bone', 'Ceramics', 'Coprolite', 'DentalCalculus', 'Eggshell', 'Enamel', 'Glue', 'Parchment', 'Tendon', 'Textile','Silk','Hair']

# 0/1 indicator per (sequence, sample type): 1 where the cell equals 1
presence = df_pivoted[relevant_columns].eq(1).astype(int)

# Entry (a, b) is the number of sequences present in both sample type a and
# sample type b; the diagonal is the number of sequences per sample type.
# A single matrix product replaces the pairwise loop and yields an integer
# matrix instead of an object-dtype one.
co_occurrence_matrix = presence.T.dot(presence)

co_occurrence_matrix


Unnamed: 0,Ancient,Bee,Blank,Bone,Ceramics,Coprolite,DentalCalculus,Eggshell,Enamel,Glue,Parchment,Tendon,Textile,Silk,Hair
Ancient,231417,1,21,44,18,1,50,0,0,0,0,6,21,58,33
Bee,1,1617,249,746,347,762,704,150,0,0,0,164,1,1257,620
Blank,21,249,30772,19477,25626,214,25860,1,0,0,0,6387,16,25883,25714
Bone,44,746,19477,339593,19008,33087,33895,709,16,168,673,6236,72,72019,69249
Ceramics,18,347,25626,19008,30498,334,24310,99,0,0,0,6692,14,24861,24652
Coprolite,1,762,214,33087,334,42559,476,100,0,0,0,278,1,5360,266
DentalCalculus,50,704,25860,33895,24310,476,188486,817,0,0,0,6646,18,48604,49118
Eggshell,0,150,1,709,99,100,817,8876,0,0,0,201,0,585,469
Enamel,0,0,0,16,0,0,0,0,157,5,12,0,1,17,14
Glue,0,0,0,168,0,0,0,0,5,272,195,0,59,132,83


In [None]:
# import plotly.figure_factory as ff

# # Convert the co-occurrence matrix to a format suitable for Plotly's heatmap function
# matrix_values = co_occurrence_matrix.values
# labels = co_occurrence_matrix.columns

# # Create the heatmap
# fig = ff.create_annotated_heatmap(matrix_values, x=labels, y=labels, colorscale='Viridis')

# # Update the layout to make it more readable
# fig.update_layout(
#     title='Sample Type Co-occurrence Matrix',
#     xaxis=dict(title='Sample Type'),
#     yaxis=dict(title='Sample Type'),
#     autosize=False,
#     width=800,
#     height=800,
# )

# fig.show()


In [None]:
# #df_pivoted.head(5)
# print (df_pivoted)

In [None]:
# Save the pivoted table (presence flags plus taxonomy columns) as CSV on Google Drive
# Earlier variant, kept for reference:
# output_path = '/content/drive/MyDrive/Colab_Notebooks/FASTA_dicts/SampleTypes/unique_sequences.csv'
# df_unique_sequences.to_csv(output_path, index=False)
output_path = '/content/drive/MyDrive/Colab_Notebooks/Taxonomy/AncientProteinsFASTA_DB.csv'
# index=False: the row index is not written to the file
df_pivoted.to_csv(output_path, index=False)

## To write back a specific component of the fasta_files

In [None]:
def generate_fasta(df_pivoted, sample_type, output_directory):
    """
    Generate a FASTA file for sequences present in a specified sample type.

    Parameters:
    - df_pivoted: DataFrame containing sequences, taxonomy columns and indicators for sample presence.
    - sample_type: String indicating the sample type to filter by (e.g., "Bone").
      It must be the name of a column in df_pivoted (a KeyError is raised otherwise).
    - output_directory: String with the path to save the FASTA file.

    The function filters the DataFrame for sequences present in the specified sample type
    and writes these to a FASTA file named '<sample_type>.fasta' in output_directory.
    Rows without a sequence are skipped. Each record is written as a header
    line, the sequence, and a blank line. The file is written as UTF-8
    because the header contains the non-ASCII '§' separator.

    NOTE(review): unlike the standalone export cell elsewhere in this
    notebook, this header does not include 'SubOrder' - confirm which layout
    is intended.
    """
    # Keep rows flagged for this sample type that actually carry a sequence
    is_present = df_pivoted[sample_type] > 0
    has_sequence = df_pivoted['Sequence'].notna()
    filtered_df = df_pivoted[is_present & has_sequence]

    # Construct the file path for the FASTA file
    fasta_file_path = f'{output_directory}/{sample_type}.fasta'

    # Explicit encoding: the '§' in the header is not representable in every locale default
    with open(fasta_file_path, 'w', encoding='utf-8') as fasta_file:
        for _, row in filtered_df.iterrows():
            # Construct the FASTA header
            header = f">{row['GeneID']} | {row['Protein']} | {row['Species']} | {row['Phylum']} § {row['SubPhylum']} {row['Class']} {row['Subclass']} {row['Order']} {row['Family']} {row['SubFamily']} {row['Genus']}"
            # Write the header and sequence to the file
            fasta_file.write(f"{header}\n{row['Sequence']}\n\n")

    # Print confirmation that the file has been saved
    print(f"The FASTA file for {sample_type} has been saved to: {fasta_file_path}")

# Example usage: export every sequence flagged for one sample type

sample_type = 'Silk'
            # Sample-type names used elsewhere in this notebook:
            # "Ancient", "Bee", "Blank", "Bone", "Ceramics", "Coprolite",
            # "DentalCalculus", "Eggshell", "Enamel", "Glue", "Parchment",
            # "Tendon", "Textile", "Silk", "Hair"
output_directory = '/content/drive/MyDrive/Colab_Notebooks/Taxonomy'
generate_fasta(df_pivoted, sample_type, output_directory)


The FASTA file for Silk has been saved to: /content/drive/MyDrive/Colab_Notebooks/Taxonomy/Silk.fasta


#Importing the FASTA file

In [None]:
#@title Importing Big Archaeological FASTA File to export Sample group
#This is included again in case you begin the script here
import pandas as pd
import numpy as np
import re


# # from Bio.Seq import Seq
# # from Bio.SeqRecord import SeqRecord
# from Bio import SeqIO

import json
import os
from pathlib import Path

#import pyvenn

# Google Colab specific for mounting Google Drive:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Define the path for the input CSV file (the table saved earlier in this notebook)
input_csv_path = '/content/drive/MyDrive/Colab_Notebooks/Taxonomy/AncientProteinsFASTA_DB.csv'

# low_memory=False makes pandas infer each column's dtype from the whole
# column instead of chunk by chunk, which avoids mixed-type columns and the
# warning that comes with them.
df_from_csv = pd.read_csv(input_csv_path, low_memory=False)


  df_from_csv = pd.read_csv(input_csv_path )


In [None]:
# Define your sample type
sample_type = 'Parchment'
            # Other options: "Ancient", "Bee", "Blank", "Bone", "Ceramics", "Coprolite",
            # "DentalCalculus", "Eggshell", "Enamel", "Glue", "Parchment",
            # "Tendon", "Textile"

# Dynamically select the column based on sample_type
# This assumes your column names for types are exactly matching the sample_type values
sample_column = sample_type  # Adjust if your column naming scheme is different

# Filter the DataFrame for rows with a sequence and non-zero in the specified sample type column
has_sequence = df_from_csv['Sequence'].notna()
in_sample = df_from_csv[sample_column] > 0
filtered_df = df_from_csv[has_sequence & in_sample]

# Construct the file path using the sample type
fasta_file_path = f'/content/drive/MyDrive/Colab_Notebooks/Taxonomy/{sample_type}.fasta'

# Open the file to write the FASTA entries (UTF-8, because the header contains '§')
with open(fasta_file_path, 'w', encoding='utf-8') as fasta_file:
    for index, row in filtered_df.iterrows():
        # Construct the FASTA header with § symbol and spaces as requested.
        # The header stops at 'Genus': the table has no 'species' column
        # (it is commented out where the taxonomy columns are defined), so
        # reading row['species'] raised a KeyError.
        header = f">{row['GeneID']} | {row['Protein']} | {row['Species']} | {row['Phylum']} § {row['SubPhylum']} {row['Class']} {row['Subclass']} {row['Order']} {row['SubOrder']} {row['Family']} {row['SubFamily']} {row['Genus']}"
        # Write the header and sequence to the FASTA file
        fasta_file.write(header + "\n" + row['Sequence'] + "\n\n")

#Looking at the Data

In [None]:
# # subset of 200 lines
# subset_size = min(len(df_sorted), 200)  # Ensures you don't exceed the number of rows in the DataFrame
# df_subset = df_unique_sequences.sample(n=subset_size, random_state=1)  # random_state ensures reproducibility

# # Specify the path to save the CSV file
# subset_csv_path = '/content/drive/MyDrive/Colab_Notebooks/FASTA_dicts/SampleTypes/random_subset.csv'

# # Save the subset DataFrame to a CSV file
# df_subset.to_csv(subset_csv_path, index=False)


In [None]:

# Keep the rows of df_pivoted that carry a sequence and are flagged (> 0)
# for the currently selected sample_type
filtered_df = df_pivoted.loc[df_pivoted['Sequence'].notna() & df_pivoted[sample_type].gt(0)]


In [None]:
# List of sample type columns
# NOTE(review): 'Textile' and 'Hair' appear in the co-occurrence column list
# earlier in this notebook but not here - confirm whether that is intentional.
sample_types = ['Ancient', 'Bee', 'Blank', 'Bone', 'Ceramics', 'Coprolite', 'DentalCalculus', 'Eggshell', 'Enamel', 'Glue', 'Parchment', 'Tendon', 'Silk']

# Count of distinct protein names per sample type.
# The presence flags live in df_pivoted; df_unique_sequences is the long-format
# table and has no per-sample-type columns, so indexing it by sample type fails.
# The comprehension variable is local, so the notebook-level `sample_type`
# selected in other cells is not overwritten.
unique_proteins_count = {
    sample_name: df_pivoted.loc[df_pivoted[sample_name] == 1, 'Protein'].nunique()
    for sample_name in sample_types
}

# Display the report
for sample_name, count in unique_proteins_count.items():
    print(f"{sample_name}: {count} unique proteins")


In [None]:
# Show the first rows of the long-format table (one row per parsed FASTA record)
df_unique_sequences.head()

In [None]:
# Order the long-format table by protein name, then by species (both ascending)
df_sorted = df_unique_sequences.sort_values(['Protein', 'Species'], ascending=True)

# Print the first rows to check the ordering
print(df_sorted.head())

In [None]:
# ---------------------- Reporting on df_sorted
# The commented lines are alternative inspections; uncomment one at a time.
#df_sorted.shape
#df_sorted.info()
##df_sorted.describe()
#df_unique_sequences['Protein'].nunique()
#df_sorted['Species'].nunique()
# Show the first rows of the sorted table
df_sorted.head()

In [None]:
from IPython.display import display, HTML

# Build a protein-by-sample-type presence table.
# df_sorted is the long-format table (one row per record, sample type in
# 'Source_Index'); it has no numeric columns, so pivot_table with its default
# mean aggregation has nothing it can aggregate. crosstab counts the records
# per (protein, sample type) pair and the clip turns the counts into
# presence (1) / absence (0).
pivot_df = pd.crosstab(df_sorted['Protein'], df_sorted['Source_Index']).clip(upper=1)

# Convert the table to HTML for display.
# NOTE: this renders one HTML row per distinct protein, which can be very
# large; consider pivot_df.head(n).to_html() for a preview.
pretty_html = pivot_df.to_html()

# Display the HTML table in the notebook
display(HTML(pretty_html))


End of sequence

# Get .proteins.fasta file path

In [None]:
def find_fasta_files(base_path):
    """
    Collect every file below base_path whose name ends with .proteins.fasta.

    The whole directory tree is traversed with os.walk; each hit is returned
    as the walk's directory path joined with the file name.

    Parameters:
    - base_path (str): The base directory to start the search from.

    Returns:
    - list: A list of paths to .proteins.fasta files (empty when none are found).
    """
    return [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(base_path)
        for filename in filenames
        if filename.endswith('.proteins.fasta')
    ]

# NOTE(review): FILE_NAME_CE, FILE_NAME_Tuuli, STUDY_NAME_1 and STUDY_NAME_2 are
# not defined in the visible part of this notebook; presumably they are set in
# a setup cell that is no longer present - confirm before running.

# Find all .proteins.fasta files for CE, list them, and report the total
fasta_files_CE = find_fasta_files(FILE_NAME_CE)
print(f"Files found in {STUDY_NAME_1}:")
for file_path in fasta_files_CE:
    print(file_path)
print(f"Total number of .proteins.fasta files found in {STUDY_NAME_1}: {len(fasta_files_CE)}\n")

# Find all .proteins.fasta files for Tuuli, list them, and report the total
fasta_files_Tuuli = find_fasta_files(FILE_NAME_Tuuli)
print(f"Files found in {STUDY_NAME_2}:")
for file_path in fasta_files_Tuuli:
    print(file_path)
print(f"Total number of .proteins.fasta files found in {STUDY_NAME_2}: {len(fasta_files_Tuuli)}")

# Get Paths and names for FASTA files

In [None]:
def generate_fasta_files_list(file_paths, study_name):
    """
    Pair every FASTA path with a short, study-prefixed label.

    The label is "<study_name>_<part>", where <part> is the piece of the file
    name after its last '_' with the text '.proteins.fasta' removed.

    Parameters:
    - file_paths (list): List of full file paths to .proteins.fasta files.
    - study_name (str): The name of the study, used as a prefix for simplified file names.

    Returns:
    - list: A list of tuples, where each tuple contains (file path, simplified file name).
    """
    def _simplify(path):
        # Work on the file name only, not on the directories leading to it
        base_name = os.path.basename(path)
        tail = base_name.rsplit('_', 1)[-1]
        return f"{study_name}_{tail.replace('.proteins.fasta', '')}"

    return [(path, _simplify(path)) for path in file_paths]

# # Retrieve file paths using FILE_NAME_CE
# fasta_files_CE = find_fasta_files(FILE_NAME_CE)
# # Generate a list of fasta_files
# fasta_files_list_CE = generate_fasta_files_list(fasta_files_CE, STUDY_NAME_1)
# # Print each element followed by a newline
# for file_info in fasta_files_list_CE:
#     print(file_info)
# print()  # Add an empty line between the two lists for clarity

# # Retrieve file paths using FILE_NAME_Tuuli
# fasta_files_Tuuli = find_fasta_files(FILE_NAME_Tuuli)
# # Generate a list of fasta_files
# fasta_files_list_Tuuli = generate_fasta_files_list(fasta_files_Tuuli, STUDY_NAME_2)
# # Print each element followed by a newline
# for file_info in fasta_files_list_Tuuli:
#     print(file_info)



In [None]:
# #This is a repetition
# # Define your sample type
# sample_type = 'Silk'
#             # "Ancient", "Bee", "Blank", "Bone", "Ceramics", "Coprolite",
#             #"DentalCalculus", "Eggshell", "Enamel", "Glue", "Parchment",
#             #"Tendon", "Textile","Silk"

# fasta_file_path = f'/content/drive/MyDrive/Colab_Notebooks/Taxonomy/{sample_type}.fasta'
# with open(fasta_file_path, 'w') as fasta_file:
#     for index, row in filtered_df.iterrows():
#         header = f">{row['GeneID']} | {row['Protein']} | {row['Species']} | {row['Phylum']} § {row['SubPhylum']} {row['Class']} {row['Subclass']} {row['Order']} {row['SubOrder']} {row['Family']} {row['SubFamily']} {row['Genus']}"
#         fasta_file.write(header + "\n" + row['Sequence'] + "\n\n")

# # Dynamically select the column based on sample_type
# # This assumes your column names for types are exactly matching the sample_type values
# sample_column = f'{sample_type}'  # Adjust if your column naming scheme is different

# # Filter the DataFrame for rows with a sequence and non-zero in the specified sample type column
# filtered_df = df_pivoted[(df_pivoted['Sequence'].notna()) & (df_pivoted[sample_type] > 0)]

# # Construct the file path using the sample type
# fasta_file_path = f'/content/drive/MyDrive/Colab_Notebooks/Taxonomy/{sample_type}.fasta'

# # Open the file to write the FASTA entries
# with open(fasta_file_path, 'w') as fasta_file:
#     for index, row in filtered_df.iterrows():
#         # Construct the FASTA header with § symbol and spaces as requested
#         header = f">{row['GeneID']} | {row['Protein']} | {row['Species']} | {row['Phylum']} § {row['SubPhylum']} {row['Class']} {row['Subclass']} {row['Order']} {row['SubOrder']} {row['Family']} {row['SubFamily']} {row['Genus']}"
#         # Write the header and sequence to the FASTA file
#         fasta_file.write(header + "\n" + row['Sequence'] + "\n\n")


# Old stuff
I did not know how to use the earlier code to add columns recording the origin file of each ID, so I wrote new code below that parses the FASTA files into a DataFrame and then generates a new combined FASTA file.

In [None]:
# Specify the GeneID you're interested in
target_gene_id = 'XP_040101606.1'

# Paths to your FASTA files
# NOTE(review): BASE_PATH, the FILE_NAME_* / FASTA_* constants and
# parse_fasta_to_df are not defined in the visible part of this notebook -
# confirm they are available before running.
fasta_file_paths = [
    f'{BASE_PATH}/{FILE_NAME_BOVINE}/Test_{FASTA_BOVINE}',
    f'{BASE_PATH}/{FILE_NAME_TURKEY}/Test_{FASTA_TURKEY}'
]

# Parse each FASTA file and filter for the target GeneID
filtered_dfs = []
for file_path in fasta_file_paths:
    # Parse the current FASTA file into a DataFrame (second argument: the file name)
    df = parse_fasta_to_df(file_path, file_path.split('/')[-1])

    # Filter for rows where GeneID matches the target
    filtered_df = df[df['GeneID'] == target_gene_id]

    # Add the filtered DataFrame to the list, if not empty
    if not filtered_df.empty:
        filtered_dfs.append(filtered_df)

# Concatenate all filtered DataFrames to get a comprehensive list of occurrences.
# pd.concat raises ValueError on an empty list, so the no-match case is
# handled explicitly.
if filtered_dfs:
    df_target_gene_id = pd.concat(filtered_dfs, ignore_index=True)
    # Print the results
    print(df_target_gene_id)
else:
    df_target_gene_id = pd.DataFrame()
    print(f"GeneID {target_gene_id} was not found in any of the given FASTA files.")

## Application 3
Count how many GeneIDs are repeated within each file. Then compare two files to count how many GeneIDs they have in common; for this comparison, ignore how many times a GeneID is repeated within a single file.
Result of comparing the two FASTA files: there are no redundant GeneIDs within either file, but there are GeneIDs common to both files.

# Application 4
## Figure out whether the sequence corresponding to each GeneID is unique in each FASTA file.
Determine whether the sequence corresponding to each GeneID is unique within each FASTA file. If two GeneIDs in one file have the same 'seq', create a new DataFrame called 'redundant_seq' to hold these GeneIDs and their corresponding information, and add a new column, 'redundant_seq_group', to each row. Group labels start at 'G1', which marks the first group of GeneIDs sharing the same 'seq'. GeneIDs with the same 'seq' therefore have the same 'redundant_seq_group' value.

In [None]:
def find_redundant_sequences(fasta_file_path, file_name):
    """
    Find the records of one FASTA file whose sequence occurs more than once.

    Parameters:
    - fasta_file_path: Path of the FASTA file; passed to parse_fasta_to_df.
    - file_name: Name of the file; passed to parse_fasta_to_df.

    Returns:
    - A DataFrame with the columns produced by parse_fasta_to_df plus
      'redundant_seq_group'. It contains only rows whose 'Seq' value is
      shared by at least two rows. Rows are ordered by 'Seq' (original order
      kept within a sequence) and each distinct sequence is labelled
      'G1', 'G2', ... in that order. The frame is empty when no sequence
      repeats.
    """
    # Parse the FASTA file into a DataFrame
    df = parse_fasta_to_df(fasta_file_path, file_name)

    # Rows whose sequence appears more than once. Missing sequences are
    # excluded, matching groupby's default of dropping missing keys.
    is_shared = df['Seq'].notna() & df.duplicated(subset='Seq', keep=False)

    # A stable sort keeps the original row order inside each sequence group.
    # sort_values returns a new frame, so adding a column below does not
    # write into a slice of df.
    redundant_seq = df[is_shared].sort_values('Seq', kind='stable').reset_index(drop=True)

    # Number the groups 1, 2, ... in sorted-sequence order
    group_numbers = redundant_seq.groupby('Seq', sort=True).ngroup() + 1
    redundant_seq['redundant_seq_group'] = 'G' + group_numbers.astype(str)

    return redundant_seq

# FASTA files to check for repeated sequences
fasta_file_paths = [
    f'{BASE_PATH}/{FILE_NAME_DINOSAUR}/Test_4_{FASTA_DINOSAUR}',
    f'{BASE_PATH}/{FILE_NAME_TURKEY}/Test_3_{FASTA_TURKEY}'
]

# Report, file by file, which records share a sequence with another record
for path in fasta_file_paths:
    file_name = path.split('/')[-1]
    redundant_seq_df = find_redundant_sequences(path, file_name)

    if redundant_seq_df.empty:
        print(f"No redundant sequences found in {file_name}.")
        continue

    print(f"Redundant sequences in {file_name}:")
    print(redundant_seq_df)
