<a href="https://colab.research.google.com/github/Palaeoprot/pFind/blob/Merge_NovorCloud_Fasta/Combine_Fasta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Description
Function: merge .proteins.fasta files from NovorCloud results

# Setup

In [1]:
!pip install biopython

Defaulting to user installation because normal site-packages is not writeable


## import packages

In [2]:
import pandas as pd

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

import os

import re


# from pathlib import Path

## import packages for Google drive files

In [None]:
# Google Colab specific for mounting Google Drive:
from google.colab import drive
drive.mount('/content/drive')

# Global variables

## for Google Drive files

In [None]:
# STUDY_NAME = 'Dinosaur'

# below are the google drive paths and file names
# STUDY_NAME_1 = 'CE'
# STUDY_NAME_2 = 'Tuuli_Masks'

# BASE_PATH_CLOUD = f'/content/drive/My Drive/Colab_Notebooks/NovorCloud/'


# FILE_NAME_CE = f'{BASE_PATH}{STUDY_NAME_1}'
# FILE_NAME_Tuuli = f'{BASE_PATH}{STUDY_NAME_2}'



# FILE_NAME_BOVINE = 'F_1 - 260423_Bovine_Collagen_1hr_1_in_1000'
# FILE_NAME_TURKEY = 'F_2 - 260423_Turkey_Collagen_1hr_1_in_100'
# FILE_NAME_DINOSAUR = 'F_3 - 290423_Edmontosaur_1hr_Neat'

# FASTA_BOVINE = '260423_Bovine_Collagen_1hr_1_in_1000.proteins.fasta'
# FASTA_TURKEY = '260423_Turkey_Collagen_1hr_1_in_100.proteins.fasta'
# FASTA_DINOSAUR = '290423_Edmontosaur_1hr_Neat.proteins.fasta'


## for local computer files

In [3]:
# Paths and names used when the notebook runs against files on a local computer.
# BASE_PATH_LOCAL: local folder used as the starting point for the file search.
BASE_PATH_LOCAL = "F:/Xuekai/CE&Tuuli"

# STUDY_NAME: study label that is prefixed to every simplified file name.
STUDY_NAME = "CE&Tuuli"

## Settings: set the target base path AND output path of final merged fasta file

In [4]:
# Set the target base path here.
# This folder is the root of the search for .proteins.fasta files.
# Note: every .proteins.fasta file found under it will be processed.
TARGET_BASE_PATH = BASE_PATH_LOCAL

# Set the output path of the merged .proteins.fasta file here.
# The final merged, non-redundant FASTA file is named 'merged_nr.fasta'
# and is written directly inside TARGET_BASE_PATH.
OUTPUT_PATH = f'{TARGET_BASE_PATH}/merged_nr.fasta'


# Function 1: find_fasta_files(base_path)
### Get all .proteins.fasta file paths in base path
### Note that for local computer files, the hidden ones will not be included
Walk through the directory structure starting from base_path to find all files ending with .proteins.fasta.
Return the list of all .proteins.fasta paths

In [5]:
def find_fasta_files(base_path):
    """
    Recursively collect every file below base_path whose name ends with
    '.proteins.fasta' (hidden files included).

    Parameters:
    - base_path (str): Directory at which the recursive search begins.

    Returns:
    - list: Full paths of the matching files, in the order os.walk yields them.
    """
    suffix = '.proteins.fasta'
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(base_path)
        for name in names
        if name.endswith(suffix)
    ]

"""
Below is a test
Below is a test
Below is a test
"""
# # find all .proteins.fasta files for CE
# fasta_files_CE = find_fasta_files(FILE_NAME_CE)
# print(f"Files found in {STUDY_NAME_1}:")
# for file_path in fasta_files_CE:
#     print(file_path)
# print(f"Total number of .proteins.fasta files found in {STUDY_NAME_1}: {len(fasta_files_CE)}\n")

# # find all .proteins.fasta files for Tuuli
# fasta_files_Tuuli = find_fasta_files(FILE_NAME_Tuuli)
# print(f"Files found in {STUDY_NAME_2}:")
# for file_path in fasta_files_Tuuli:
#     print(file_path)
# print(f"Total number of .proteins.fasta files found in {STUDY_NAME_2}: {len(fasta_files_Tuuli)}")


'\nBelow is a test\nBelow is a test\nBelow is a test\n'

## Get .proteins.fasta file path in local computer

In [6]:
def find_fasta_files_local(base_path):
    """
    Recursively collect the non-hidden '.proteins.fasta' files below base_path.

    A file is treated as hidden when its name starts with '.'; such files are
    skipped even if they carry the '.proteins.fasta' suffix.

    Parameters:
    - base_path (str): Directory at which the recursive search begins.

    Returns:
    - list: Full paths of the matching non-hidden files.
    """
    matches = []
    for folder, _subdirs, names in os.walk(base_path):
        for name in names:
            # Skip hidden files and anything that is not a .proteins.fasta file.
            if name.startswith('.') or not name.endswith('.proteins.fasta'):
                continue
            matches.append(os.path.join(folder, name))
    return matches


# Function 2: generate_fasta_files_list(file_paths, study_name)
### Get Paths and give simplified names for FASTA files
Paths will use the Function 1
Simplified name format: {study_name}_{contents after '_' in the file name OR whole file name if no '_' in the file name}
    Simplified name example 1:
        file name: 20240209-0335_QEHF2_1007818_ONJ_TR_MC_CE15.proteins.fasta
        study name: Tuuli
        simplified name: Tuuli_CE15
    Simplified name example 2:
        file name: 20240209-0335QEHF21007818ONJTRMCCE15.proteins.fasta
        study name: Tuuli
        simplified name: Tuuli_20240209-0335QEHF21007818ONJTRMCCE15

In [None]:
def generate_fasta_files_list(file_paths, study_name):
    """
    Pair every FASTA path with a simplified name of the form
    '{study_name}_{name_part}'.

    name_part is the text after the last '_' in the file name, or the whole
    file name when it contains no '_'; in both cases every occurrence of
    '.proteins.fasta' is removed from it.

    Parameters:
    - file_paths (list): Full paths to .proteins.fasta files.
    - study_name (str): Study label used as the prefix of each simplified name.

    Returns:
    - list: Tuples of (file path, simplified file name), in input order.
    """
    def simplify(path):
        base_name = os.path.basename(path)
        # rsplit yields the whole name as its only element when no '_' is
        # present, so a single expression covers both naming cases.
        tail = base_name.rsplit('_', 1)[-1]
        return f"{study_name}_{tail.replace('.proteins.fasta', '')}"

    return [(path, simplify(path)) for path in file_paths]


# # Retrieve file paths using FILE_NAME_CE
# fasta_files_CE = find_fasta_files(FILE_NAME_CE)
# # Generate a list of fasta_files
# fasta_files_list_CE = generate_fasta_files_list(fasta_files_CE, STUDY_NAME_1)
# # Print each element followed by a newline
# for file_info in fasta_files_list_CE:
#     print(file_info)
# print()  # Add an empty line between the two lists for clarity

# # Retrieve file paths using FILE_NAME_Tuuli
# fasta_files_Tuuli = find_fasta_files(FILE_NAME_Tuuli)
# # Generate a list of fasta_files
# fasta_files_list_Tuuli = generate_fasta_files_list(fasta_files_Tuuli, STUDY_NAME_2)
# # Print each element followed by a newline
# for file_info in fasta_files_list_Tuuli:
#     print(file_info)



## Function 2.2: Get Paths and give simplified names for FASTA files, file name is the content before the first '_'


In [7]:
def generate_fasta_files_list(file_paths, study_name):
    """
    Generates a list of tuples containing file paths and simplified file names.
    The simplified name is '{study_name}_{name_part}', where name_part is the
    part of the original file name before the first '_'. When the file name
    contains no '_', name_part is the file name without its '.proteins.fasta'
    extension.

    Parameters:
    - file_paths (list): List of full file paths to .proteins.fasta files.
    - study_name (str): The name of the study, used as a prefix for simplified file names.

    Returns:
    - list: A list of tuples, where each tuple contains (file path, simplified file name).
    """
    suffix = '.proteins.fasta'
    fasta_files = []
    for file_path in file_paths:
        # Extract the file name from the path
        original_file_name = os.path.basename(file_path)

        # Drop the extension first so that a name without any '_' does not
        # carry '.proteins.fasta' into the simplified name.
        if original_file_name.endswith(suffix):
            stem = original_file_name[:-len(suffix)]
        else:
            stem = original_file_name

        # Split on the first '_' and take the part before it
        name_part = stem.split('_', 1)[0]

        # Construct the simplified file name and pair it with its path
        fasta_files.append((file_path, f"{study_name}_{name_part}"))

    return fasta_files


# Function 3-5: Using concat and drop_duplicates
Simple description:
The cells above find every file ending with .proteins.fasta and give each one a simplified name. Each .proteins.fasta file holds per-protein information that is organised into columns such as 'GeneID', 'Protein', 'Species', 'Database', 'ProteinNO', and 'Seq'. The goal is to merge all the FASTA files using 'Seq' as the key (consider using `.concat`) and then remove all duplicate rows, keeping the first occurrence (consider using `.drop_duplicates()`). After merging, the 'ProteinNO' column is renumbered starting from 1. In addition, a new column, 'originFile&NO', is added to the merged file: for each sequence that is kept, it lists the simplified file name and 'ProteinNO' of every row that shares that 'Seq', so the origin of all duplicates is recorded on the one row that remains. Each entry has the format '{simplified_name}#{ProteinNO}#'.

# Function 3: read_fasta_to_df(file_path, simplified_name)

In [8]:
def read_fasta_to_df(file_path, simplified_name):
    """
    Read a .fasta file and convert it to a DataFrame with one row per record.

    The following fields are extracted from each record description; a field
    that cannot be found is stored as an empty string:
    - 'Species':  text inside the first [...] pair.
    - 'Protein':  text after the first whitespace-delimited token, up to the
                  first ' [' or the end of the description.
    - 'Database': text inside the first {...} pair.
    - 'OriginNO': digits enclosed as #<digits>#.

    Parameters:
    - file_path (str): Path of the FASTA file to parse.
    - simplified_name (str): Label stored in the 'OriginFile' column of every row.

    Returns:
    - pandas.DataFrame: Columns 'GeneID', 'Protein', 'Species', 'Database',
      'OriginNO', 'Seq', 'OriginFile'. The columns are present even when the
      file holds no records, so callers can always index them.
    """
    columns = ['GeneID', 'Protein', 'Species', 'Database', 'OriginNO', 'Seq', 'OriginFile']

    # Compile the patterns once instead of looking them up for every record.
    species_re = re.compile(r"\[(.*?)\]")
    protein_re = re.compile(r"^\S+\s+(.*?)(?:\s+\[|$)")
    database_re = re.compile(r"\{(.*?)\}")
    origin_no_re = re.compile(r"#(\d+)#")

    def first_group(pattern, text):
        """Return group 1 of the first match of pattern in text, or ''."""
        match = pattern.search(text)
        return match.group(1) if match else ''

    data = []
    # Iterate the parser directly; no intermediate list of records is needed.
    for record in SeqIO.parse(file_path, "fasta"):
        description = record.description
        data.append({
            'GeneID': record.id,
            'Protein': first_group(protein_re, description),
            'Species': first_group(species_re, description),
            'Database': first_group(database_re, description),
            'OriginNO': first_group(origin_no_re, description),
            'Seq': str(record.seq),
            'OriginFile': simplified_name,
        })

    # Explicit columns keep the schema intact for an empty FASTA file.
    return pd.DataFrame(data, columns=columns)

# Function 4: merge_and_clean_fasta(fasta_files_list)

In [9]:
def merge_and_clean_fasta(fasta_files_list):
    """
    Merge, clean, and renumber .fasta files, and update origin information.

    Every file is read with read_fasta_to_df, all rows are concatenated, and
    rows sharing the same 'Seq' are collapsed into one. The collapsed row keeps
    the first 'GeneID', 'Protein', 'Species' and 'Database' seen for that
    sequence, while 'originFile&NO' lists every contributing
    '{OriginFile}#{OriginNO}#' entry joined by '; '.

    Parameters:
    - fasta_files_list (list): Tuples of (file path, simplified file name).

    Returns:
    - pandas.DataFrame: One row per unique sequence with columns 'Seq',
      'GeneID', 'Protein', 'Species', 'Database', 'originFile&NO' and a
      'ProteinNO' column numbered from 1.

    Raises:
    - ValueError: If fasta_files_list is empty, i.e. there is nothing to merge.
    """
    if not fasta_files_list:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError(
            "No .proteins.fasta files to merge: fasta_files_list is empty. "
            "Check the base path used to search for FASTA files."
        )

    all_dfs = []
    for file_path, simplified_name in fasta_files_list:
        df = read_fasta_to_df(file_path, simplified_name)
        # Tag each row with where it came from before duplicates are collapsed.
        df['originFile&NO'] = df['OriginFile'] + '#' + df['OriginNO'] + '#'
        all_dfs.append(df)

    merged_df = pd.concat(all_dfs, ignore_index=True)
    # groupby sorts the group keys by default, so rows come out ordered by 'Seq'.
    merged_df = merged_df.groupby('Seq').agg({
        'GeneID': 'first',
        'Protein': 'first',
        'Species': 'first',
        'Database': 'first',
        'originFile&NO': lambda x: '; '.join(x)
    }).reset_index()
    merged_df['ProteinNO'] = range(1, len(merged_df) + 1)

    return merged_df

# Function 5: df_to_fasta(merged_df, output_fasta_path)

In [10]:
def df_to_fasta(merged_df, output_fasta_path):
    """
    Write every row of the merged DataFrame to a FASTA file.

    Each row becomes one record: 'Seq' is the sequence, 'GeneID' the record id,
    and the description is assembled from 'Protein', 'Species', 'Database',
    'originFile&NO' and 'ProteinNO'.

    Parameters:
    - merged_df (pandas.DataFrame): Table holding the columns listed above.
    - output_fasta_path (str): Destination file; it is opened in write mode.
    """
    def build_record(row):
        # Description layout: Protein [Species] {Database} |origins| #ProteinNO#
        header = (
            f"{row['Protein']} [{row['Species']}] {{{row['Database']}}} "
            f"|{row['originFile&NO']}| #{row['ProteinNO']}#"
        )
        return SeqRecord(Seq(row['Seq']), id=row['GeneID'], description=header)

    records = [build_record(row) for _, row in merged_df.iterrows()]

    with open(output_fasta_path, 'w') as handle:
        SeqIO.write(records, handle, 'fasta')

# Function 6: calculate_redundancy(merged_df)

In [11]:
def calculate_redundancy(merged_df):
    """
    Calculate the redundancy of sequences based on their presence in original files.

    For every row, the 'originFile&NO' string is split on ';' and the number of
    distinct entries is stored in a new 'Redundancy' column (merged_df is
    modified in place). Entries are stripped of surrounding whitespace before
    being compared, because they are joined with '; ' and would otherwise keep
    a leading space that makes identical entries look different.

    Parameters:
    - merged_df (pandas.DataFrame): Table with an 'originFile&NO' column.

    Returns:
    - pandas.Series: Number of rows for each redundancy value, sorted by
      redundancy value.
    """
    def count_distinct_origins(joined):
        return len({entry.strip() for entry in joined.split(';')})

    merged_df['Redundancy'] = merged_df['originFile&NO'].apply(count_distinct_origins)

    redundancy_counts = merged_df['Redundancy'].value_counts().sort_index()

    return redundancy_counts

# Main

In [12]:
# ---------------------------------------------------------------------------
# Function 1: find all non-hidden .proteins.fasta files in the base path.
# ---------------------------------------------------------------------------
fasta_files = find_fasta_files_local(TARGET_BASE_PATH)  # replace the base path in the () if needed

# Optional check: print all paths of the FASTA files that were found.
# print("Files found:")
# for file_path in fasta_files:
#     print(file_path)
# print(f"Total number of .proteins.fasta files found: {len(fasta_files)}\n")

# ---------------------------------------------------------------------------
# Function 2: pair every path with its simplified name.
# ---------------------------------------------------------------------------
fasta_files_list = generate_fasta_files_list(fasta_files, STUDY_NAME)

# Optional check: print every (file_path, simplified_name) tuple and the total.
# for file_info in fasta_files_list:
#     print(file_info)
# print(f"Total number of .proteins.fasta files with simplified names: {len(fasta_files_list)}")

# Optional test: run on only the first 4 (file_path, simplified_name) tuples.
# fasta_file_test_4 = fasta_files_list[:4]
# for file_info in fasta_file_test_4:
#     print(file_info)

# ---------------------------------------------------------------------------
# Function 4 (uses Function 3 internally): merge all files into one
# non-redundant table.
# ---------------------------------------------------------------------------
merged_df = merge_and_clean_fasta(fasta_files_list)

# ---------------------------------------------------------------------------
# Function 5: write the merged table to the output FASTA file.
# ---------------------------------------------------------------------------
df_to_fasta(merged_df, OUTPUT_PATH)

# ---------------------------------------------------------------------------
# Function 6: summarise the redundancy of the merged sequences.
# ---------------------------------------------------------------------------
redundancy_counts = calculate_redundancy(merged_df)

print("Redundancy Counts:", redundancy_counts, sep="\n")

Redundancy Counts:
Redundancy
1     135436
2      80466
3      41986
4      19872
5      22114
6      18846
7      15430
8      15600
9       5731
10      4273
11      8850
12      6065
13     14217
14      6744
15     10602
16      1232
17      1243
18      1281
19      2341
20      1853
21      2672
22        33
Name: count, dtype: int64
