# Sequence coding

This notebook generates a dataframe that maps the original identifier of each sequence to a new code in the 'gene_X_database' format (e.g. 'kpsE_1_dbcan').

The original sequence headers will then be replaced by the new codes.  

In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
from google.colab import files
import os
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

In [None]:
# In the local repository, compress the folder of the database of interest with the respective sequences
# tar czvf database_folder_name.tar.gz database_folder_name

In [None]:
# Upload the compressed folder (.tar.gz archive) with the FASTA sequences.
# The value returned by files.upload() is stored in `uploaded`.

uploaded = files.upload()

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Extract the uploaded .tar.gz archive into the working directory
# (adapt database_folder_name to the name of the uploaded archive)

!tar -xvzf database_folder_name.tar.gz

dbcan/
dbcan/kpsE_dbcan.fasta
dbcan/_nanE_dbcan.fasta
dbcan/kpsM_dbcan.fasta
dbcan/SIAE_dbcan.fasta
dbcan/nanE_dbcan.fasta
dbcan/nanQ_dbcan.fasta
dbcan/kpsF_dbcan.fasta
dbcan/neuS_dbcan.fasta
dbcan/nanT_dbcan.fasta
dbcan/kpsT_dbcan.fasta
dbcan/kpsD_dbcan.fasta
dbcan/neuD_dbcan.fasta
dbcan/kpsC_dbcan.fasta
dbcan/neuC_dbcan.fasta
dbcan/nanR_dbcan.fasta
dbcan/nagZ_dbcan.fasta
dbcan/nanA_dbcan.fasta
dbcan/nanO_dbcan.fasta
dbcan/nanM_dbcan.fasta
dbcan/neuE_dbcan.fasta
dbcan/neuB_dbcan.fasta
dbcan/nanU_dbcan.fasta
dbcan/kpsS_dbcan.fasta
dbcan/nanK_dbcan.fasta
dbcan/neuA_dbcan.fasta
dbcan/kpsU_dbcan.fasta


1. Generating the dataframe with new codes

In [None]:
def sequence_coding(fasta_file, input_filename):
    """
    Build and save the table that maps each FASTA header to an internal gene code.

    Every record of ``fasta_file`` receives a code of the form
    ``<gene>_<n>_<database>``: ``<gene>`` and ``<database>`` are taken from
    ``input_filename`` (expected to look like ``<gene>_<database>.fasta``) and
    ``<n>`` is the 1-based position of the record in the file.

    Args:
        fasta_file (str): Path to the input FASTA file.
        input_filename (str): Name of the input file (used for generating codes),
            e.g. ``kpsE_dbcan.fasta``. The text after the last underscore is the
            database, everything before it is the gene.

    Returns:
        None: Saves the resulting DataFrame (columns ``Header`` and ``Gene_Code``)
        to ``<file name without extension>_codes_df.csv`` in the working directory.

    Raises:
        ValueError: If ``input_filename`` has no ``<gene>_<database>`` structure.
    """

    # Parse the file name once instead of once per record. splitext removes only
    # the final extension, so extra dots in the name are kept.
    stem = os.path.splitext(os.path.basename(input_filename))[0]

    # Split on the LAST underscore: a name such as '_nanE_dbcan' then yields
    # gene '_nanE' and database 'dbcan' instead of an empty gene name.
    gene, separator, database = stem.rpartition('_')
    if not separator or not gene or not database:
        raise ValueError(
            f"Cannot derive gene and database from '{input_filename}': "
            "expected a name like '<gene>_<database>.fasta'"
        )

    # Read the FASTA file and extract the headers (record identifiers)
    headers = [record.id for record in SeqIO.parse(fasta_file, "fasta")]

    # One row per record: original header and its code in the format gene_X_database
    df = pd.DataFrame({
        'Header': headers,
        'Gene_Code': [
            f"{gene}_{position}_{database}"
            for position in range(1, len(headers) + 1)
        ],
    })

    # Save the DataFrame to a CSV file with the input filename in the output name
    output_filename = f"{stem}_codes_df.csv"
    df.to_csv(output_filename, index=False)

In [None]:
# Applying the function to all fasta files in the directory of interest

# Directory where the files are located
directory = 'database_folder_name'  # Adapt according to the directory name

# Keep only the FASTA files of the directory. The listing is sorted so that the
# processing order is reproducible, and it is not stored under the name `files`:
# that name is the module imported from google.colab, which the upload/download
# cells need.
fasta_files = sorted(f for f in os.listdir(directory) if f.endswith('.fasta'))

# Iterate over the FASTA files (sequence_coding writes one CSV per file)
for fasta_file in fasta_files:
    full_path = os.path.join(directory, fasta_file)
    sequence_coding(full_path, fasta_file)

In [None]:
# Organizing files into a folder

!mkdir database_code_dataframes
!mv *.csv database_code_dataframes/

In [None]:
# Concatenating the dataframes

# Only the per-file tables (*_codes_df.csv) are merged. Filtering on that suffix
# keeps a database_merged_df.csv that is already stored in the same folder from
# being merged into itself when the notebook is re-run. The listing is sorted so
# that the row order of the merged table is reproducible.
csv_files = sorted(
    file for file in os.listdir('database_code_dataframes')
    if file.endswith('_codes_df.csv')
)

# dtype=str keeps headers exactly as written (no conversion of numeric-looking
# identifiers, no loss of leading zeros)
dataframes = [
    pd.read_csv(os.path.join('database_code_dataframes', file), dtype=str)
    for file in csv_files
]

refseq_merged_df = pd.concat(dataframes, ignore_index=True)

# Saving the dataframe
refseq_merged_df.to_csv('database_merged_df.csv', index=False)

In [None]:
# Organizing and compressing files

# Move the merged table into the folder that holds the code tables,
# then pack that folder into a .tar.gz archive
!mv database_merged_df.csv database_code_dataframes/
!tar -czvf database_code_dataframes.tar.gz database_code_dataframes/

dbcan_code_dataframes/
dbcan_code_dataframes/kpsT_dbcan_codes_df.csv
dbcan_code_dataframes/nanR_dbcan_codes_df.csv
dbcan_code_dataframes/dbcan_merged_df.csv
dbcan_code_dataframes/neuD_dbcan_codes_df.csv
dbcan_code_dataframes/SIAE_dbcan_codes_df.csv
dbcan_code_dataframes/neuA_dbcan_codes_df.csv
dbcan_code_dataframes/nanQ_dbcan_codes_df.csv
dbcan_code_dataframes/kpsC_dbcan_codes_df.csv
dbcan_code_dataframes/neuE_dbcan_codes_df.csv
dbcan_code_dataframes/nagZ_dbcan_codes_df.csv
dbcan_code_dataframes/nanO_dbcan_codes_df.csv
dbcan_code_dataframes/neuC_dbcan_codes_df.csv
dbcan_code_dataframes/nanU_dbcan_codes_df.csv
dbcan_code_dataframes/kpsF_dbcan_codes_df.csv
dbcan_code_dataframes/nanA_dbcan_codes_df.csv
dbcan_code_dataframes/nanT_dbcan_codes_df.csv
dbcan_code_dataframes/neuB_dbcan_codes_df.csv
dbcan_code_dataframes/nanE_dbcan_codes_df.csv
dbcan_code_dataframes/kpsS_dbcan_codes_df.csv
dbcan_code_dataframes/neuS_dbcan_codes_df.csv
dbcan_code_dataframes/kpsU_dbcan_codes_df.csv
dbcan_code_data

In [None]:
# Download the compressed file for storage in the local repository

### 2. Renaming headers

In [None]:
# Define the dataframe that relates original code - internal code.
# dtype=str reads both columns as text: record identifiers parsed from FASTA are
# strings, so a Header column inferred as numbers would never match them.
df = pd.read_csv('database_code_dataframes/database_merged_df.csv', dtype=str)

In [None]:
# Create output directory

!mkdir database_coded

In [None]:
# List the contents of the working directory
!ls

dbcan  dbcan_coded  dbcan_code_dataframes  dbcan_code_dataframes.tar.gz  dbcan.tar.gz  sample_data


In [None]:
# Folder with the original FASTA files. It uses the same placeholder name as the
# archive extracted above and the folder the code tables were built from
# (adapt according to the directory name).
input_dir = 'database_folder_name'
# Folder that receives the FASTA files with the replaced headers
output_dir = 'database_coded'

In [None]:
# Make sure the output directory exists, so this cell does not fail when the
# directory has not been created beforehand
os.makedirs(output_dir, exist_ok=True)

# Build the header -> code lookup once instead of scanning the dataframe for
# every record. setdefault keeps the first code listed for a header, i.e. the
# same row a `.values[0]` lookup on the dataframe selects. Headers are compared
# as text because record.id is a string.
# NOTE: the lookup uses the header alone, so a header that occurs in several
# FASTA files receives the code of its first row in the table.
header_to_code = {}
for header, code in zip(df['Header'], df['Gene_Code']):
    header_to_code.setdefault(str(header), code)

# List all FASTA files in the input directory (sorted for a reproducible order)
fasta_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.fasta'))

# Loop to process each FASTA file
for fasta_file in fasta_files:
    # Full path to the input file
    input_path = os.path.join(input_dir, fasta_file)

    # List to store the updated sequences
    new_records = []
    # Number of records whose header has no entry in the code table
    unmatched = 0

    # Replace headers
    for record in SeqIO.parse(input_path, 'fasta'):
        new_code = header_to_code.get(record.id)
        if new_code is not None:
            # Create a new sequence with the updated header
            new_records.append(SeqRecord(record.seq, id=new_code, description=''))
        else:
            # Keep the original sequence if the header is not in the DataFrame
            new_records.append(record)
            unmatched += 1

    # Output file name (adds the _coded suffix in front of the extension only)
    stem, extension = os.path.splitext(fasta_file)
    output_file = f"{stem}_coded{extension}"
    output_path = os.path.join(output_dir, output_file)

    # Save the updated sequences to the output file
    SeqIO.write(new_records, output_path, 'fasta')

    print(f"File processed: {fasta_file} -> {output_file}")
    if unmatched:
        # Make silently kept headers visible to the reader
        print(f"  Warning: {unmatched} header(s) not found in the code table were left unchanged")

File processed: neuB_dbcan.fasta -> neuB_dbcan_coded.fasta
File processed: neuA_dbcan.fasta -> neuA_dbcan_coded.fasta
File processed: neuS_dbcan.fasta -> neuS_dbcan_coded.fasta
File processed: nanO_dbcan.fasta -> nanO_dbcan_coded.fasta
File processed: kpsF_dbcan.fasta -> kpsF_dbcan_coded.fasta
File processed: kpsT_dbcan.fasta -> kpsT_dbcan_coded.fasta
File processed: SIAE_dbcan.fasta -> SIAE_dbcan_coded.fasta
File processed: kpsD_dbcan.fasta -> kpsD_dbcan_coded.fasta
File processed: nanK_dbcan.fasta -> nanK_dbcan_coded.fasta
File processed: nanA_dbcan.fasta -> nanA_dbcan_coded.fasta
File processed: kpsU_dbcan.fasta -> kpsU_dbcan_coded.fasta
File processed: neuC_dbcan.fasta -> neuC_dbcan_coded.fasta
File processed: neuE_dbcan.fasta -> neuE_dbcan_coded.fasta
File processed: kpsC_dbcan.fasta -> kpsC_dbcan_coded.fasta
File processed: kpsM_dbcan.fasta -> kpsM_dbcan_coded.fasta
File processed: nanU_dbcan.fasta -> nanU_dbcan_coded.fasta
File processed: nagZ_dbcan.fasta -> nagZ_dbcan_coded.fas

In [None]:
# Compressing the output directory into a .tar.gz archive

!tar -czvf database_coded.tar.gz database_coded/

dbcan_coded/
dbcan_coded/nanR_dbcan_coded.fasta
dbcan_coded/nanQ_dbcan_coded.fasta
dbcan_coded/nanM_dbcan_coded.fasta
dbcan_coded/neuA_dbcan_coded.fasta
dbcan_coded/nanT_dbcan_coded.fasta
dbcan_coded/neuD_dbcan_coded.fasta
dbcan_coded/kpsM_dbcan_coded.fasta
dbcan_coded/kpsE_dbcan_coded.fasta
dbcan_coded/neuB_dbcan_coded.fasta
dbcan_coded/neuS_dbcan_coded.fasta
dbcan_coded/nanO_dbcan_coded.fasta
dbcan_coded/kpsD_dbcan_coded.fasta
dbcan_coded/SIAE_dbcan_coded.fasta
dbcan_coded/kpsT_dbcan_coded.fasta
dbcan_coded/nanE_dbcan_coded.fasta
dbcan_coded/nanU_dbcan_coded.fasta
dbcan_coded/_nanE_dbcan_coded.fasta
dbcan_coded/kpsF_dbcan_coded.fasta
dbcan_coded/neuC_dbcan_coded.fasta
dbcan_coded/neuE_dbcan_coded.fasta
dbcan_coded/nanK_dbcan_coded.fasta
dbcan_coded/kpsS_dbcan_coded.fasta
dbcan_coded/kpsC_dbcan_coded.fasta
dbcan_coded/nagZ_dbcan_coded.fasta
dbcan_coded/nanA_dbcan_coded.fasta
dbcan_coded/kpsU_dbcan_coded.fasta


In [None]:
# Download the compressed file for storage in the local repository