### This notebook extracts the columns from the PSORTb parsed text file (psort_parsed.txt), creates a DataFrame, and writes it to a CSV file called 'psort_extract_df.csv'. The PSORTb results were collected in a single text file, with one entry for each sequence in the uniprot_df.fasta file.

This is the official site of the PSORTb database - https://www.psort.org/psortb/index.html
I used the Docker image of the PSORTb command-line tool to run the predictions on my PC.
Detailed steps can be found here - https://hub.docker.com/r/brinkmanlab/psortb_commandline/

My terminal command was
###### ./psortb -i /Users/zaidur/Documents/Sequence_Project/aeromonasBact/uniprot_df.fasta -r /Users/zaidur/Documents/Sequence_Project/ --negative

In [93]:
"""
Here is the typical format of one entry for one sequence ID in the psort_parsed.txt text file

SeqID: A0A068FZK6 
  Analysis Report:
    CMSVM-            CytoplasmicMembrane           [No details]
    CytoSVM-          Unknown                       [No details]
    ECSVM-            Unknown                       [No details]
    ModHMM-           CytoplasmicMembrane           [11 internal helices found]
    Motif-            Unknown                       [No motifs found]
    OMPMotif-         Unknown                       [No motifs found]
    OMSVM-            Unknown                       [No details]
    PPSVM-            Unknown                       [No details]
    Profile-          Unknown                       [No matches to profiles found]
    SCL-BLAST-        CytoplasmicMembrane           [matched 81170856: Undecaprenyl-phosphate alpha-N-acetylglucosaminyl 1-phosphate transferase]
    SCL-BLASTe-       Unknown                       [No matches against database]
    Signal-           Unknown                       [No signal peptide detected]
  Localization Scores:
    CytoplasmicMembrane    10.00
    Cytoplasmic            0.00
    Periplasmic            0.00
    Extracellular          0.00
    OuterMembrane          0.00
  Final Prediction:
    CytoplasmicMembrane    10.00

"""

'\nHere is a the typcial format one entry for one sequence id in psort_parsed.txt text file\n\nSeqID: A0A068FZK6 \n  Analysis Report:\n    CMSVM-            CytoplasmicMembrane           [No details]\n    CytoSVM-          Unknown                       [No details]\n    ECSVM-            Unknown                       [No details]\n    ModHMM-           CytoplasmicMembrane           [11 internal helices found]\n    Motif-            Unknown                       [No motifs found]\n    OMPMotif-         Unknown                       [No motifs found]\n    OMSVM-            Unknown                       [No details]\n    PPSVM-            Unknown                       [No details]\n    Profile-          Unknown                       [No matches to profiles found]\n    SCL-BLAST-        CytoplasmicMembrane           [matched 81170856: Undecaprenyl-phosphate alpha-N-acetylglucosaminyl 1-phosphate transferase]\n    SCL-BLASTe-       Unknown                       [No matches against database]

In [94]:
# Draft table: one row per PSORTb entry holding the per-module predictions and
# the localization scores. The "Final Prediction" column is intentionally left
# out of this draft.

import pandas as pd
import re

# Location of the PSORTb report and the line PSORTb prints between entries.
PSORT_REPORT_PATH = "/Users/zaidur/Documents/Sequence_Project/aeromonasBact/psort_parsed.txt"
ENTRY_SEPARATOR = "-------------------------------------------------------------------------------"

# Rows of the "Analysis Report:" section (written in the report as "<name>-").
MODULE_KEYS = ['CMSVM', 'CytoSVM', 'ECSVM', 'ModHMM', 'Motif', 'OMPMotif', 'OMSVM', 'PPSVM', 'Profile', 'SCL-BLAST', 'SCL-BLASTe', 'Signal']
# Rows of the "Localization Scores:" section; this order is the column order.
SCORE_KEYS = ['Cytoplasmic', 'CytoplasmicMembrane', 'Periplasmic', 'OuterMembrane', 'Extracellular']

# Open and read the file
with open(PSORT_REPORT_PATH, "r") as file:
    text = file.read()

# Splitting the text into separate sequence entries
entries = text.split(ENTRY_SEPARATOR)

data = []
for entry in entries:
    # Extracting the sequence ID; chunks without one (e.g. whatever follows the
    # last separator) are not entries and are skipped.
    seq_id_search = re.search(r'SeqID: (\S+)', entry)
    if seq_id_search is None:
        continue
    row = {'SeqID': seq_id_search.group(1)}

    # Module predictions. The pattern is anchored to the start of a line so that
    # e.g. "Motif-" can never match inside "OMPMotif-", and [ \t]+ keeps the
    # captured value on the same line as the key.
    for key in MODULE_KEYS:
        match = re.search(rf'^[ \t]*{re.escape(key)}-[ \t]+(\S+)', entry, flags=re.MULTILINE)
        row[key] = match.group(1) if match else None

    # Localization scores. Anchoring keeps the match on score lines only, and
    # the numeric pattern only captures text that float() can parse. The first
    # match is taken, i.e. the "Localization Scores:" line, which precedes the
    # "Final Prediction:" section in each entry.
    for key in SCORE_KEYS:
        match = re.search(rf'^[ \t]*{re.escape(key)}[ \t]+(\d+(?:\.\d+)?)', entry, flags=re.MULTILINE)
        row[key] = float(match.group(1)) if match else None

    data.append(row)

df_draft = pd.DataFrame(data)
df_draft


Unnamed: 0,SeqID,CMSVM,CytoSVM,ECSVM,ModHMM,Motif,OMPMotif,OMSVM,PPSVM,Profile,SCL-BLAST,SCL-BLASTe,Signal,Cytoplasmic,CytoplasmicMembrane,Periplasmic,OuterMembrane,Extracellular
0,A0A068FVC1,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,2.00,2.00,2.00,2.00,2.00
1,A0A068FZD0,Unknown,Cytoplasmic,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Cytoplasmic,Unknown,Unknown,9.97,0.01,0.01,0.00,0.00
2,A0A068FZK6,CytoplasmicMembrane,Unknown,Unknown,CytoplasmicMembrane,Unknown,Unknown,Unknown,Unknown,Unknown,CytoplasmicMembrane,Unknown,Unknown,0.00,10.00,0.00,0.00,0.00
3,A0A075P9Z7,Unknown,Cytoplasmic,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Cytoplasmic,Unknown,Unknown,9.97,0.01,0.01,0.00,0.00
4,A0A075PBX8,Unknown,Cytoplasmic,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Cytoplasmic,Unknown,Unknown,9.97,0.01,0.01,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29776,Q7BJX9,Unknown,Cytoplasmic,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Cytoplasmic,Unknown,Unknown,9.97,0.01,0.01,0.00,0.00
29777,Q8UVZ1,Unknown,Cytoplasmic,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,8.96,0.51,0.26,0.01,0.26
29778,Q9L5A4,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Cytoplasmic,Unknown,Non-Cytoplasmic,0.00,3.24,6.49,0.14,0.14
29779,R1GTS7,CytoplasmicMembrane,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Cytoplasmic,Unknown,Non-Cytoplasmic,0.00,9.76,0.24,0.00,0.00


In [84]:
## Final extraction: every PSORTb column, including "Final Prediction".

import pandas as pd
import re

# Location of the PSORTb report and the line PSORTb prints between entries.
PSORT_REPORT_PATH = "/Users/zaidur/Documents/Sequence_Project/aeromonasBact/psort_parsed.txt"
ENTRY_SEPARATOR = "-------------------------------------------------------------------------------"

# Rows of the "Analysis Report:" section (written in the report as "<name>-").
MODULE_KEYS = ('CMSVM', 'CytoSVM', 'ECSVM', 'ModHMM', 'Motif', 'OMPMotif', 'OMSVM', 'PPSVM', 'Profile', 'SCL-BLAST', 'SCL-BLASTe', 'Signal')
# Rows of the "Localization Scores:" section; this order is the column order.
SCORE_KEYS = ('Cytoplasmic', 'CytoplasmicMembrane', 'Periplasmic', 'OuterMembrane', 'Extracellular')


def parse_psortb_entry(entry):
    """Turn the text of one PSORTb report entry into a flat dict.

    Parameters
    ----------
    entry : str
        Text of a single entry, i.e. one chunk between separator lines.

    Returns
    -------
    dict or None
        ``None`` when the chunk has no ``SeqID:`` line. Otherwise a dict with
        the keys ``'SeqID'``, every name in ``MODULE_KEYS`` (str or None),
        every name in ``SCORE_KEYS`` (float or None) and ``'Final Prediction'``
        (str or None), inserted in that order.
    """
    seq_id_search = re.search(r'SeqID: (\S+)', entry)
    if seq_id_search is None:
        return None
    row = {'SeqID': seq_id_search.group(1)}

    # Anchored to the start of a line so that e.g. "Motif-" can never match
    # inside "OMPMotif-"; [ \t]+ keeps the captured value on the key's own line.
    for key in MODULE_KEYS:
        match = re.search(rf'^[ \t]*{re.escape(key)}-[ \t]+(\S+)', entry, flags=re.MULTILINE)
        row[key] = match.group(1) if match else None

    # Only well-formed numbers are captured, so float() cannot fail. The first
    # match is used, i.e. the "Localization Scores:" line, which precedes the
    # "Final Prediction:" section in each entry.
    for key in SCORE_KEYS:
        match = re.search(rf'^[ \t]*{re.escape(key)}[ \t]+(\d+(?:\.\d+)?)', entry, flags=re.MULTILINE)
        row[key] = float(match.group(1)) if match else None

    # First token on the line after the "Final Prediction:" header; tolerant of
    # trailing blanks after the colon and of any indentation width.
    final_prediction_search = re.search(r'Final Prediction:[ \t]*\n[ \t]*(\S+)', entry)
    row['Final Prediction'] = final_prediction_search.group(1) if final_prediction_search else None
    return row


# Open and read the file
with open(PSORT_REPORT_PATH, "r") as file:
    text = file.read()

# Splitting the text into separate sequence entries
entries = text.split(ENTRY_SEPARATOR)

# One row per entry that carries a SeqID.
data = [row for row in map(parse_psortb_entry, entries) if row is not None]

df = pd.DataFrame(data)
df

Unnamed: 0,SeqID,CMSVM,CytoSVM,ECSVM,ModHMM,Motif,OMPMotif,OMSVM,PPSVM,Profile,SCL-BLAST,SCL-BLASTe,Signal,Cytoplasmic,CytoplasmicMembrane,Periplasmic,OuterMembrane,Extracellular,Final Prediction
0,A0A068FVC1,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,2.00,2.00,2.00,2.00,2.00,Unknown
1,A0A068FZD0,Unknown,Cytoplasmic,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Cytoplasmic,Unknown,Unknown,9.97,0.01,0.01,0.00,0.00,Cytoplasmic
2,A0A068FZK6,CytoplasmicMembrane,Unknown,Unknown,CytoplasmicMembrane,Unknown,Unknown,Unknown,Unknown,Unknown,CytoplasmicMembrane,Unknown,Unknown,0.00,10.00,0.00,0.00,0.00,CytoplasmicMembrane
3,A0A075P9Z7,Unknown,Cytoplasmic,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Cytoplasmic,Unknown,Unknown,9.97,0.01,0.01,0.00,0.00,Cytoplasmic
4,A0A075PBX8,Unknown,Cytoplasmic,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Cytoplasmic,Unknown,Unknown,9.97,0.01,0.01,0.00,0.00,Cytoplasmic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29776,Q7BJX9,Unknown,Cytoplasmic,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Cytoplasmic,Unknown,Unknown,9.97,0.01,0.01,0.00,0.00,Cytoplasmic
29777,Q8UVZ1,Unknown,Cytoplasmic,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,8.96,0.51,0.26,0.01,0.26,Cytoplasmic
29778,Q9L5A4,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Cytoplasmic,Unknown,Non-Cytoplasmic,0.00,3.24,6.49,0.14,0.14,Unknown
29779,R1GTS7,CytoplasmicMembrane,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Cytoplasmic,Unknown,Non-Cytoplasmic,0.00,9.76,0.24,0.00,0.00,CytoplasmicMembrane


In [86]:
# Save the parsed PSORTb table as CSV; index=False keeps the row index out of the file.
csv_destination = "/Users/zaidur/Documents/Sequence_Project/aeromonasBact/psort_extract_df.csv"
df.to_csv(path_or_buf=csv_destination, index=False)

In [95]:
# Row count of the final table (one row per parsed entry with a SeqID).
df.shape[0]

29781