In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
import requests
from io import StringIO
# import urllib.request
# import json
from uniprot_netsurfp_scraping import *

# Prep for NetSurfP

## Use the Following Cell if Importing Data from an Excel Sheet

In [2]:
############# Auxiliary helper for inspecting the features of a table #################
def ginfo(df, varname):
    """Print how many columns ``df`` has, then one ``(position, label)`` tuple per column.

    Parameters
    ----------
    df : object exposing ``axes`` (e.g. a pandas DataFrame)
        ``df.axes[1]`` is used as the collection of feature labels.
    varname : str
        Display name inserted into the header line.
    """
    feature_labels = df.axes[1]
    print(f'There are {len(feature_labels)} features included in {varname}.: \n')
    for position, label in enumerate(feature_labels):
        # Printing the tuple keeps the "(0, 'name')" formatting of each line.
        print((position, label))

In [3]:

############# EDIT, FILL IN DATA HERE ############
# Workbook to read (the name must end in .xlsx) and the sheet names inside it.
data_file = '2020-02-03 MS compiled for ML project.xlsx'

plasma_prop_sheet = 'Protein properties, plasma'
plasma_mass_spec_sheet = '(GT)15-SWCNT, plasma'  # not read in this cell

csf_prop_sheet = 'Protein properties, CSF'
csf_mass_spec_sheet = '(GT)15-SWCNT, CSF'  # not read in this cell
###################################################

############ NO EDITS REQUIRED ##############

# The workbook is looked up inside the local data/ folder.
data_filepath = f"data/{data_file}"

# Read the protein-property sheet for each biofluid.
plasma_raw_data = pd.read_excel(data_filepath, sheet_name=plasma_prop_sheet, thousands=',')
csf_raw_data = pd.read_excel(data_filepath, sheet_name=csf_prop_sheet, thousands=',')

# Optional: list the columns of each sheet.
# ginfo(plasma_raw_data, "plasma_raw_data")
# ginfo(csf_raw_data, "csf_raw_data")

# Stack the "Entry" columns (plasma first, then CSF) under a fresh index.
# This replaced the older plasma_raw_data.Entry.append(csf_raw_data.Entry) call.
total_ids = pd.concat(
    [sheet["Entry"] for sheet in (plasma_raw_data, csf_raw_data)],
    ignore_index=True,
)

# History (10/17/2025): a manual loop that appended every unseen id to a list
# was replaced by Series.unique() to improve efficiency.
unique_ids = total_ids.unique()

## Use this for Importing New Data From UniProt

In [4]:
# Working UniProt REST search that uses the text name 'Cytoplasm' as a keyword,
# restricted to organism_id 9606 and reviewed entries, returning only the
# accession column as TSV.
# NOTE(review): size=500 presumably limits the response to the first 500
# matches -- confirm whether the remaining pages need to be fetched as well.
cytoplasm_link = 'https://rest.uniprot.org/uniprotkb/search?query=(keyword:Cytoplasm)%20AND%20(organism_id:9606)%20AND%20(reviewed:true)&format=tsv&fields=accession&size=500'

# Fetch the table. requests applies no timeout unless one is given, so an
# unresponsive server would block this cell forever; fail after 60 s instead.
response = requests.get(cytoplasm_link, timeout=60)
response.raise_for_status()  # surface HTTP errors instead of parsing an error page

# Parse the tab-separated body and show the distinct accessions.
tsv_data = StringIO(response.text)
df = pd.read_csv(tsv_data, sep='\t')
print(df["Entry"].unique())

['A0JNW5' 'A1A4S6' 'A1X283' 'A2RUB6' 'A5D8V7' 'A7E2V4' 'A9UHW6' 'B1AK53'
 'B3KU38' 'C9JLW8' 'C9JRZ8' 'O00159' 'O00165' 'O00214' 'O00291' 'O00329'
 'O00422' 'O00444' 'O00505' 'O00506' 'O00560' 'O00635' 'O14503' 'O14508'
 'O14613' 'O14618' 'O14640' 'O14744' 'O14757' 'O14788' 'O14907' 'O14908'
 'O14926' 'O14960' 'O14979' 'O15067' 'O15078' 'O15111' 'O15116' 'O15162'
 'O15169' 'O15247' 'O15259' 'O15265' 'O15297' 'O15318' 'O15488' 'O15511'
 'O15540' 'O43166' 'O43237' 'O43264' 'O43294' 'O43310' 'O43474' 'O43543'
 'O43548' 'O43556' 'O43566' 'O43598' 'O43609' 'O43808' 'O43823' 'O43865'
 'O43866' 'O43903' 'O60266' 'O60307' 'O60481' 'O60502' 'O60678' 'O60687'
 'O60759' 'O60784' 'O60879' 'O60921' 'O60930' 'O60941' 'O75072' 'O75132'
 'O75175' 'O75190' 'O75334' 'O75344' 'O75369' 'O75449' 'O75618' 'O75674'
 'O75679' 'O75815' 'O75821' 'O75828' 'O75838' 'O75897' 'O75935' 'O75952'
 'O75956' 'O76039' 'O76075' 'O94762' 'O94898' 'O94964' 'O94993' 'O95153'
 'O95154' 'O95155' 'O95208' 'O95237' 'O95248' 'O952

In [5]:
# Candidate UniProt query URLs. No executable statement in this cell uses them;
# only the disabled snippet further down refers to cytoplasm_link.
# NOTE(review): these look like older www.uniprot.org query URLs -- confirm
# they still resolve before relying on them.
covid_link = 'https://covid-19.uniprot.org/uniprotkb?query=id&format=html'
# NOTE: this rebinds cytoplasm_link, replacing the rest.uniprot.org URL that the
# UniProt fetch cell assigned to the same name.
cytoplasm_link = 'https://www.uniprot.org/uniprot/?query=locations:(location:%22Cytoplasm%20%5BSL-0086%5D%22)&fil=organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22+AND+reviewed%3Ayes&limit=1000'
dendritic_spine ='https://www.uniprot.org/uniprot/?query=locations:(location:%22Dendritic%20spine%20%5BSL-0284%5D%22)&fil=organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22+AND+reviewed%3Ayes&limit=150'
clatherin_pit = 'https://www.uniprot.org/uniprot/?query=locations:(location:%22Clathrin-coated%20pit%20%5BSL-0069%5D%22)&fil=organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22+AND+reviewed%3Ayes'
nucleus = 'https://www.uniprot.org/uniprot/?query=locations%3A%28location%3A%22Nucleus+%5BSL-0191%5D%22%29+reviewed%3Ayes+organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22&sort=score&limit=1000'

# The triple-quoted string below is disabled code kept for reference: it is a
# bare string expression, so nothing inside it runs. It used pd.read_html on
# cytoplasm_link and de-duplicated the "Entry" column with a manual loop.
"""
# updated cytoplasm link
verification_entries = pd.read_html(cytoplasm_link, header=0)[0]["Entry"]

# The custom link must be updated with the correct sequence

unique_ids = []

for i in verification_entries.to_list():
    if i not in unique_ids:
        unique_ids.append(i)
"""

# Overwrite unique_ids with the distinct accessions in df. This cell does not
# define df: it must already exist in the kernel (the UniProt fetch cell creates it).
unique_ids = df["Entry"].unique()


## Generate the fasta file

In [6]:
# Hand-picked UniProt accessions to download FASTA records for.
chosen_list = [
    "Q01995",
    "Q13148",
    "P16070",
    "P00698",
    "P00338",
    "P07998",
    "Q8MU52",
    "O00560",
]


In [7]:
unique_ids_list = chosen_list  ### put the list of accessions to download here
fasta_filename = "chosen_fasta"
####### Don't touch below here #####

# Download one FASTA record per accession. A failed download is recorded in
# error_list (inspect it afterwards) and does not stop the remaining downloads.
fasta_records = []
error_list = []
for protein in unique_ids_list:
    print(protein)
    try:
        current_fasta = protein_data_scraping_fasta(protein)
    except Exception:
        # Best effort: remember the accession that failed and move on.
        error_list.append(protein)
    else:
        fasta_records.append(current_fasta)

# Join with newlines between records only. The previous '\n' + record
# accumulation left a blank first line at the top of the file.
total_fasta = '\n'.join(fasta_records)

# Write the combined file once, after the loop, instead of rewriting it on
# every iteration; the with-block closes the handle even if the write fails.
with open("fasta_data/" + fasta_filename + '.txt', "w") as f:
    f.write(total_fasta)

Q01995
https://www.uniprot.org/uniprot/Q01995.fasta
Q13148
https://www.uniprot.org/uniprot/Q13148.fasta
P16070
https://www.uniprot.org/uniprot/P16070.fasta
P00698
https://www.uniprot.org/uniprot/P00698.fasta
P00338
https://www.uniprot.org/uniprot/P00338.fasta
P07998
https://www.uniprot.org/uniprot/P07998.fasta
Q8MU52
https://www.uniprot.org/uniprot/Q8MU52.fasta
O00560
https://www.uniprot.org/uniprot/O00560.fasta


In [8]:
# Show the accessions whose FASTA download raised an exception
# (an empty list means every download succeeded).
print(error_list)

[]


# Go to the NetSurfP website and submit the data; after completion, paste the results into an Excel file for import into the program

Use Excel to split the data into columns; this requires titling each column according to the key NetSurfP provides.

# Processing NetSurfP data for use in future notebooks

In [9]:
# Load the NetSurfP v2 results that were saved as an Excel workbook in data/.
complete_netsurfp = pd.read_excel("data/netsurfp2_lcms_proteins_data.xlsx")




In [10]:
# Preview the NetSurfP table read from the Excel workbook.
print(complete_netsurfp)

                          id seq   n       rsa         asa q3   p[q3_H]  \
0       sp_P01019_ANGT_HUMAN   M   1  0.826128  165.308309  C  0.000003   
1       sp_P01019_ANGT_HUMAN   R   2  0.750999  171.978833  C  0.000011   
2       sp_P01019_ANGT_HUMAN   K   3  0.756331  155.577329  C  0.000017   
3       sp_P01019_ANGT_HUMAN   R   4  0.730103  167.193563  C  0.000011   
4       sp_P01019_ANGT_HUMAN   A   5  0.659811   72.711135  C  0.000020   
...                      ...  ..  ..       ...         ... ..       ...   
89891  sp_P02656_APOC3_HUMAN   S  95  0.872572  102.265425  C  0.012978   
89892  sp_P02656_APOC3_HUMAN   A  96  0.846348   93.267515  C  0.007286   
89893  sp_P02656_APOC3_HUMAN   V  97  0.866765  133.221756  C  0.002452   
89894  sp_P02656_APOC3_HUMAN   A  98  0.893835   98.500644  C  0.001597   
89895  sp_P02656_APOC3_HUMAN   A  99  0.967148  106.579704  C  0.000021   

            p[q3_E]   p[q3_C] q8  ...       p[q8_H]       p[q8_I]  \
0      3.821910e-07  0.999996 

In [11]:
# Show the ids currently held in unique_ids (whichever earlier cell assigned it last).
print(unique_ids)

['A0JNW5' 'A1A4S6' 'A1X283' 'A2RUB6' 'A5D8V7' 'A7E2V4' 'A9UHW6' 'B1AK53'
 'B3KU38' 'C9JLW8' 'C9JRZ8' 'O00159' 'O00165' 'O00214' 'O00291' 'O00329'
 'O00422' 'O00444' 'O00505' 'O00506' 'O00560' 'O00635' 'O14503' 'O14508'
 'O14613' 'O14618' 'O14640' 'O14744' 'O14757' 'O14788' 'O14907' 'O14908'
 'O14926' 'O14960' 'O14979' 'O15067' 'O15078' 'O15111' 'O15116' 'O15162'
 'O15169' 'O15247' 'O15259' 'O15265' 'O15297' 'O15318' 'O15488' 'O15511'
 'O15540' 'O43166' 'O43237' 'O43264' 'O43294' 'O43310' 'O43474' 'O43543'
 'O43548' 'O43556' 'O43566' 'O43598' 'O43609' 'O43808' 'O43823' 'O43865'
 'O43866' 'O43903' 'O60266' 'O60307' 'O60481' 'O60502' 'O60678' 'O60687'
 'O60759' 'O60784' 'O60879' 'O60921' 'O60930' 'O60941' 'O75072' 'O75132'
 'O75175' 'O75190' 'O75334' 'O75344' 'O75369' 'O75449' 'O75618' 'O75674'
 'O75679' 'O75815' 'O75821' 'O75828' 'O75838' 'O75897' 'O75935' 'O75952'
 'O75956' 'O76039' 'O76075' 'O94762' 'O94898' 'O94964' 'O94993' 'O95153'
 'O95154' 'O95155' 'O95208' 'O95237' 'O95248' 'O952

In [12]:
# Swap unique_ids for the names that can be found within
# netsurfp2_lcms_proteins_data.xlsx: read the header-less text file and keep
# its first column as an array.
unique_names_table = pd.read_csv("fasta_data/unique_names.txt", header=None)
unique_ids = unique_names_table.values[:, 0]

In [13]:
# Show the ids after unique_ids was reloaded from fasta_data/unique_names.txt.
print(unique_ids)

['P02768' 'P00738' 'P01834' 'P01857' 'P02787' 'P02647' 'P0C0L4' 'Q9NYB0'
 'P01009' 'P02765' 'P02652' 'P01876' 'Q13683' 'P01023' 'P01024' 'P01031'
 'P02790' 'P02763' 'P01871' 'P02749' 'P0CG04' 'Q96G46' 'P01859' 'P04114'
 'P07477' 'P02760' 'P02774' 'P08603' 'P01042' 'P00747' 'P04004' 'P00450'
 'P02766' 'P01861' 'P01860' 'P30613' 'P00751' 'P10909' 'Q8IYD9' 'Q5T7B8'
 'P19652' 'Q6UVM3' 'P02747' 'Q9ULW3' 'Q16584' 'P02751' 'Q5SVS4' 'P19827'
 'P01011' 'Q15077' 'Q13671' 'P0DOY2' 'P04003' 'P06727' 'P01008' 'A6NEW6'
 'P19823' 'P06396' 'P58397' 'P05156' 'P00734' 'P04217' 'P07225' 'P00736'
 'P12259' 'P04196' 'P01019' 'Q13790' 'Q08380' 'P01591' 'Q8NEX9' 'P09871'
 'Q16548' 'P02649' 'P27169' 'P00739' 'Q99996' 'Q9Y5S2' 'P00748' 'Q9UQB3'
 'P02746' 'Q5JTD0' 'P07357' 'P02671' 'Q96PD5' 'P10643' 'O43318' 'Q8TE68'
 'P0C0L5' 'P25311' 'P01593' 'Q14520' 'P07358' 'P02748' 'Q14624' 'P23142'
 'Q15527' 'P03952' 'O43866' 'P08697' 'P05452' 'Q96IY4' 'P13671' 'P80108'
 'Q9BT78' 'P36955' 'P07360' 'P0DJI8' 'P04275' 'P042

In [14]:
# Process the NetSurfP data into feature columns for the ids in unique_ids.
processed_data = netsurfp_2_data_processing(unique_ids, complete_netsurfp)

# Report the table's shape, then save it to data/ for use in future notebooks.
print(processed_data.shape)
processed_data.to_excel("data/netsurfp_2_lcms_proteins_processed_updated.xlsx")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['class assignment'] = np.where(filtered.rsa > 0.25, 'E', 'B')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['class assignment'] = np.where(filtered.rsa > 0.25, 'E', 'B')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['class assignment'] = np.where(filtered.rsa > 0.25, 'E

(135, 55)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['class assignment'] = np.where(filtered.rsa > 0.25, 'E', 'B')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered['class assignment'] = np.where(filtered.rsa > 0.25, 'E', 'B')
