In [1]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
# from sklearn.metrics import log_loss, f1_score, fbeta_score, recall_score, precision_score, confusion_matrix
import urllib.request, json
from skimage.filters import threshold_otsu
from pprint import pprint

# Homemade functions required
from data_prep_functions import *
from interpro_scraping import interpro_scraping_pandas
from uniprot_netsurfp_scraping import *

# Generate features and names from predownloaded properties and LC-MS/MS data 

In [2]:
############# EDIT, FILL IN DATA HERE ############
data_file = '2020-02-03 MS compiled for ML project.xlsx'  # needs xlsx at the end 

plasma_prop_sheet = 'Protein properties, plasma'
plasma_mass_spec_sheet = 'PNPs, plasma' #'(GT)6-SWCNT, plasma'

csf_prop_sheet = 'Protein properties, CSF'
csf_mass_spec_sheet = 'PNPs, CSF' #'(GT)15-SWCNT, CSF'
###################################################

In [3]:
############ NO EDITS REQUIRED ##############

data_filepath = "data/" + data_file

# load data

plasma_raw_data = pd.read_excel(data_filepath, sheet_name= plasma_prop_sheet, thousands=',')
csf_raw_data = pd.read_excel(data_filepath, sheet_name= csf_prop_sheet, thousands=',')

plasma_mass_spec_data = pd.read_excel(data_filepath, sheet_name=plasma_mass_spec_sheet, header=2, thousands=',')
csf_mass_spec_data = pd.read_excel(data_filepath, sheet_name=csf_mass_spec_sheet, header=2, thousands=',')

# clean up data 

plasma_cleaned_data = clean_up_data_biopy(plasma_raw_data)# clean_up_data_v2(plasma_raw_data) 
csf_cleaned_data = clean_up_data_biopy(csf_raw_data) #clean_up_data_v2(csf_raw_data) 


plasma_cleaned_data, csf_cleaned_data = normalize_mass_length(plasma_cleaned_data, csf_cleaned_data)

FileNotFoundError: [Errno 2] No such file or directory: 'data/2020-02-03 MS compiled for ML project.xlsx'

In [48]:
netsurfp_data = pd.read_excel("data/"+'netsurfp_2_lcms_proteins_processed_updated.xlsx')

plasma_complete_data = pd.merge(plasma_cleaned_data, netsurfp_data, left_on='Entry', right_on='entry')
csf_complete_data = pd.merge(csf_cleaned_data, netsurfp_data, left_on='Entry', right_on='entry')

for df in [plasma_complete_data, csf_complete_data]:
    for col in ['asa_sum']:
        df[col+'_normalized'] = df[col] / df['Mass']

csf_complete_data.head()

Unnamed: 0,Entry,Protein names,Sequence,Length,Mass,frac_aa_A,frac_aa_C,frac_aa_D,frac_aa_E,frac_aa_F,...,fraction_exposed_exposed_S,fraction_exposed_exposed_T,fraction_exposed_exposed_V,fraction_exposed_exposed_W,fraction_exposed_exposed_Y,nsp_secondary_structure_coil,nsp_secondary_structure_sheet,nsp_secondary_structure_helix,nsp_disordered,asa_sum_normalized
0,P02768,Serum albumin,MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGEENFKAL...,609,69367,0.103448,0.057471,0.059113,0.101806,0.057471,...,0.05,0.067647,0.035294,0.002941,0.011765,0.286,0.0,0.714,0.038,0.464794
1,P02766,Transthyretin (ATTR) (Prealbumin) (TBPA),MASHRLLLLCLAGLVFVSEAGPTGTGESKCPLMVKVLDAVRGSPAI...,147,15887,0.102041,0.013605,0.034014,0.088435,0.040816,...,0.105263,0.063158,0.042105,0.010526,0.021053,0.551,0.381,0.068,0.197,0.602589
2,P01009,Alpha-1-antitrypsin (Alpha-1 protease inhibito...,MPSSVSWGILLLAGLCCLVPVSLAEDPQGDAAQKTDTSHHDQDHPT...,418,46737,0.062201,0.007177,0.057416,0.076555,0.064593,...,0.076577,0.067568,0.031532,0.004505,0.0,0.4,0.33,0.27,0.105,0.463058
3,P41222,Prostaglandin-H2 D-isomerase (EC 5.3.99.2) (Be...,MATHHTLWMGLALLGVLGDLQAAPEAQVSVQPNFQQDKFLGRWFSA...,190,21029,0.089474,0.021053,0.042105,0.047368,0.047368,...,0.071429,0.095238,0.039683,0.02381,0.0,0.463,0.4,0.137,0.137,0.568527
4,P02787,Serotransferrin (Transferrin) (Beta-1 metal-bi...,MRLAVGALLVCAVLGLCLAVPDKTVRWCAVSEHEATKCQSFRDHMK...,698,77064,0.087393,0.057307,0.06447,0.060172,0.040115,...,0.06213,0.047337,0.026627,0.0,0.005917,0.474,0.186,0.34,0.03,0.419599


In [49]:
###### MASS SPEC DATA, LIKELY REQUIRED (THRESHOLD METHOD) ########

plasma_labels = plasma_mass_spec_data[['Accession', 'NP average','NP fold change']]
csf_labels = csf_mass_spec_data[['Accession', 'NP average', 'NP fold change']]

## For OTSU, Comment out if not using ###

# plasma_labels['Corona'] = (plasma_labels['NP average'] > criteria_plas).astype(int)
# csf_labels['Corona'] = (csf_labels['NP average'] > criteria_csf).astype(int)

## for TRAVIS, Comment out if not using ## 

thresh_power = 2.25 # change based on previous cell
fluids_type_list=['plasma', 'csf']
data_sheets = [plasma_labels, csf_labels]
for k in range(2):
    sheet = data_sheets[k]
    fluid_type = fluids_type_list[k]

    print(f'\n{fluid_type.capitalize()} running')

    num_proteins = np.zeros(100)
    for i in np.arange(0,100,1):
        thresh = i/100
        index = (sheet['NP average']>thresh)
        num_proteins[i] = (np.count_nonzero(index))

    x = np.arange(0,1,.01)
    num_proteins_above_1_abundance = min(num_proteins)
    biexponent_dist = num_proteins - num_proteins_above_1_abundance
    num_protein_thresh = np.max(biexponent_dist)/pow(np.e,thresh_power)
    cutoff_thresh_value = min(x[(biexponent_dist<num_protein_thresh)&(biexponent_dist>0)], default=0)
    sheet['Corona'] = ((sheet['NP average']>cutoff_thresh_value) | (sheet['NP fold change']>1)).astype(int)
    


### end different thresh methods



plasma_labels = plasma_labels.drop(['NP average', 'NP fold change'], axis=1)
csf_labels = csf_labels.drop(['NP average', 'NP fold change'], axis=1)



# checks for any accession lists
plasma_labels = accession_expansion(plasma_labels) 
csf_labels = accession_expansion(csf_labels)

plasma_total_data_names = pd.merge(plasma_labels, plasma_complete_data, left_on='Accession', right_on='Entry')
csf_total_data_names = pd.merge(csf_labels, csf_complete_data, left_on='Accession', right_on='Entry')

plasma_to_export =plasma_total_data_names.drop(['Accession','Entry', 'entry', 'Sequence', 'Length', 'Mass'], axis=1)
csf_to_export = csf_total_data_names.drop(['Accession','Entry', 'entry', 'Sequence', 'Length', 'Mass'], axis=1)

plasma_to_export.to_excel("data/"+'pnp_plasma_features_names_biopy_gravy.xlsx')
csf_to_export.to_excel("data/"+'pnp_csf_features_names_biopy_gravy.xlsx')



Plasma running

Csf running


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sheet['Corona'] = ((sheet['NP average']>cutoff_thresh_value) | (sheet['NP fold change']>1)).astype(int)


# Download protein property data

 ## use this if you have a list of protein targets

In [42]:
# only_uniprot_data = pd.read_excel('uniprot_1000_cytoplasm.xlsx')['Entry'] # for use when creating no netsurfp sets
chosen_list = ["Q01995", 'Q13148', 'P16070', 'P00698', 'P00338', 'P07998', 'Q8MU52', 'O00560'] 
test_data = uniprot_data_scraping(chosen_list)  # replace with only_uniprot_data for no netsurf p

(1, 63)
(1, 63)
(1, 63)
(1, 63)
(1, 63)
(1, 63)


In [43]:

test_data['Calcium binding'] = 0
test_data['Activity regulation'] = 0
test_data.head()

for i in range(8):

    testing = test_data.iloc[i*1:i*1 + 1]
    try:
        testing_cleaned = clean_up_data_biopy(testing)
    except:
        print(f'error at {i}')

test_data_corrected = test_data.copy()#.drop([405, 680, 707, 792]) # run above once and run again with error drops if there are any
test_cleaned_data = clean_up_data_biopy(test_data_corrected)


plasma_raw_data = pd.read_excel(data_filepath, sheet_name= plasma_prop_sheet, thousands=',')
plasma_cleaned_data = clean_up_data_biopy(plasma_raw_data)

test_cleaned_data['length'] = test_cleaned_data['Length'] / plasma_cleaned_data.Length.max()
test_cleaned_data['mass'] = test_cleaned_data['Mass'] / plasma_cleaned_data.Mass.max()
test_cleaned_data['molecular_weight'] = test_cleaned_data['molecular_weight'] / plasma_cleaned_data.molecular_weight.max()
plasma_cleaned_data, test_cleaned_data = normalize_mass_length(plasma_cleaned_data, test_cleaned_data)

In [44]:
netsurfp_test_data = pd.read_excel("data/"+'netsurfp_2_proteins_selected_for_testing_processed_updated.xlsx')  # load in the netsurfp data from the same set of data

test_complete_data = pd.merge(test_cleaned_data, netsurfp_test_data, left_on='Entry', right_on='entry')


for df in [test_complete_data]:
    for col in ['asa_sum']:
        df[col+'_normalized'] = df[col] / df['Mass']

test_complete_data.head()

Unnamed: 0,Entry,Protein names,Sequence,Length,Mass,frac_aa_A,frac_aa_C,frac_aa_D,frac_aa_E,frac_aa_F,...,fraction_exposed_exposed_S,fraction_exposed_exposed_T,fraction_exposed_exposed_V,fraction_exposed_exposed_W,fraction_exposed_exposed_Y,nsp_secondary_structure_coil,nsp_secondary_structure_sheet,nsp_secondary_structure_helix,nsp_disordered,asa_sum_normalized
0,Q01995,Transgelin (22 kDa actin-binding protein) (Pro...,MANKGPSYGMSREVQSKIEKKYDEELEERLVEWIIVQCGPDVGRPD...,201,22611,0.054726,0.004975,0.049751,0.074627,0.034826,...,0.079137,0.028777,0.043165,0.0,0.028777,0.607,0.0,0.393,0.065,0.560393
1,Q13148,TAR DNA-binding protein 43 (TDP-43),MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNP...,414,44740,0.062802,0.014493,0.05314,0.05314,0.05314,...,0.126246,0.019934,0.016611,0.016611,0.009967,0.676,0.198,0.126,0.341,0.576662
2,P16070,CD44 antigen (CDw44) (Epican) (Extracellular m...,MDKFWWHAAWGLCLVPLSLAQIDLNITCRFAGVFHVEKNGRYSISR...,742,81538,0.056604,0.012129,0.061995,0.067385,0.030997,...,0.125547,0.129927,0.029197,0.021898,0.013139,0.902,0.073,0.026,0.798,0.799411
3,P00698,"Lysozyme C (EC 3.2.1.17) (1,4-beta-N-acetylmur...",MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNW...,147,16239,0.095238,0.061224,0.047619,0.013605,0.027211,...,0.087912,0.076923,0.032967,0.0,0.021978,0.537,0.075,0.388,0.122,0.550834
4,P00338,L-lactate dehydrogenase A chain (LDH-A) (EC 1....,MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLAD...,332,36689,0.054217,0.01506,0.054217,0.054217,0.021084,...,0.036585,0.054878,0.02439,0.012195,0.042683,0.319,0.205,0.476,0.006,0.433721


In [45]:
# test_data_done=test_complete_data.drop(['Entry', 'entry', 'Sequence', 'Length', 'Mass', 'Accession', 'Unnamed: 0'], axis=1)
test_data_done=test_complete_data.drop(['Entry', 'entry', 'Sequence', 'Unnamed: 0', 'Length', 'Mass'], axis=1)
# test_data_done.head()
test_data_done.shape
test_data_done.to_excel("data/"+'proteins_selected_for_testing_complete_updated.xlsx')

## use this if you want to download data based off of a link

In [12]:
### done to check bias in our classifier #### 
covid_link = 'https://covid-19.uniprot.org/uniprotkb?query=id&format=html'
cytoplasm_link = 'https://www.uniprot.org/uniprot/?query=locations:(location:%22Cytoplasm%20%5BSL-0086%5D%22)&fil=organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22+AND+reviewed%3Ayes&limit=75'
dendritic_spine ='https://www.uniprot.org/uniprot/?query=locations:(location:%22Dendritic%20spine%20%5BSL-0284%5D%22)&fil=organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22+AND+reviewed%3Ayes&limit=150'
clatherin_pit = 'https://www.uniprot.org/uniprot/?query=locations:(location:%22Clathrin-coated%20pit%20%5BSL-0069%5D%22)&fil=organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22+AND+reviewed%3Ayes'
nucleus = 'https://www.uniprot.org/uniprot/?query=locations%3A%28location%3A%22Nucleus+%5BSL-0191%5D%22%29+reviewed%3Ayes+organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22&sort=score&limit=120'

entry_check = pd.read_html(nucleus, header=0)
entry_list = entry_check[0]['Entry']

verification_set = uniprot_data_scraping(entry_list)

verification_set['Calcium binding'] = 0
verification_set['Activity regulation'] = 0
verification_cleaned_data = clean_up_data_biopy(verification_set)


plasma_raw_data = pd.read_excel(data_file, sheet_name= plasma_prop_sheet, thousands=',')
plasma_cleaned_data = clean_up_data_biopy(plasma_raw_data)
plasma_cleaned_data, test_cleaned_data = normalize_mass_length(plasma_cleaned_data, verification_cleaned_data)

# netsurfp_verification_data = pd.read_excel('netsurfp_verification_data_processed_nucleus.xlsx')

# verification_complete_data = pd.merge(verification_cleaned_data, netsurfp_verification_data, left_on='Entry', right_on='entry')


# for df in [verification_complete_data]:
#     for col in ['asa_sum']:
#         df[col+'_normalized'] = df[col] / df['Mass']
test_cleaned_data.head()
# verification_complete_data.head()