In [6]:
import os
import sys

sys.path.extend([
    "C:\\Users\\Wyss User\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages",
    "C:\\Users\\Wyss User\\Documents\\EVs\\OLINK\\src",
])

from brainrnaseq_specificity import map_hgnc_ids, create_enrichment_dataframe, calculate_enrichment
from deeptmhmm_localization import identify_localization, parse_gz_file
from identify_targets import identify_targets
from olink_fractionation import analyze_fractionation
from raw_data_preprocessing import clean_up_raw_data, plot_protein_fractionation, ev_association_score_df, find_ratio

from io import StringIO

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns

os.chdir("C:\\Users\\Wyss User\\Documents\\EVs\\OLINK")

In [7]:
assay_list_path = "data\\231220_ht_panel_assay_list.xlsx"
brain_rna_seq_raw_path = "data\\240411_brain_rna_seq_raw.csv"
output_directory = "outputs\\ht_output"
plate_layout_path = "data\\231204_Walt_Olink_HT_Plate.xlsx"
raw_data = "data\\240214_Walt_Olink_HT_Raw.parquet"
uniprot_fasta_database = "data\\uniprot_fasta_database.gz"

In [8]:
tidy_data = clean_up_raw_data(raw_data, plate_layout_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_data.loc[:, "Linear NPX"] = unique_data["PCNormalizedNPX"].map(lambda x: 2**x)


In [9]:
# SI tables 4 and 5

# Find proteins with a tau score of less than 0.25
hgnc_ids = (
    "https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt"
)
brain_rna_seq = pd.read_csv(brain_rna_seq_raw_path)

hgnc_uniprot_mapping_data = pd.read_csv(
    (StringIO(requests.get(hgnc_ids).text)),
    sep="\t",
    usecols=["hgnc_id", "uniprot_ids", "symbol", "name", "alias_symbol", "alias_name"],
)

hgnc_uniprot_mapping_data["uniprot_ids"] = hgnc_uniprot_mapping_data[
    "uniprot_ids"
].str.split("|")
hgnc_uniprot_mapping_data = hgnc_uniprot_mapping_data.explode("uniprot_ids")
hgnc_uniprot_mapping_data = hgnc_uniprot_mapping_data.reset_index(drop=True)

brain_rna_seq = pd.merge(
    brain_rna_seq,
    hgnc_uniprot_mapping_data,
    left_on="id",
    right_on="hgnc_id",
    how="inner",
)
brain_rna_seq.dropna(subset=["uniprot_ids"], inplace=True)
brain_rna_seq.drop_duplicates(subset=["uniprot_ids"], inplace=True)
brain_rna_seq.set_index(["uniprot_ids", "symbol", "name", "alias_symbol", "alias_name"], inplace = True)

expression_df = create_enrichment_dataframe(brain_rna_seq)

enrichment_values = expression_df.apply(
            lambda row: calculate_enrichment(row, "tau"), axis=1
        )

low_tau_score = enrichment_values[enrichment_values < 0.25]

# Find proteins with the correct fractionation pattern
fractionation_uniprot_ids = analyze_fractionation(
    tidy_data,
    ["9", "10"],
    ["7", "11", "12", "13"],
    sample_health="healthy",
    mean_median_individual="individual_median",
)

#Find internal and transmembrane proteins
fasta_sequences = parse_gz_file(uniprot_fasta_database)
fasta_sequences.update(
    {
        "NTproBNP": "HPLGSPGSASDLETSGLQEQRNHLQGKLSELQVEQTSLEPLQESPRPTGVWKSREVATEGIRGHRKMVLYTLRAPR",
        "O43521-2": "MAKQPSDVSSECDREGRQLQPAERPPQLRPGAPTSLQTEPQDRSPAPMSCDKSTQTPSPPCQAFNHYLSAMASMRQAEPADMRPEIWIAQELRRIGDEFNAYYARRVFLNNYQAAEDHPRMVILRLLRYIVRLVWRMH",
        "Q13114-2": "MESSKKMDSPGALQTNPPLKLHTDRSAGTPVFVPEQGGYKEKFVKTVEDKYKCEKCHLVLCSPKQTECGHRFCESCMAALLSSSSPKCTACQESIVKDKVFKDNCCKREILALQIYCRNESRGCAEQLMLGHLLVHLKNDCHFEELPCVRPDCKEKVLRKDLRDHVEKACKYREATCSHCKSQVPMIALQVSLLQNESVEKNKSIQSLHNQICSFEIEIERQKEMLRNNESKILHLQRVIDSQAEKLKELDKEIRPFRQNWEEADSMKSSVESLQNRVTELESVDKSAGQVARNTGLLESQLSRHDQMLSVHDIRLADMDLRFQVLETASYNGVLIWKIRDYKRRKQEAVMGKTLSLYSQPFYTGYFGYKMCARVYLNGDGMGKGTHLSLFFVIMRGEYDALLPWPFKQKVTLMLMDQGSSRRHLGDAFKPDPNSSSFKKPTGEMNIASGCPVFVAQTVLENGTYIKDDTIFIKVIVDTSDLPDP",
        "O75882-2": "MVAAAAATEARLRRRTAATAALAGRSGGPHWDWDVTRAGRPGLGAGLRLPRLLSPPLRPRLLLLLLLLSPPLLLLLLPCEAEAAAAAAAVSGSAAAEAKECDRPCVNGGRCNPGTGQCVCPAGWVGEQCQHCGGRFRLTGSSGFVTDGPGNYKYKTKCTWLIEGQPNRIMRLRFNHFATECSWDHLYVYDGDSIYAPLVAAFSGLIVPERDGNETVPEVVATSGYALLHFFSDAAYNLTGFNITYSFDMCPNNCSGRGECKISNSSDTVECECSENWKGEACDIPHCTDNCGFPHRGICNSSDVRGCSCFSDWQGPGCSVPVPANQSFWTREEYSNLKLPRASHKAVVNGNIMWVVGGYMFNHSDYNMVLAYDLASREWLPLNRSVNNVVVRYGHSLALYKDKIYMYGGKIDSTGNVTNELRVFHIHNESWVLLTPKAKEQYAVVGHSAHIVTLKNGRVVMLVIFGHCPLYGYISNVQEYDLDKNTWSILHTQGALVQGGYGHSSVYDHRTRALYVHGGYKAFSANKYRLADDLYRYDVDTQMWTILKDSRFFRYLHTAVIVSGTMLVFGGNTHNDTSMSHGAKCFSSDFMAYDIACDRWSVLPRPDLHHDVNRFGHSAVLHNSTMYVFGGFNSLLLSDILVFTSEQCDAHRSEAACLAAGPGIRCVWNTGSSQCISWALATDEQEEKLKSECFSKRTLDHDRCDQHTDCYSCTANTNDCHWCNDHCVPRNHSCSEGQISIFRYENCPKDNPMYYCNKKTSCRSCALDQNCQWEPRNQECIALPENICGIGWHLVGNSCLKITTAKENYDNAKLFCRNHNALLASLTTQKKVEFVLKQLRIMQSSQSMSKLTLTPWVGLRKINVSYWCWEDMSPFTNSLLQWMPSEPSDAGFCGILSEPSTRGLKAATCINPLNGSVCERPANHSAKQCRTPCALRTACGDCTSGSSECMWCSNMKQCVDSNAYVASFPFGQCMEWYTMSTCPPENCSGYCTCSHCLEQPGCGWCTDPSNTGKGKCIEGSYKGPVKMPSQAPTGNFYPQPLLNSSMCLEDSRYNWSFIHCPACQCNGHSKCINQSICEKCENLTTGKHCETCISGFYGDPTNGGKCQPCKCNGHASLCNTNTGKCFCTTKGVKGDECQLCEVENRYQGNPLRGTCYYTLLIDYQFTFSLSQEDDRYYTAINFVATPDEQNRDLDMFINASKNFNLNITWAASFSAGTQAGEEMPVVSKTNIKEYKDSFSNEKFDFRNHPNITFFVYVSNFTWPIKIQVQTE",
        "Q8WXW3-4": "MSRKISKESKKVNISSSLESEDISLETTVPTDDISSSEEREGKVRITRQLIERKELLHNIQLLKIELSQKTMMIDNLKVDYLTKIEELEEKLNDALHQKQLLTLRLDNQLAFQQKDASKYQELMKQEMETILLRQKQLEETNLQLREKAGDVRRNLRDFELTEEQYIKLKAFPEDQLSIPEYVSVRFYELVNPLRKEICELQVKKNILAEELSTNKNQLKQLTEELAAMKQILVKMHSKHSENSLLLTKTEPKHVTENQKSKTLNVPKEHEDNIFTPKPTLFTKKEAPEWSKKQKMKT",
    }
)
assays = pd.read_excel(assay_list_path)
assays["Sequence"] = assays["UniProt ID"].map(
    lambda x: fasta_sequences.get(x, "N/A")
)
tm_uniprot_ids = identify_localization(assays, "TMhelix", output_directory)
int_uniprot_ids = identify_localization(assays, "internal", output_directory)

# List of proteins that are internal with the correct fractionation pattern and a low tau score
int_low_tau = low_tau_score[
    (low_tau_score.index.get_level_values("uniprot_ids").isin(int_uniprot_ids))
    & (low_tau_score.index.get_level_values("uniprot_ids").isin(fractionation_uniprot_ids))
    ]

# List of proteins that are transmembrane with the correct fractionation pattern and a low tau score
tm_low_tau = low_tau_score[
    (low_tau_score.index.get_level_values("uniprot_ids").isin(tm_uniprot_ids))
    & (low_tau_score.index.get_level_values("uniprot_ids").isin(fractionation_uniprot_ids))
    ]

# Add fractionation scores to the dataframe of internal proteins with low tau scores
int_low_tau_fractionation_scores = []
for protein in list(int_low_tau.index.get_level_values("uniprot_ids")):
    df = tidy_data[protein]
    ratio = find_ratio(df, ["9", "10"], ["7", "11" "12", "13"])
    int_low_tau_fractionation_scores.append(ratio)

int_low_tau = int_low_tau.reset_index()
int_low_tau["ev_association_score"] = int_low_tau_fractionation_scores

# Add fractionation scores to the dataframe of transmembrane proteins with low tau scores
tm_low_tau_fractionation_scores = []
for protein in list(tm_low_tau.index.get_level_values("uniprot_ids")):
    df = tidy_data[protein]
    ratio = find_ratio(df, ["9", "10"], ["7", "11" "12", "13"])
    tm_low_tau_fractionation_scores.append(ratio)

tm_low_tau = tm_low_tau.reset_index()
tm_low_tau["ev_association_score"] = tm_low_tau_fractionation_scores

In [10]:
# SI tables 2 and 3
int_with_fract_pattern = set(int_uniprot_ids) & set(fractionation_uniprot_ids)
tm_with_fract_pattern = set(tm_uniprot_ids) & set(fractionation_uniprot_ids)

tm_fractionation_scores = []
tm_proteins = []

for protein in tm_with_fract_pattern:
    df = tidy_data[protein]
    ratio = find_ratio(df, ["9", "10"], ["7", "11" "12", "13"])
    tm_fractionation_scores.append(ratio)
    tm_proteins.append(protein)

int_fractionation_scores = []
int_proteins = []

for protein in int_with_fract_pattern:
    df = tidy_data[protein]
    ratio = find_ratio(df, ["9", "10"], ["7", "11" "12", "13"])
    int_fractionation_scores.append(ratio)
    int_proteins.append(protein)

tm_with_fract_df = pd.DataFrame({"uniprot_ids": tm_proteins, "ev_association_score": tm_fractionation_scores})
int_with_fract_df = pd.DataFrame({"uniprot_ids": int_proteins, "ev_association_score": int_fractionation_scores})

tm_with_fract_df = tm_with_fract_df.merge(hgnc_uniprot_mapping_data, on="uniprot_ids")
int_with_fract_df = int_with_fract_df.merge(hgnc_uniprot_mapping_data, on="uniprot_ids")

In [11]:
# generate the data for figure 1d and 1e

associated_proteins = []
associated_samples = []
associated_median_npx = []

fig_1d_associated = ["P08758", "P07355", "P09525", "Q9NP79"]
fig_1d_contaminant = ["P02751", "P00734", "P36955", "P01024"]

tidy_data_sec = tidy_data[
    (tidy_data.index.get_level_values("Health") == "Healthy")
    & (tidy_data.index.get_level_values("Sample").str.contains("SEC"))
]

for column in fig_1d_associated:
    df = tidy_data_sec[column]

    for sample in list(df.index.get_level_values("Sample").unique()):
        associated_proteins.append(column)
        associated_samples.append(sample)
        sample_df = df[df.index.get_level_values("Sample") == sample]
        associated_median_npx.append(sample_df.median())

associated_proteins_df = pd.DataFrame({"protein": associated_proteins, "sample": associated_samples, "median_npx": associated_median_npx})

contaminant_proteins = []
contaminant_samples = []
contaminant_median_npx = []

for column in fig_1d_contaminant:
    df = tidy_data_sec[column]

    for sample in list(df.index.get_level_values("Sample").unique()):
        contaminant_proteins.append(column)
        contaminant_samples.append(sample)
        sample_df = df[df.index.get_level_values("Sample") == sample]
        contaminant_median_npx.append(sample_df.median())

contaminant_proteins_df = pd.DataFrame({"protein": contaminant_proteins, "sample": contaminant_samples, "median_npx": contaminant_median_npx})

associated_proteins = associated_proteins_df.pivot(columns = "protein", index = "sample")
associated_proteins.to_excel("associated_proteins.xlsx")

contaminant_proteins = contaminant_proteins_df.pivot(columns = "protein", index = "sample")
contaminant_proteins.to_excel("contaminant_proteins.xlsx")

In [12]:
# identify targets found inside microglia cells that may be EV associated using the raw data file

identify_targets(
    assay_list_path = assay_list_path,
    uniprot_fasta_database = uniprot_fasta_database, 
    brain_rna_seq_raw_path = brain_rna_seq_raw_path,
    region = "internal", 
    cell_type = "microglia", 
    specificity_metric = "tau", 
    specificity_cutoff = 0.75,
    high_fractions = ['9', '10'], 
    low_fractions = ['7', '11', '12', '13'], 
    sample_health = 'healthy', 
    mean_median_individual = 'individual_median',
    raw_olink_data_file = raw_data,
    plate_layout_dataframe = plate_layout_path,
    output_directory = output_directory
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_data.loc[:, "Linear NPX"] = unique_data["PCNormalizedNPX"].map(lambda x: 2**x)


{'A2A288',
 'P14317',
 'P19838',
 'P57796',
 'Q12802',
 'Q6P0N0',
 'Q6P589',
 'Q7Z6K4',
 'Q9NP95',
 'Q9Y572'}

In [13]:
# identify targets found inside microglia cells that may be EV associated using the tidy dataframe

identify_targets(
    assay_list_path = assay_list_path,
    uniprot_fasta_database = uniprot_fasta_database, 
    brain_rna_seq_raw_path = brain_rna_seq_raw_path,
    region = "internal", 
    cell_type = "microglia", 
    specificity_metric = "tau", 
    specificity_cutoff = 0.75,
    high_fractions = ['9', '10'], 
    low_fractions = ['7', '11', '12', '13'], 
    sample_health = 'healthy', 
    mean_median_individual = 'individual_median',
    plate_layout_dataframe = plate_layout_path,
    tidy_dataframe = tidy_data,
    output_directory = output_directory
    )

{'A2A288',
 'P14317',
 'P19838',
 'P57796',
 'Q12802',
 'Q6P0N0',
 'Q6P589',
 'Q7Z6K4',
 'Q9NP95',
 'Q9Y572'}

In [14]:
# make a box-and-whisker plot for a target of interest

plot_protein_fractionation(tidy_data, "Q8WUM4")

KeyError: 'Q8WUM4'

In [None]:
# Find possible endothelial-associated proteins and graph the EV association score against the tau specificity score

internal_endothelial = identify_targets(
    assay_list_path = assay_list_path,
    uniprot_fasta_database = uniprot_fasta_database, 
    brain_rna_seq_raw_path = brain_rna_seq_raw_path,
    region = "internal", 
    cell_type = "endothelial", 
    specificity_metric = "tau", 
    specificity_cutoff = 0.75,
    high_fractions = ['9', '10'], 
    low_fractions = ['7', '11', '12', '13'], 
    sample_health = 'healthy', 
    mean_median_individual = 'individual_median',
    plate_layout_dataframe = plate_layout_path,
    tidy_dataframe = tidy_data,
    output_directory = output_directory
    )

transmembrane_endothelial = identify_targets(
    assay_list_path = assay_list_path,
    uniprot_fasta_database = uniprot_fasta_database, 
    brain_rna_seq_raw_path = brain_rna_seq_raw_path,
    region = "TMhelix", 
    cell_type = "endothelial", 
    specificity_metric = "tau", 
    specificity_cutoff = 0.75,
    high_fractions = ['9', '10'], 
    low_fractions = ['7', '11', '12', '13'], 
    sample_health = 'healthy', 
    mean_median_individual = 'individual_median',
    plate_layout_dataframe = plate_layout_path,
    tidy_dataframe = tidy_data,
    output_directory = output_directory
    )

hgnc_ids = (
        "https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt"
    )

hgnc_uniprot_mapping_data = pd.read_csv(
    (StringIO(requests.get(hgnc_ids).text)),
    sep="\t",
    usecols=["symbol", "uniprot_ids"],
)

ev_association_scores = ev_association_score_df(tidy_data, high_fractions = ['9', '10'], low_fractions = ['7', '11', '12', '13'])
internal_endothelial_ratios_df = ev_association_scores[ev_association_scores["ht_assay"].isin(list(internal_endothelial))]
transmembrane_endothelial_ratios_df = ev_association_scores[ev_association_scores["ht_assay"].isin(list(transmembrane_endothelial))]

brain_rna_seq_data = map_hgnc_ids(brain_rna_seq_raw_path)

expression_df = create_enrichment_dataframe(brain_rna_seq_data)
tau_scores = expression_df.apply(lambda row: calculate_enrichment(row, "tau"), axis = 1)

internal_endothelial_tau = tau_scores[tau_scores.index.isin(internal_endothelial)]
transmembrane_endothelial_tau = tau_scores[tau_scores.index.isin(transmembrane_endothelial)]

internal_endothelial_df = tidy_data.loc[:, list(internal_endothelial)]
transmembrane_endothelial_df = tidy_data.loc[:, list(transmembrane_endothelial)]

internal_endothelial_tau_df = internal_endothelial_tau.reset_index()
internal_endothelial_tau_df.columns = ['uniprot_ids', 'values']
internal_endothelial_tau_df['localization'] = 'internal'

transmembrane_endothelial_tau_df = transmembrane_endothelial_tau.reset_index()
transmembrane_endothelial_tau_df.columns = ['uniprot_ids', 'values']
transmembrane_endothelial_tau_df['localization'] = 'transmembrane'

internal_endothelial_df = pd.merge(internal_endothelial_ratios_df, internal_endothelial_tau_df, left_on = "ht_assay", right_on = "uniprot_ids")
transmembrane_endothelial_df = pd.merge(transmembrane_endothelial_ratios_df, transmembrane_endothelial_tau_df, left_on = "ht_assay", right_on = "uniprot_ids")

endothelial_df = pd.concat([internal_endothelial_df, transmembrane_endothelial_df])
endothelial_df = pd.merge(endothelial_df, hgnc_uniprot_mapping_data, on = "uniprot_ids")

endothelial_df.to_csv("endothelial_tau_ev_association.csv")


# plt.figure(figsize=(10, 6))

# for loc, group in endothelial_df.groupby('localization'):
#     plt.scatter(group['ht_ratio'], group['values'], label=loc, c='blue' if loc == 'internal' else 'red')

# for i in range(len(endothelial_df)):
#     plt.text(endothelial_df['ht_ratio'][i], endothelial_df['values'][i], endothelial_df['symbol'][i])

# plt.xlabel('EV Association Score')
# plt.ylabel('Tau Score')
# plt.title('Endothelial Cells')
# plt.legend()

# plt.show()

In [None]:
# create a dataframe of tau and ev asssociation scores for internal and transmembrane cell-type specific targets that meet fractionation and cell-type specificity criteria
def cell_type_ev_association_tau_scores(
    tidy_dataframe,
    assay_list_path,
    uniprot_fasta_database,
    brain_rna_seq_raw_path,
    cell_type,
    specificity_metric,
    specificity_cutoff,
    high_fractions,
    low_fractions,
    sample_health,
    mean_median_individual="median",
    output_directory="ht_output",
):
    internal = identify_targets(
        tidy_dataframe = tidy_dataframe,
        assay_list_path = assay_list_path,
        uniprot_fasta_database = uniprot_fasta_database, 
        brain_rna_seq_raw_path = brain_rna_seq_raw_path,
        region = "internal", 
        cell_type = cell_type, 
        specificity_metric = specificity_metric, 
        specificity_cutoff = specificity_cutoff,
        high_fractions = high_fractions, 
        low_fractions = low_fractions, 
        sample_health = sample_health, 
        mean_median_individual = mean_median_individual,
        output_directory = output_directory,
        )
    transmembrane = identify_targets(
        assay_list_path = assay_list_path,
        uniprot_fasta_database = uniprot_fasta_database, 
        brain_rna_seq_raw_path = brain_rna_seq_raw_path,
        region = "TMhelix", 
        cell_type = cell_type, 
        specificity_metric = specificity_metric, 
        specificity_cutoff = specificity_cutoff,
        high_fractions = high_fractions, 
        low_fractions = low_fractions, 
        sample_health = sample_health, 
        mean_median_individual = mean_median_individual,
        tidy_dataframe = tidy_dataframe,
        output_directory = output_directory
        )
    
    # Create dataframe of BrainRNA-Seq data
    hgnc_ids = (
        "https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt"
    )
    hgnc_uniprot_mapping_data = pd.read_csv(
        (StringIO(requests.get(hgnc_ids).text)),
        sep="\t",
        usecols=["symbol", "uniprot_ids"],
    )
    brain_rna_seq_data = map_hgnc_ids(brain_rna_seq_raw_path)
    expression_df = create_enrichment_dataframe(brain_rna_seq_data)

    # Find the tau scores for the BrainRNA-Seq data row by row/protein by protein
    tau_scores = expression_df.apply(lambda row: calculate_enrichment(row, specificity_metric), axis = 1)

    # Filter the dataframe of tau scores to include only identified internal and transmembrane targets
    internal_tau = tau_scores[tau_scores.index.isin(internal)]
    internal_tau_df = internal_tau.reset_index()
    internal_tau_df.columns = ['uniprot_ids', 'values']
    internal_tau_df['localization'] = 'internal'

    transmembrane_tau = tau_scores[tau_scores.index.isin(transmembrane)]
    transmembrane_tau_df = transmembrane_tau.reset_index()
    transmembrane_tau_df.columns = ['uniprot_ids', 'values']
    transmembrane_tau_df['localization'] = 'transmembrane'
    
    # Create a dataframe of EV association scores
    ev_association_scores = ev_association_score_df(tidy_data, high_fractions = high_fractions, low_fractions = low_fractions)

    # Filter the dataframe of EV association scores to include only identified internal and transmembrane targets
    internal_ratios_df = ev_association_scores[ev_association_scores["ht_assay"].isin(list(internal))]
    transmembrane_ratios_df = ev_association_scores[ev_association_scores["ht_assay"].isin(list(transmembrane))]

    internal_df = pd.merge(internal_ratios_df, internal_tau_df, left_on = "ht_assay", right_on = "uniprot_ids")
    transmembrane_df = pd.merge(transmembrane_ratios_df, transmembrane_tau_df, left_on = "ht_assay", right_on = "uniprot_ids")

    df = pd.concat([internal_df, transmembrane_df])
    df = pd.merge(df, hgnc_uniprot_mapping_data, on = "uniprot_ids")
    
    return df

In [None]:
astrocyte_ev_association_tau_scores = cell_type_ev_association_tau_scores(
    tidy_dataframe=tidy_data,
    assay_list_path=assay_list_path,
    uniprot_fasta_database=uniprot_fasta_database,
    brain_rna_seq_raw_path=brain_rna_seq_raw_path,
    cell_type="astrocyte",
    specificity_metric='tau',
    specificity_cutoff = 0.75,
    high_fractions = ['9', '10'], 
    low_fractions = ['7', '11', '12', '13'], 
    sample_health = 'healthy', 
    mean_median_individual = 'individual_median',
    output_directory="outputs\\ht_output",
)

endothelial_ev_association_tau_scores = cell_type_ev_association_tau_scores(
    tidy_dataframe=tidy_data,
    assay_list_path=assay_list_path,
    uniprot_fasta_database=uniprot_fasta_database,
    brain_rna_seq_raw_path=brain_rna_seq_raw_path,
    cell_type="endothelial",
    specificity_metric='tau',
    specificity_cutoff = 0.75,
    high_fractions = ['9', '10'], 
    low_fractions = ['7', '11', '12', '13'], 
    sample_health = 'healthy', 
    mean_median_individual = 'individual_median',
    output_directory="outputs\\ht_output",
)

microglia_ev_association_tau_scores = cell_type_ev_association_tau_scores(
    tidy_dataframe=tidy_data,
    assay_list_path=assay_list_path,
    uniprot_fasta_database=uniprot_fasta_database,
    brain_rna_seq_raw_path=brain_rna_seq_raw_path,
    cell_type="microglia",
    specificity_metric='tau',
    specificity_cutoff = 0.75,
    high_fractions = ['9', '10'], 
    low_fractions = ['7', '11', '12', '13'], 
    sample_health = 'healthy', 
    mean_median_individual = 'individual_median',
    output_directory="outputs\\ht_output",
)

oligodendrocytes_ev_association_tau_scores = cell_type_ev_association_tau_scores(
    tidy_dataframe=tidy_data,
    assay_list_path=assay_list_path,
    uniprot_fasta_database=uniprot_fasta_database,
    brain_rna_seq_raw_path=brain_rna_seq_raw_path,
    cell_type="oligodendrocyte",
    specificity_metric='tau',
    specificity_cutoff = 0.75,
    high_fractions = ['9', '10'], 
    low_fractions = ['7', '11', '12', '13'], 
    sample_health = 'healthy', 
    mean_median_individual = 'individual_median',
    output_directory="outputs\\ht_output",
)

neuron_ev_association_tau_scores = cell_type_ev_association_tau_scores(
    tidy_dataframe=tidy_data,
    assay_list_path=assay_list_path,
    uniprot_fasta_database=uniprot_fasta_database,
    brain_rna_seq_raw_path=brain_rna_seq_raw_path,
    cell_type="neuron",
    specificity_metric='tau',
    specificity_cutoff = 0.75,
    high_fractions = ['9', '10'], 
    low_fractions = ['7', '11', '12', '13'], 
    sample_health = 'healthy', 
    mean_median_individual = 'individual_median',
    output_directory="outputs\\ht_output",
)

In [None]:
astrocyte_ev_association_tau_scores["cell_type"] = "astrocyte"
endothelial_ev_association_tau_scores["cell_type"] = 'endothelial'
microglia_ev_association_tau_scores['cell_type'] = 'microglia'
oligodendrocytes_ev_association_tau_scores['cell_type'] = 'oligodendrocyte'
neuron_ev_association_tau_scores['cell_type'] = 'neuron'

ev_association_tau_scores = pd.concat([astrocyte_ev_association_tau_scores, endothelial_ev_association_tau_scores])
ev_association_tau_scores = pd.concat([ev_association_tau_scores, microglia_ev_association_tau_scores])
ev_association_tau_scores = pd.concat([ev_association_tau_scores, oligodendrocytes_ev_association_tau_scores])
ev_association_tau_scores = pd.concat([ev_association_tau_scores, neuron_ev_association_tau_scores])