In [1]:
import os
import sys
# Make two extra directories importable: a per-user site-packages folder and the
# OLINK project's src/ folder (presumably where the project modules imported
# below live -- confirm).
# TODO: these absolute paths only exist on one machine; change them to paths
# relative to the Jupyter notebook (e.g. built with pathlib).
sys.path.extend([
    "C:\\Users\\Wyss User\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages",
    "C:\\Users\\Wyss User\\Documents\\EVs\\OLINK\\src",
])

from config import hgnc_ids, high_fractions, low_fractions, MISSING_FASTA_SEQUENCES, CELL_TYPES
from brainrnaseq_specificity import map_hgnc_ids, create_enrichment_dataframe, process_hgnc_data, filter_low_tau
from deeptmhmm_localization import identify_localization, parse_gz_file, get_localization_data
from identify_targets import identify_targets, generate_protein_dataframe
from olink_fractionation import analyze_fractionation
from raw_data_preprocessing import clean_up_raw_data, plot_protein_fractionation, ev_association_score_df, find_ratio, calculate_fractionation_scores
from specificity_functions import calculate_enrichment
from gtex_specificity import gtex_specificity

from io import StringIO

import pandas as pd
import requests

# Switch the working directory to the OLINK project folder so that relative
# "data\\..." and "outputs\\..." paths resolve against it.
# TODO: machine-specific absolute path; derive it from the notebook location instead.
os.chdir("C:\\Users\\Wyss User\\Documents\\EVs\\OLINK")

In [2]:
# Input and output locations. Every path is relative to the current working
# directory, which must be the OLINK project folder.
assay_list_path = "data\\231220_ht_panel_assay_list.xlsx"
brain_rna_seq_raw_path = "data\\240411_brain_rna_seq_raw.csv"
output_directory = "outputs\\ht_output"
plate_layout_path = "data\\231204_Walt_Olink_HT_Plate.xlsx"
raw_data = "data\\240214_Walt_Olink_HT_Raw.parquet"
uniprot_fasta_database = "data\\uniprot_fasta_database.gz"

# GTEx gene-level median TPM table. Kept relative like the other inputs: the
# previous value was a raw string with already-doubled backslashes, so the
# literal contained "\\\\" separators and an absolute, machine-specific prefix.
gtex_path = "data\\GTEx_Analysis_v10_RNASeQCv2.4.2_gene_median_tpm.gct.gz"

In [3]:
# Build the tidy dataframe used by every later cell from the raw data file
# and the plate layout file.
tidy_data = clean_up_raw_data(
    raw_data,
    plate_layout_path,
)

In [4]:
# SI Table 4

# Read the Brain RNA-Seq table and join it to the HGNC mapping data.
brain_rna_seq = pd.read_csv(brain_rna_seq_raw_path)
hgnc_uniprot_mapping_data = process_hgnc_data(hgnc_ids)
brain_rna_seq = brain_rna_seq.merge(
    hgnc_uniprot_mapping_data,
    left_on="id",
    right_on="hgnc_id",
    how="inner",
)
# Keep only rows that have a UniProt ID, one row per ID.
brain_rna_seq = brain_rna_seq.dropna(subset=["uniprot_ids"])
brain_rna_seq = brain_rna_seq.drop_duplicates(subset=["uniprot_ids"])
# Move the identifier/annotation columns into the index.
brain_rna_seq = brain_rna_seq.set_index(
    ["uniprot_ids", "symbol", "name", "alias_symbol", "alias_name"]
)
expression_df = create_enrichment_dataframe(brain_rna_seq)

# Proteins with a low tau score
low_tau_score = filter_low_tau(expression_df)

# Proteins with the expected fractionation pattern
fractionation_uniprot_ids = analyze_fractionation(
    tidy_data,
    high_fractions,
    low_fractions,
    sample_health="healthy",
    mean_median_individual="individual_median",
)

# Protein localizations, using the FASTA database plus the manually supplied sequences
fasta_sequences = parse_gz_file(uniprot_fasta_database)
fasta_sequences.update(MISSING_FASTA_SEQUENCES)
assays = pd.read_excel(assay_list_path)
localization_ids = get_localization_data(
    assays,
    fasta_sequences,
    ["TMhelix", "internal", "external"],
    output_directory,
)

# One categorized dataframe per localization, built in the order
# internal -> transmembrane -> external.
int_low_tau, tm_low_tau, ext_low_tau = (
    generate_protein_dataframe(
        low_tau_score,
        fractionation_uniprot_ids,
        localization_ids[region_key],
        region_label,
        tidy_data,
        high_fractions,
        low_fractions,
    )
    for region_key, region_label in (
        ("internal", "internal"),
        ("TMhelix", "transmembrane"),
        ("external", "external"),
    )
)

# Stack the three categories into a single table
low_tau_with_fract_pattern = pd.concat([int_low_tau, tm_low_tau, ext_low_tau])

# TODO: re-name columns


In [8]:
# SI Table 5
#
# NOTE: this cell reuses `assays`, `fractionation_uniprot_ids` and
# `hgnc_uniprot_mapping_data` created by the SI Table 4 cell, so that cell
# must be run first.

# Load and process Brain RNA-Seq data
brain_rna_seq = map_hgnc_ids(hgnc_ids, brain_rna_seq_raw_path)
expression_df = create_enrichment_dataframe(brain_rna_seq)

# Define localization categories and look up the proteins in each one
localization_types = ["TMhelix", "internal", "external"]
localization_ids = {loc: identify_localization(assays, loc, output_directory) for loc in localization_types}

# One dataframe per localization category
localization_results = []

for loc, uniprot_ids in localization_ids.items():
    # Proteins in this category that also show the expected fractionation pattern.
    # Sorted because set iteration order for strings changes between kernel
    # sessions, which would otherwise shuffle the rows of the final table.
    proteins_with_fract = sorted(set(uniprot_ids) & set(fractionation_uniprot_ids))

    # EV association score of each protein (one find_ratio call per protein)
    fractionation_scores = [
        find_ratio(tidy_data[protein], high_fractions, low_fractions)
        for protein in proteins_with_fract
    ]

    df = pd.DataFrame({
        "uniprot_ids": proteins_with_fract,
        "ev_association_score": fractionation_scores,
    })

    # Attach the gene mapping data and tag the rows with their category
    df = df.merge(hgnc_uniprot_mapping_data, on="uniprot_ids")
    df["localization"] = loc

    localization_results.append(df)

# Concatenate results
fract_pattern = pd.concat(localization_results, ignore_index=True)

# Compute tau scores and keep the rows above the cutoff
tau_score_cutoff = 0.75
enrichment_values = expression_df.apply(lambda row: calculate_enrichment(row, "tau"), axis=1)
high_tau_score = enrichment_values[enrichment_values > tau_score_cutoff]

filtered_expression_df = expression_df[expression_df.index.isin(high_tau_score.index)]

# For every high-tau row, record the column holding its largest value
# (used as the cell type) together with the row's index label.
max_col_list = [row.idxmax() for _, row in filtered_expression_df.iterrows()]
uniprots_list = list(filtered_expression_df.index)

cell_type_targets = pd.DataFrame({"cell_type": max_col_list, "uniprot_id": uniprots_list})

si_table_5 = fract_pattern.merge(cell_type_targets, left_on="uniprot_ids", right_on="uniprot_id", how="inner")
# reset_index() turns the index into a column so the scores can be merged on
# "uniprot_ids"; the score column itself comes from an unnamed Series, so it
# is expected to be labelled 0 until the columns are renamed.
high_tau_score = high_tau_score.reset_index()
si_table_5 = si_table_5.merge(high_tau_score, on="uniprot_ids")

# GTEx specificity
gtex_data = gtex_specificity(gtex_path)
si_table_5 = si_table_5.merge(gtex_data, on="ensembl_gene_id")

# TODO: rename SI table columns

  row_x = array / max(array)


In [None]:
# generate the data for figure 1d and 1e

fig_1d_associated = ["P08758", "P07355", "P09525", "Q9NP79"]
fig_1d_contaminant = ["P02751", "P00734", "P36955", "P01024"]

# Rows whose "Health" level is "Healthy" and whose "Sample" level contains "SEC"
tidy_data_sec = tidy_data[
    (tidy_data.index.get_level_values("Health") == "Healthy")
    & (tidy_data.index.get_level_values("Sample").str.contains("SEC"))
]


def _median_npx_per_sample(fraction_data, proteins):
    """Return a long-format frame with the median value of each protein in each sample.

    Parameters
    ----------
    fraction_data : pandas.DataFrame
        Frame with one column per protein and an index level named "Sample".
    proteins : list of str
        Column labels to summarize.

    Returns
    -------
    pandas.DataFrame
        Columns "protein", "sample" and "median_npx", one row per
        (protein, sample) pair, in order of first appearance.
    """
    protein_labels = []
    sample_labels = []
    medians = []

    for protein in proteins:
        protein_values = fraction_data[protein]
        samples = protein_values.index.get_level_values("Sample")

        for sample in list(samples.unique()):
            protein_labels.append(protein)
            sample_labels.append(sample)
            medians.append(protein_values[samples == sample].median())

    return pd.DataFrame({"protein": protein_labels, "sample": sample_labels, "median_npx": medians})


# The same summary is needed for both protein panels, so it lives in one helper
# instead of two copy-pasted loops.
associated_proteins_df = _median_npx_per_sample(tidy_data_sec, fig_1d_associated)
contaminant_proteins_df = _median_npx_per_sample(tidy_data_sec, fig_1d_contaminant)

# Reshape to one row per sample and one column per protein, then export
associated_proteins = associated_proteins_df.pivot(columns="protein", index="sample")
associated_proteins.to_excel("associated_proteins.xlsx")

contaminant_proteins = contaminant_proteins_df.pivot(columns="protein", index="sample")
contaminant_proteins.to_excel("contaminant_proteins.xlsx")

In [None]:
# Identify transmembrane (region "TMhelix") targets for the oligodendrocyte
# cell type that may be EV associated, starting from the raw data file.

identify_targets(
    assay_list_path=assay_list_path,
    uniprot_fasta_database=uniprot_fasta_database,
    brain_rna_seq_raw_path=brain_rna_seq_raw_path,
    region="TMhelix",
    cell_type="oligodendrocyte",
    specificity_metric="tau",
    specificity_cutoff=0.75,
    high_fractions=["9", "10"],
    low_fractions=["7", "11", "12", "13"],
    sample_health="healthy",
    mean_median_individual="individual_median",
    raw_olink_data_file=raw_data,
    plate_layout_dataframe=plate_layout_path,
    output_directory=output_directory,
)

In [None]:
# Identify internal targets for the microglia cell type that may be EV
# associated, reusing the already-built tidy dataframe instead of the raw file.

identify_targets(
    assay_list_path=assay_list_path,
    uniprot_fasta_database=uniprot_fasta_database,
    brain_rna_seq_raw_path=brain_rna_seq_raw_path,
    region="internal",
    cell_type="microglia",
    specificity_metric="tau",
    specificity_cutoff=0.75,
    high_fractions=["9", "10"],
    low_fractions=["7", "11", "12", "13"],
    sample_health="healthy",
    mean_median_individual="individual_median",
    plate_layout_dataframe=plate_layout_path,
    tidy_dataframe=tidy_data,
    output_directory=output_directory,
)

In [None]:
# Box-and-whisker plot of the fractionation profile for one target of interest.
# Change the UniProt ID below to plot a different protein.

target_uniprot_id = "Q9Y251"
plot_protein_fractionation(tidy_data, target_uniprot_id)