In [41]:
import pandas as pd
import numpy as np
import json
from scipy.stats import ttest_ind
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
import matplotlib.colors as colors
import matplotlib.patches as patches
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns

import obone

In [42]:
# Groups to analyze: keys are the y-axis labels used in the figures,
# values are the matching entries of survival[name].
groups = {
    "CTL": "Control",
    "Group0.3": "0.3",
    "Group1": "1",
    "Group3": "3",
    "Group10": "10",
}

# Gene weights are stored as JSON in the ALZ directory. The path gets its own
# name so that `gene_weights_1` only ever refers to the weights dictionary.
GENE_WEIGHTS_1_PATH = "ALZ/alz_gene_weights_1.json"
with open(GENE_WEIGHTS_1_PATH) as file_in:
    # JSON object keys are always strings; convert them back to integer weights.
    gene_weights_1 = {int(k): v for k, v in json.load(file_in).items()}

In [43]:
# Build the GSE object for this accession and pull its survival table.
gse164788 = obone.GSE(accessionID="GSE164788")
survival = gse164788.survival()

# Column holding the group labels: rows with a missing value are labelled
# "Control", then everything is cast to str.
name = "c drug_concentration_um"
survival[name] = survival[name].fillna("Control").astype(str)

Survival GPL: GPL18573


In [44]:
# Read the expression table from the parquet file in the ALZ directory and drop
# the "dge1_O16" column (apparently absent from the survival table — TODO confirm).
expr = pd.read_parquet("ALZ/GSE164788-GPL18573-expr.parquet.gzip").drop(
    columns="dge1_O16"
)

In [45]:
# Build the BoNE object, then rebind expr and survival to the versions it holds
# so the rest of the notebook works with BoNE's formatting.
rodriguez = obone.BoNE(expr, survival)
expr, survival = rodriguez.expr, rodriguez.survival
thr = rodriguez.thr()

In [92]:
# rank function to determine score
def rank(expr, thr, gene_weights):
    """Compute per-sample ranks for every weighted gene group plus a composite score.

    Parameters
    ----------
    expr : pd.DataFrame
        Expression values, indexed by gene identifier with one column per sample.
    thr : pd.DataFrame
        Per-gene thresholds indexed like ``expr``. Only the fourth column
        (``thr.iloc[:, 3]``) is used, as the value subtracted from each gene.
    gene_weights : dict
        Maps a numeric weight to the collection of gene identifiers in that group.

    Returns
    -------
    pd.DataFrame
        Indexed by sample, with one ``"Weight: <w>"`` column per group and a
        ``"Score"`` column holding the weight-weighted sum of those columns.

    Raises
    ------
    ValueError
        If ``gene_weights`` is empty.
    """
    group_ranks = []
    for weight, group in gene_weights.items():
        # Always filter from the full inputs. Reassigning ``expr``/``thr`` here
        # would make every later group search an already-filtered table and
        # (for disjoint groups) silently contribute nothing.
        group_expr = expr[expr.index.isin(group)]
        group_thr = thr[thr.index.isin(group)]

        # Population standard deviation per gene (ddof=0, the np.std default);
        # constant genes get 1 so the division below is well defined.
        sd = group_expr.std(axis=1, ddof=0).replace(0, 1)
        scaled = group_expr.sub(group_thr.iloc[:, 3], axis=0).div(3).div(sd, axis=0)

        group_rank = scaled.sum(axis=0)
        group_rank.name = f"Weight: {weight}"
        group_ranks.append(group_rank)

    if not group_ranks:
        raise ValueError("gene_weights must contain at least one gene group")

    # A single concat keeps the result a DataFrame even with one group.
    ranks = pd.concat(group_ranks, axis=1)
    weights = np.array([float(w) for w in gene_weights])
    ranks["Score"] = np.dot(ranks.to_numpy(), weights)
    ranks.index.name = "Sample"

    return ranks

def score(expr, survival, thr, survival_col, gene_weights):
    """Pair every sample's group label with its composite score.

    Parameters
    ----------
    expr, thr, gene_weights
        Passed straight through to ``rank``.
    survival : pd.DataFrame
        Sample table indexed by sample identifier.
    survival_col : str
        Column of ``survival`` holding the group label of each sample.

    Returns
    -------
    pd.DataFrame
        Indexed by ``"Sample"`` with columns ``"Sample Type"`` and ``"Score"``;
        only samples present in both inputs are kept (inner join).
    """
    # rename() returns a new Series, so the caller's survival table is not
    # modified (assigning to .name would rename the column object itself).
    sample_type = survival[survival_col].rename("Sample Type")

    composite = rank(expr, thr, gene_weights)["Score"]
    df = pd.concat((sample_type, composite), axis=1, join="inner")
    df.index.name = "Sample"
    return df

def plot_data(expr, survival, thr, survival_col, gene_weights, groups, alpha=0.05):
    """Build the per-sample table consumed by the plotting code.

    Parameters
    ----------
    expr, survival, thr, survival_col, gene_weights
        Passed straight through to ``score``.
    groups : dict
        Maps a display name to one sample type or a list of sample types (values
        of ``survival[survival_col]``). The first entry is treated as the control
        group; each entry gets an integer ``Cval`` equal to its position.
    alpha : float, default 0.05
        A group's p-value is recorded only when it is below this threshold.

    Returns
    -------
    pd.DataFrame
        Indexed by ``"Sample"`` and sorted by ``"Cval"``, with columns
        ``"Sample Type"``, ``"Score"``, ``"Cval"``, ``"Annotation"``
        (``"<group>(<n samples>)"``), ``"Color"`` and, for non-control groups,
        ``"ROC AUC"`` and (when significant) ``"Pval"``.
    """
    df = score(expr, survival, thr, survival_col, gene_weights)

    # Map every sample type to the position (cval) of its group, and every
    # cval back to the group's display name.
    all_sample_types = []
    cval_group = {}
    cval_sample_type = {}
    for i, (group_name, sample_types) in enumerate(groups.items()):
        # accept a single sample type as well as a list of them
        if not isinstance(sample_types, list):
            sample_types = [sample_types]
        # compare as strings
        sample_types = [str(sample_type) for sample_type in sample_types]
        all_sample_types.extend(sample_types)
        cval_group[i] = group_name
        for sample_type in sample_types:
            cval_sample_type[sample_type] = i

    # Copy the filtered rows so the column assignments below act on an
    # independent frame rather than on a slice of the score table.
    df = df[df["Sample Type"].isin(all_sample_types)].copy()
    # Every remaining value is a key of the mapping, so map() is an exact lookup.
    df["Cval"] = df["Sample Type"].map(cval_sample_type)

    # Annotation: "<group name>(<number of samples in the group>)"
    df = df.reset_index()
    group_sizes = df.groupby("Cval")["Sample"].transform("count")
    df["Annotation"] = df["Cval"].map(cval_group) + "(" + group_sizes.astype(str) + ")"
    df = df.set_index("Sample")

    # One color per group, taken from the "Paired" colormap.
    cmap = get_cmap("Paired")
    color = {i: cmap(i) for i in range(len(groups))}
    df["Color"] = df["Cval"].map(color)

    # Compare every non-control group with the control group (cval 0).
    is_control = df["Cval"] == 0
    control_scores = df.loc[is_control, "Score"].tolist()
    for val in df.loc[~is_control, "Cval"].unique():
        in_group = df["Cval"] == val
        group_scores = df.loc[in_group, "Score"].tolist()

        # Welch's t-test (unequal variances); keep only significant p-values.
        _, pval = ttest_ind(control_scores, group_scores, equal_var=False)
        if pval < alpha:
            df.loc[in_group, "Pval"] = pval

        # ROC AUC of the score for separating this group from the control.
        pair = df[is_control | in_group]
        df.loc[in_group, "ROC AUC"] = roc_auc_score(pair["Cval"], pair["Score"])

    # sort data by cval for proper coloring
    return df.sort_values("Cval")

In [96]:
# Run each stage once on the GSE164788 data with the first set of gene weights.
test_rank = rank(expr=expr, thr=thr, gene_weights=gene_weights_1)
test_score = score(
    expr=expr,
    survival=survival,
    thr=thr,
    survival_col=name,
    gene_weights=gene_weights_1,
)
test_plot_data = plot_data(
    expr=expr,
    survival=survival,
    thr=thr,
    survival_col=name,
    gene_weights=gene_weights_1,
    groups=groups,
)

In [64]:
from dataclasses import dataclass


@dataclass
class BoNE:
    """Draw score figures (violin or density) for a table of scored samples.

    ``sample_data`` holds one row per sample. The methods read the columns
    ``"Score"``, ``"Cval"``, ``"Color"``, ``"Annotation"`` and ``"ROC AUC"``;
    ``violin`` additionally uses ``"Pval"`` when that column exists.
    """

    # one row per sample; see the class docstring for the columns that are read
    sample_data: pd.DataFrame

    def init(self, survival_col: str, gene_weights: dict, groups: dict) -> None:
        """Derive the plotting state from ``sample_data`` and open a new figure.

        This is a regular method (not ``__init__``); ``violin`` and ``density``
        call it before drawing. ``survival_col``, ``gene_weights`` and ``groups``
        are accepted but not used here.
        """
        # Group id and color of every sample, both in ascending score order so
        # the i-th color belongs to the i-th group id.
        by_score = self.sample_data.sort_values("Score")
        self.cval = list(by_score["Cval"])
        self.cval_colors = list(by_score["Color"])

        # One representative row per group, in order of first appearance.
        group_data = self.sample_data.drop_duplicates("Annotation")
        group_data = group_data.set_index("Annotation")
        self.group_data = group_data
        self.annotations = list(group_data.index)
        self.group_colors = list(group_data["Color"])
        self.res = list(group_data["ROC AUC"].dropna())

        plt.figure(figsize=(10, 5), dpi=100)

    def title_bar(self):
        """Draw the strip of per-sample group colors (score order) and the AUC text.

        Returns the axes holding the strip.
        """
        ax = plt.subplot2grid((4, 1), (0, 0))
        n_samples = len(self.cval)
        extent = [0, n_samples, 0, 5]
        ax.axis(extent)

        # Draw the colors directly as a 1 x n RGBA image. Feeding the group ids
        # through a colormap built from the per-sample color list picks the
        # wrong color whenever the groups have different sizes.
        strip = np.array([[colors.to_rgba(c) for c in self.cval_colors]])
        ax.imshow(
            strip,
            interpolation="nearest",
            extent=extent,
            aspect="auto",
        )
        ax.set(xticks=range(n_samples), xticklabels=[], yticklabels=[])
        ax.tick_params(top=False, left=False, bottom=False, right=False)
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.grid(which="major", color="black", alpha=0.5, linestyle="-", linewidth=0.75)

        res_text = f'AUC: {",".join([str(val) for val in self.res])}'
        ax.text(n_samples, 4, res_text)
        return ax

    def title_bar_top(self, ax):
        """Add a legend row above ``ax``: one colored patch and label per group."""
        divider = make_axes_locatable(ax)
        ax1 = divider.append_axes("top", size="100%", pad="20%", frame_on=False)
        ax1.axison = False
        ax1.axis([0, len(self.cval), 0, 5])
        ax1.grid(False)

        # spread the legend entries evenly over the width of the strip
        spacer = len(self.cval) / len(self.annotations)
        for i, (annotation, group_color) in enumerate(
            zip(self.annotations, self.group_colors)
        ):
            left = i * spacer
            ax1.add_patch(
                patches.Rectangle(
                    (left, 0),
                    1,
                    3,
                    facecolor=group_color,
                    edgecolor="none",
                    alpha=1.0,
                )
            )
            ax1.text(
                left + 1,
                1,
                annotation,
                rotation="horizontal",
                ha="left",
                va="center",
                fontsize=12,
            )

    def violin(self, survival_col: str, gene_weights: dict, groups: dict):
        """Draw the title bars and a violin + swarm plot of Score per group.

        Returns the axes holding the violin plot.
        """
        self.init(survival_col, gene_weights, groups)
        ax = self.title_bar()
        self.title_bar_top(ax)

        ax = plt.subplot2grid((4, 1), (1, 0), rowspan=3)
        sns.set_theme(palette=self.group_colors)

        ax = sns.violinplot(
            x="Score",
            y="Annotation",
            data=self.sample_data,
            inner="quartile",
            linewidth=0.5,
            ax=ax,
        )
        ax = sns.swarmplot(
            x="Score",
            y="Annotation",
            color="blue",
            alpha=0.2,
            ax=ax,
            data=self.sample_data,
        )
        ax.set(ylabel=None)
        ax.xaxis.grid(True, clip_on=False)

        if "Pval" in self.sample_data.columns:
            significant = self.group_data[self.group_data["Pval"].notnull()]
            for annotation in significant.index:
                # Anchor the label to the group's own row (half a row before
                # it) so it stays aligned even when an earlier group has no
                # recorded p-value.
                row = self.annotations.index(annotation)
                ax.text(
                    significant.loc[annotation, "Score"],
                    row - 0.5,
                    f'p={significant.loc[annotation, "Pval"]:.1e}',
                    horizontalalignment="center",
                    size=12,
                    color="0.3",
                )
        return ax

    def density(self, survival_col: str, gene_weights: dict, groups: dict):
        """Draw the title bars and, per group, a density of its samples' positions.

        A sample's position is its rank in ascending score order, i.e. the same
        x coordinate it has in the title bar. Returns the axes holding the
        density plot.
        """
        self.init(survival_col, gene_weights, groups)
        ax = self.title_bar()
        self.title_bar_top(ax)

        ax = plt.subplot2grid((4, 1), (1, 0), rowspan=3)
        # NOTE(review): positions are taken in score order so the curves line up
        # with the title bar; confirm this is the intended x axis.
        ranked = self.sample_data.sort_values("Score").reset_index(drop=True)
        for annotation, group_color in zip(self.annotations, self.group_colors):
            positions = pd.Series(
                ranked.index[ranked["Annotation"] == annotation], name=annotation
            )
            if len(positions) > 1:
                ax = positions.plot.kde(
                    bw_method=1.0, ax=ax, c=group_color, label=annotation
                )
            elif len(positions) == 1:
                # A density needs at least two points; mark a lone sample with
                # a vertical line at its position instead.
                ax.axvline(x=positions.iloc[0], c=group_color, label=annotation)
        return ax


(764, 20)