In [None]:
import os
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from upsetplot import UpSet, from_indicators, from_memberships

# Project root: every relative path used in this notebook is resolved against it.
# The default is the original cluster location; set IBD_NULISA_PROJECT_DIR to run
# the notebook from a different checkout without editing this cell.
project_dir = os.environ.get(
    'IBD_NULISA_PROJECT_DIR',
    '/lustre/home/reynaj/Projects/20241011.Byrd_Lab.IBD_NuLisa',
)
os.chdir(project_dir)

# set data dirs (relative to the project root)
limma_dir = 'results/aggregated/limma/'
rf_dir = 'results/aggregated/rf_model/'

# set the data levels/order
da_levels = ['N/A (Non-IBD Control)', 'Quiescent', 'Mild', 'Moderate']

## Load the DEG results for disease activity

In [None]:
# control-versus-quiescent limma table
fn = os.path.join(
    limma_dir,
    'limma.all_data.disease_activity.control_versus_quiescent.tsv',
)
# tag every row with its comparison so the rows stay identifiable once stacked
qu_df = pd.read_table(fn).assign(group='quiescent')
# keep rows whose 'P.Value' is below 0.05
# NOTE(review): presumably limma's unadjusted p-value -- confirm this is intended
top_qu_df = qu_df[qu_df['P.Value'].lt(0.05)]

In [None]:
# control-versus-mild limma table
fn = os.path.join(
    limma_dir,
    'limma.all_data.disease_activity.control_versus_mild.tsv',
)
# tag every row with its comparison so the rows stay identifiable once stacked
mild_df = pd.read_table(fn).assign(group='mild')
# keep rows whose 'P.Value' is below 0.05
# NOTE(review): presumably limma's unadjusted p-value -- confirm this is intended
top_mild_df = mild_df[mild_df['P.Value'].lt(0.05)]

In [None]:
# control-versus-moderate limma table
fn = os.path.join(
    limma_dir,
    'limma.all_data.disease_activity.control_versus_moderate.tsv',
)
# tag every row with its comparison so the rows stay identifiable once stacked
mod_df = pd.read_table(fn).assign(group='moderate')
# keep rows whose 'P.Value' is below 0.05
# NOTE(review): presumably limma's unadjusted p-value -- confirm this is intended
top_mod_df = mod_df[mod_df['P.Value'].lt(0.05)]

## Load the importance scores from the RF model for disease activity

In [None]:
# random-forest score table; the first file column becomes the row index
# (presumably the protein names -- verify against the file)
fn = os.path.join(rf_dir, 'rf_model.disease_activity.scores.tsv')
model_imp_df = pd.read_table(fn, index_col=0).assign(group='rf_model')

In [None]:
# number of highest-scoring rows of the RF table to keep
topn = 30
# rank by descending 'score' and slice off the first `topn` rows
top_model_imp_df = (
    model_imp_df
    .sort_values(by='score', ascending=False)
    .iloc[:topn]
)

## Compare the overlaps between all groups (Upset Plot)

In [None]:
# stack the three filtered limma tables and the top RF rows into one long table;
# the 'group' column records which table each row came from
combined_df = pd.concat(
    objs=[top_qu_df, top_mild_df, top_mod_df, top_model_imp_df],
    axis=0,
)

In [None]:
# preview the first rows of the stacked table
combined_df.head()

In [None]:
# generate the data for the upset plot
# one boolean indicator column per group, in the order used for the index levels
indicator_cols = ['quiescent', 'mild', 'moderate', 'rf_model']

membership_df = (
    combined_df
    # expose the row labels as a 'protein' column whatever the index was called
    # (assumes the row labels of combined_df are the protein names -- TODO confirm)
    .rename_axis('protein')
    .reset_index()
    .loc[:, ['protein', 'group']]
    .assign(indicator=1)
    # wide layout: one row per protein, one column per group, 1 where present
    .pivot(index='protein', columns='group', values='indicator')
    # a group that contributed no rows has no column after the pivot; add it
    # back (all missing -> False) so from_indicators finds every indicator
    .reindex(columns=indicator_cols)
    .fillna(0)
    .astype(bool)
    .reset_index()
)
upset_df = from_indicators(indicators=indicator_cols, data=membership_df)

In [None]:
# preview the frame returned by from_indicators
upset_df.head()

In [None]:
# Create UpSet plot of the group memberships; show_counts=True is passed so the
# plot is annotated with counts.
upset = UpSet(upset_df, show_counts=True)
# trailing ';' suppresses the textual repr of the value returned by plot()
upset.plot();

In [None]:
# rows whose index is True on all four levels
# (levels presumably ordered quiescent, mild, moderate, rf_model -- the order
# passed to from_indicators; confirm)
upset_df.loc[(True, True, True, True)]

## Plot protein levels for interesting proteins across the RF model and all other groups

In [None]:
# Load the protein levels data
fn = "results/aggregated/comp_data/protein_levels.npq.tsv"
protein_data = pd.read_csv(fn, sep="\t")
# transpose so that the file's column headers become the row labels
protein_data = protein_data.T
# cast the row labels to int so they can be matched to the clinical index below
protein_data.index = protein_data.index.astype(int)

# Load the clinical data
fn = 'results/aggregated/comp_data/clinical_data.tsv'
clinical_data = pd.read_csv(fn, sep="\t")
clinical_data.index = clinical_data.matched_subject_id.astype(int)

# remove alamar samples (matched_subject_id == 999)
# The mask used to be `~ids != 999`. Unary `~` binds tighter than `!=`, so it
# evaluated `(~ids) != 999` -- a bitwise NOT on the integer ids -- and rows with
# id 999 were never dropped. Compare the ids directly instead.
keep = (clinical_data['matched_subject_id'].astype(int) != 999).tolist()
clinical_data = clinical_data.loc[keep, :]
# `keep` is a plain list, so it is applied by position here.
# NOTE(review): this assumes protein_data rows are in the same order as the
# clinical_data rows -- confirm against the input files.
protein_data = protein_data.loc[keep, :]

In [None]:
# append the clinical 'disease_activity' column to the protein table;
# rows are aligned on the index of the two frames
comp_protein_data = pd.concat(
    objs=[protein_data, clinical_data.loc[:, ['disease_activity']]],
    axis='columns',
)

In [None]:
# (rows, columns) of the combined protein + disease_activity table
comp_protein_data.shape

### Completely shared protein levels

In [None]:
# proteins that are members of all four sets; the index levels are assumed to
# follow the indicator order given to from_indicators
# (quiescent, mild, moderate, rf_model).
# A boolean mask is used instead of a direct .loc lookup of the tuple so that an
# empty intersection gives an empty list rather than raising KeyError.
completely_shared = upset_df.loc[
    upset_df.index.isin([(True, True, True, True)]), 'protein'
].tolist()

In [None]:
# show the list of proteins selected above
completely_shared

In [None]:
def da_boxplot(data, protein, order=None):
    """Draw a boxplot of one protein's levels grouped by disease activity.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a 'disease_activity' column and a column named `protein`.
    protein : str
        Name of the column plotted on the y axis; also used in the title.
    order : list of str, optional
        Order of the 'disease_activity' categories on the x axis, forwarded to
        seaborn (e.g. ``da_levels``). The default ``None`` leaves the ordering
        to seaborn, exactly as before this parameter existed.
        NOTE(review): the labels must match the values stored in
        'disease_activity'; verify before passing ``da_levels``.

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)
        The newly created figure and the axes holding the boxplot.
    """
    fig, ax = plt.subplots()
    sns.boxplot(data=data, x='disease_activity', y=protein, order=order, ax=ax)
    ax.set_title('{} levels across disease activity'.format(protein))
    return fig, ax

In [None]:
# display the combined table (pandas truncates the rendering of large frames)
comp_protein_data

In [None]:
# IL36B levels per disease-activity category
protein = 'IL36B'
fig, ax = da_boxplot(data=comp_protein_data, protein=protein)

In [None]:
# CD80 levels per disease-activity category
protein = 'CD80'
fig, ax = da_boxplot(data=comp_protein_data, protein=protein)

### Shared between RF and Mild

In [None]:
# membership tuple, assumed to follow the indicator order given to
# from_indicators: (quiescent, mild, moderate, rf_model) -> mild and rf_model only.
# A boolean mask is used instead of a direct .loc lookup of the tuple so that an
# empty intersection gives an empty list (and no plots) rather than a KeyError.
shared = upset_df.loc[
    upset_df.index.isin([(False, True, False, True)]), 'protein'
].tolist()
# one boxplot figure per selected protein
for protein in shared:
    fig, ax = da_boxplot(comp_protein_data, protein)

### Shared between RF, Quiescent and Mild 

In [None]:
# membership tuple, assumed to follow the indicator order given to
# from_indicators: (quiescent, mild, moderate, rf_model) -> quiescent, mild and
# rf_model, but not moderate.
# A boolean mask is used instead of a direct .loc lookup of the tuple so that an
# empty intersection gives an empty list (and no plots) rather than a KeyError.
shared = upset_df.loc[
    upset_df.index.isin([(True, True, False, True)]), 'protein'
].tolist()
# one boxplot figure per selected protein
for protein in shared:
    fig, ax = da_boxplot(comp_protein_data, protein)

### Shared between RF, Mild and Moderate

In [None]:
# membership tuple, assumed to follow the indicator order given to
# from_indicators: (quiescent, mild, moderate, rf_model) -> mild, moderate and
# rf_model, but not quiescent.
# A boolean mask is used instead of a direct .loc lookup of the tuple so that an
# empty intersection gives an empty list (and no plots) rather than a KeyError.
shared = upset_df.loc[
    upset_df.index.isin([(False, True, True, True)]), 'protein'
].tolist()
# one boxplot figure per selected protein
for protein in shared:
    fig, ax = da_boxplot(comp_protein_data, protein)

## Proteins important only in the RF model

In [None]:
# membership tuple, assumed to follow the indicator order given to
# from_indicators: (quiescent, mild, moderate, rf_model) -> rf_model only.
# A boolean mask is used instead of a direct .loc lookup of the tuple so that an
# empty selection gives an empty list (and no plots) rather than a KeyError.
shared = upset_df.loc[
    upset_df.index.isin([(False, False, False, True)]), 'protein'
].tolist()
# one boxplot figure per selected protein
for protein in shared:
    fig, ax = da_boxplot(comp_protein_data, protein)