# Running GSEA and differential gene expression modules

### what is GSEA?

Gene Set Enrichment Analysis (GSEA) is a computational method that determines whether an a priori defined set of genes shows statistically
significant, concordant differences between two biological states
(e.g. phenotypes).


### Installing GSEApy

In your terminal, run:

`conda install -c bioconda gseapy`

In [None]:
# importing modules
import gseapy as gp
import pandas as pd
import numpy as np

In [None]:
# load the tab-separated fold-change table
# (swap the placeholder string for the path to your own file)
fold_change_data = pd.read_csv(
    "INSERT PATH HERE",
    sep="\t",
)

### formatting into rnk file

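What is an rnk file supposed to look like?

It has one gene per row with a numeric ranking value next to it, ordered by that value. In this case you can rank the genes by fold change, as in the example below: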

Gene  Value \
ATXN1 16.456753 \
UBQLN4	13.989493 \
CALM1	13.745533 \
DLG4	12.796588 \
MRE11A	12.787631 

In [None]:
# peek at the first five rows to see which columns are available
fold_change_data.head(n=5)

In [None]:
# let's subset the dataframe down to columns of interest

# Keep only the rows whose fold change was determined. `.copy()` makes the
# filtered table an independent object, so the column assignments below edit
# it directly instead of writing to a slice of the original table (which
# raises SettingWithCopyWarning and may silently modify a temporary).
fold_change_data = fold_change_data[fold_change_data['foldchange'] != "Not Determined"].copy()
fold_change_data['foldchange'] = pd.to_numeric(fold_change_data['foldchange'])

# make a new column that describes log2foldchange (Parental FPKM relative to Persister FPKM)
# NOTE(review): an FPKM of 0 in either column produces +/-inf (or NaN for 0/0)
# here — confirm whether such genes should be dropped before ranking.
fold_change_data["log2foldchange"] = np.log2(
    fold_change_data["Parental FPKM"] / fold_change_data["Persister FPKM"]
)

# save a version for later plotting; an explicit copy keeps every column
# available no matter what is done to `fold_change_data` afterwards
fold_change_data_plot = fold_change_data.copy()

# can you determine which column names you should specify below?
fold_change_data = fold_change_data[["INSERT COLUMN 1", "INSERT COLUMN 2"]]

In [None]:
# now we need to sort the rnk file

# Highest ranking value first, matching the rnk layout shown in the example
# table earlier in this notebook (the original sorted lowest-first).
fold_change_data = fold_change_data.sort_values(by="INSERT COLUMN 2", ascending=False)

In [None]:
# run GSEA on the ranked gene table built above
pre_res = gp.prerank(
    rnk=fold_change_data,
    gene_sets='KEGG_2016',
    # only gene sets with between 5 and 500 genes are considered
    min_size=5,
    max_size=500,
    # lower this number to speed up testing
    permutation_num=1000,
    seed=6,
    # keep everything in memory; nothing is written to disk
    outdir=None,
    # print progress so you can see what is going on behind the scenes
    verbose=True,
)

In [None]:
# import plotting functions
import matplotlib.pyplot as plt

In [None]:
# keep only the gene sets whose FDR is below 0.05
pre_res.res2d = pre_res.res2d.loc[pre_res.res2d["fdr"].lt(0.05)]

In [None]:
# take the top and bottom 5 gene sets
plot_df = pd.concat([pre_res.res2d.head(n=5), pre_res.res2d.tail(n=5)], axis=0)

# When fewer than 10 gene sets pass the filter, the head and the tail overlap
# and the same gene set (identified by the index) would appear twice.
# Keep only the first occurrence of each so nothing is plotted twice.
plot_df = plot_df[~plot_df.index.duplicated(keep="first")]
plot_df

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Dot plot of the selected gene sets:
#   x = enrichment score, y = gene set name,
#   dot size grows as the FDR shrinks, colour shows the sign of the score.
enrichment_scores = plot_df["es"]
fdr_values = plot_df["fdr"]
gene_sets = list(plot_df.index.values)

# Create a scatter plot
fig, ax = plt.subplots(figsize=(10, 6))

# Set colors for positive and negative enrichment scores
positive_color = 'green'
negative_color = 'red'

# Determine dot size based on FDR values (the +0.01 keeps an FDR of 0 finite)
dot_size = [1/(fdr_value+0.01) for fdr_value in fdr_values]

# Draw the two signs as separate series so each gets its own colour and
# legend entry. (Previously every point was drawn in the negative colour and
# labelled 'Depleted', whatever the sign of its enrichment score.)
for want_negative, color, label in (
    (False, positive_color, 'Enriched'),
    (True, negative_color, 'Depleted'),
):
    rows = [i for i, score in enumerate(enrichment_scores) if (score < 0) == want_negative]
    if not rows:
        # nothing with this sign: skip so the legend has no empty entry
        continue
    ax.scatter(
        [enrichment_scores.iloc[i] for i in rows],
        [gene_sets[i] for i in rows],
        s=[dot_size[i] for i in rows],
        color=color,
        label=label,
        marker='o',
    )

# Add vertical line at x=0
ax.axvline(0, color='black', linestyle='--', linewidth=0.8)

# Set x and y axis labels
ax.set_xlabel('Enrichment Score', fontsize=14)
ax.set_ylabel('Gene Sets', fontsize=14)

# Add y-axis grid lines
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Add title
plt.title('GSEA Dot Plot', fontsize=16)

# Add legend
ax.legend(loc='upper right', fontsize=12)

# Show the plot
plt.tight_layout()
plt.show()


### what are volcano plots?

Volcano plots are a neat way to visualize differences in gene expression between conditions.

In [None]:
# plot volcano plot

In [None]:
# Volcano plot: log2 fold change (x) against -log10(q-value) (y), one dot per gene.
df = fold_change_data_plot

# significance cutoff shared by the masks, the threshold line and the annotations
q_cutoff = 0.05

fold_change = df['log2foldchange']
q_values = df['q_value']
gene_names = df['Gene ID']
# Compute the negative logarithm of q-values (base 10)
log_q_values = -np.log10(q_values)

# Boolean masks, computed once and reused below
is_significant = q_values <= q_cutoff
non_significant = q_values > q_cutoff
significant_up = is_significant & (fold_change > 0)
significant_down = is_significant & (fold_change < 0)

# Create a scatter plot (volcano plot)
plt.figure(figsize=(10, 6))

# Plot non-significant points (gray dots)
plt.scatter(fold_change[non_significant], log_q_values[non_significant], color='gray', alpha=0.6, label='Non-significant')

# Plot significant genes with a positive log2 fold change (red dots)
plt.scatter(fold_change[significant_up], log_q_values[significant_up], color='red', alpha=0.8, label='Upregulated')

# Plot significant genes with a negative log2 fold change (blue dots)
plt.scatter(fold_change[significant_down], log_q_values[significant_down], color='blue', alpha=0.8, label='Downregulated')

# Horizontal line at the significance cutoff, derived from q_cutoff rather
# than a rounded constant so the line and the masks can never disagree
plt.axhline(y=-np.log10(q_cutoff), color='gray', linestyle='--', linewidth=0.8)

# Set axis labels and title (the x values are log2 fold changes, not raw fold changes)
plt.xlabel('log2 Fold Change', fontsize=14)
plt.ylabel('-log10(Q-Value)', fontsize=14)
plt.title('Volcano Plot', fontsize=16)

# Annotate significant samples with gene names
for gene_name, fc, log_q in zip(gene_names[is_significant], fold_change[is_significant], log_q_values[is_significant]):
    plt.annotate(gene_name, xy=(fc, log_q), xytext=(5, 5), textcoords='offset points', fontsize=8, ha='left', va='bottom')

# Add legend
plt.legend()

# Show the plot
plt.tight_layout()
plt.show()

# TCGA experimentation

In [None]:
# use top genes/pathways from analysis within TCGA

# do top persister pathways predict immunogenicity, survival, and ???

In [None]:
# load the tab-separated TCGA composite table
# (swap the placeholder string for the path to your own file)
rna_xcell = pd.read_csv(
    "path to TCGA_composite_file.txt",
    sep="\t",
)

In [None]:
# set up stratification variable

variable = "CAMP"

# Split the samples at the mean of the chosen variable: 1 = at or above the mean, 0 = below.
# NOTE(review): a missing value compares False against the mean, so samples
# without a value for `variable` end up in group 0 — confirm this is intended.
mean = rna_xcell[variable].mean()
rna_xcell["group"] = np.where(rna_xcell[variable] >= mean, 1, 0)

#gene set example
#advanced task!
# `.iloc[0]` picks the first row by position. plot_df is indexed by gene-set
# name, so a bare `[0]` relied on pandas' deprecated integer-as-position
# fallback and fails on versions where that fallback has been removed.
geneset = plot_df["genes"].iloc[0]
geneset

# some kind of linear combination of the genes from a given pathway?

In [None]:
# plot km curves
import matplotlib.pyplot as plt
import matplotlib
import lifelines #conda install -c conda-forge lifelines
from lifelines import CoxPHFitter
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test

# Drop samples with no survival status or no survival time. `.copy()` gives an
# independent table, so recoding OS below neither touches rna_xcell nor relies
# on chained assignment (`df["OS"][mask] = ...`), which raises
# SettingWithCopyWarning and does nothing under pandas copy-on-write.
surv_df_canc = rna_xcell.dropna(subset=["OS", "OS.time"]).copy()

# Recode the textual status values to event indicators: 1.0 = event, 0.0 = censored.
# Values not listed here are left untouched.
# NOTE(review): "Not available" is treated as censored (0.0) — confirm that is intended.
os_codes = {
    "True": 1.0,
    "Y": 1.0,
    "Dead": 1.0,
    "False": 0.0,
    "Not available": 0.0,
    "Alive": 0.0,
    "Died not of Melanoma": 0.0,
}
surv_df_canc["OS"] = pd.to_numeric(
    surv_df_canc["OS"].map(lambda status: os_codes.get(status, status))
)

fig = plt.figure(figsize=(7, 7))
ax1 = plt.subplot(1, 1, 1)

kmf = KaplanMeierFitter()

# Split once and reuse the two cohorts for the curves and the log-rank test
group_1 = surv_df_canc[surv_df_canc['group'] == 1]
group_0 = surv_df_canc[surv_df_canc['group'] == 0]

# One curve per group. Each legend label now names the group actually being
# drawn (the labels were previously swapped: group 1 was labelled "0" and
# group 0 was labelled "1").
for group_label, cohort in (("1", group_1), ("0", group_0)):
    kmf.fit(cohort['OS.time'], cohort['OS'], label=group_label + ", N=" + str(cohort.shape[0]))
    kmf.plot(ci_show=False, ax=ax1)

# Log-rank test comparing the two groups
results = logrank_test(
    group_0['OS.time'],
    group_1['OS.time'],
    event_observed_A=group_0['OS'],
    event_observed_B=group_1['OS'],
    alpha=.95,
)

plt.ylabel('% survival')
plt.xlabel('OS time (days)')
plt.title("0 vs 1 " + str(np.round(results.p_value, 4)))
plt.legend(frameon=False)


In [None]:
# immunoscore differences
import seaborn as sns # conda install seaborn
from scipy import stats # conda install scipy

surv_df_canc = rna_xcell
f = plt.figure(figsize=(5, 5))

# Fix the box order explicitly so the tick labels built below always line up
# with the boxes they describe.
group_order = [0, 1]
ax = sns.boxplot(x="group", y="ImmuneScore", data=surv_df_canc, order=group_order)

# Tick label = group plus the number of samples in that group. The counts are
# taken from the integer group values themselves; the previous version
# compared the integer column with the tick-label *strings*, which never
# match, so every count came out as 0.
labels = [
    str(group) + " (" + str(int((surv_df_canc["group"] == group).sum())) + ")"
    for group in group_order
]
ax.set_xticks(range(len(group_order)))
ax.set_xticklabels(labels)

# Samples of each group that actually have an ImmuneScore. Missing scores are
# dropped rather than replaced by 0, which would inject artificial zeros into
# the comparison.
cat0 = surv_df_canc[surv_df_canc["group"] == 0].dropna(subset=["ImmuneScore"])
cat3 = surv_df_canc[surv_df_canc["group"] == 1].dropna(subset=["ImmuneScore"])

# Two-sample t-test on ImmuneScore between the groups. The previous version
# tested the constant "group" column against itself, which says nothing about
# the immune scores.
p_value = stats.ttest_ind(cat0["ImmuneScore"], cat3["ImmuneScore"])[1]

plt.title("0 vs 1 " + "{:.3g}".format(p_value))

