# Pipeline for analysis

All the data used in the original run can be found on the GitHub page.

## Preprocessing raw data

Import the appropriate modules.

In [None]:
from DE import networkAnalysis as na

`clusterDirectory` needs to point to the directory in which all the clusters can be found.
The pipeline then loops through all of the data files, gathers the sample IDs from every cluster, and creates individual .csv files for analysis in R.

In [None]:
# Location of the tab-separated expression table.
csvpath = "folderPath/gex.tsv"

# Create the analysis object and hand it the table via readCsv.
obj = na()
obj.readCsv(csvpath, sep='\t')
# obj.renameCol('Unnamed: 0', 'gene_id')

In [None]:
# Run the preprocessing step on the object created in the previous cell
# (called without arguments).
obj.preProcess()

In [None]:
# Bottom share of the data to filter away; the value is passed straight to
# preFilter.
bottom = 0.75
obj.preFilter(bottom)

In [None]:
# Write the preprocessed and filtered table. The output MUST be a .txt file
# with tab-separated values; the row index is left out.
obj.filterDf.to_csv('folderPath/yourDesiredFileName.txt', sep='\t', index=False)

## Visualizing SRIQ output

Creates the object from the SRIQ output

In [None]:
from DE import networkAnalysis as na



In [None]:
from DE import networkAnalysis as na
import pandas as pd
import seaborn as sns

# Inputs: the expression table, the folder holding the SRIQ cluster solution,
# and the name of the gene column.
csvpath = 'folderPath/expressionData.txt'
clusterpath = 'folderPath/SRIQClusterSolutionFolder'
colName = 'Gene'

# Create the analysis object and read the SRIQ output into it.
test = na()
test.readSRIQ(csvpath, clusterpath, columnname=colName)

## Visualization of variation

In [None]:
# Presumably draws a silhouette plot for the loaded cluster solution.
# NOTE(review): the meaning of U_S is not visible in this notebook - confirm in DE.
test.SilhouttePlot(U_S = False)

Plots the metagenes in a boxplot

In [None]:
# Box plots of the metagenes (per the notebook text); col_wrap=2 is passed
# through as given.
test.metaGenes(col_wrap = 2)

Calculates the centroids for each sample. A distribution plot is shown, and the centroids are also used as sample labels when the gene expression is plotted.

In [None]:
# Reference centroid table handed to calcCentroids.
# The path root is spelled '/Users', matching the other absolute paths in this
# notebook, so it also resolves on case-sensitive file systems.
cp = '/Users/jacobkarlstrom/projekt/SRIQ/notebook/data/extraData/wilkerson.2012.LAD.predictor.centroids.csv'
test.calcCentroids(cp)

## SAM analysis

In [None]:
# Inputs for the SAM run: the properties file, the expression table and the
# `dist` value.
props = '/Users/jacobkarlstrom/projekt/SRIQ/software/VRLA/resources/test.properties'
expressionData = '/Users/jacobkarlstrom/projekt/SRIQ/notebook/data/expressionData/uppGex(15k).txt'
dist = 0.55

test.samAnalysis(
    properties=props,
    expressionData=expressionData,
    dist=dist,
    spiral=True,
)

### Visualizing DEG

In [None]:
# SAM result file to visualise; an alternative result set is kept commented out.
resultsPath = '/Users/jacobkarlstrom/projekt/SRIQ/notebook/data/expressionData/LUAD_ens_q_10000itr_1200var_10r/10000/QC_Spiral(false)/Results_log_0.61_6/LUAD_ens_q_Data_in_6_ClusterOrder_ABS_Unique.txt'
# resultsPath = '/Users/jacobkarlstrom/projekt/SRIQ/notebook/data/expressionData/Uppsala_10000itr_1200var_10r/10000/QC_Spiral(true)/Results_log_0.55_6/Uppsala_Data_in_6_ClusterOrder_ABS_Unique.txt'

test.plotSamResults(resultsPath)

# Shift the column labels one step to the left: remember every label except
# the first, drop the last column, then apply the remembered labels.
newC = test.results.columns.tolist()[1:]
test.results = test.results.iloc[:, :-1]
test.results.columns = newC

## T-test or Mann-Whitney U test

Filter variance, both bottom and top can be filtered.

In [None]:
# Cut-offs handed to filterVariantGenes as `bottom` and `top`.
# NOTE(review): presumably variance quantiles - confirm in DE.
bottom = 0.7
top = 0.95
test.filterVariantGenes(top = top, bottom = bottom)

Performs desired test for differential gene expression analysis.

In [None]:
# Differential gene expression test; 'mannwhitneyu' presumably selects the
# Mann-Whitney U test named in the section title.
test.diffGeneAnalysis(test = 'mannwhitneyu')

Filter the significant genes based on the desired filtering type.

In [None]:
# Filter the significant genes with filtering type 'log2fold' and threshold 2.
# NOTE(review): csvpath is relative, so it depends on the notebook's working directory.
test.filterEnrichedGenes(filteringType = 'log2fold', threshold = 2, csvpath = 'data/expressionData/fpkm.csv')

# Add features from data folder

To run this module you need clinical data with follow-up information in the clinicalData folder. If you are running on TCGA data, it can be downloaded from https://gdc.cancer.gov/about-data/publications/pancanatlas

In [None]:
import pandas as pd

# When col_colors is a DataFrame, reduce it to its 'Clusters' column.
if isinstance(test.col_colors, pd.DataFrame):
    test.col_colors = test.col_colors['Clusters']
test.calcCentroids()

# Clinical column names for the mutation / translocation results.
e, k, a = 'egfr_mutation_result', 'kras_mutation_result', 'eml4_alk_translocation_result'

test.addFeature(
    feature='tobacco_smoking_history',
    attr='Lifelong Non-smoker',
    censor='[Not Available]',
    title='Never-smokers',
)
for sex, label in (('MALE', 'Males'), ('FEMALE', 'Females')):
    test.addFeature(feature='gender', attr=sex, title=label)

# Only e and k are added here; a is assigned but not used in this cell.
for mutation in (e, k):
    test.addFeature(feature=mutation, attr='NO', censor='[Not Available]', title=mutation)

Plots a clustermap of the result

In [None]:
# Clustermap of the result (per the notebook text), called with vmin=-1,
# vmax=1 and row clustering switched on.
test.plotEnrichedGenes(vmin = -1, vmax = 1, row_cluster= True)

In [None]:
# Heat map of test.samDf with the colour scale fixed to [-1, 1]. Clustering is
# switched off on both axes, so rows and columns keep their given order.
g = sns.clustermap(
    test.samDf,
    vmin=-1,
    vmax=1,
    cmap='vlag',
    col_colors=test.col_colors,
    col_cluster=False,
    row_cluster=False,
)

# Remove the tick marks from both heat-map axes (x first, then y).
for setTicks in (g.ax_heatmap.set_xticks, g.ax_heatmap.set_yticks):
    setTicks([])

Kaplan-Meier plot

In [None]:
# Kaplan-Meier plot (per the notebook text); called without arguments.
# NOTE(review): presumably needs the clinical follow-up data mentioned earlier - confirm.
test.kaplanMeier()

## Enrichment analysis

Converts the enrichment lists to gene symbols if they contain Ensembl IDs

In [None]:
# Keep only the part of each ID before its first '.', preserving the
# list-of-lists layout of test.tList.
test.tList = [
    [geneId.partition('.')[0] for geneId in geneIds]
    for geneIds in test.tList
]

In [None]:
# Convert the enrichment lists to gene symbols (per the notebook text).
# NOTE(review): the GSEA export section calls test.ensembl2symbol() instead -
# confirm that both method names exist in DE.
test.ensemble2gene()

Fetches the most significant dbs for each cluster and saves them into a list called dbs

In [None]:
# The three 2018 Gene Ontology libraries passed to enrichR as `dbs`.
goDbs = ['GO_Biological_Process_2018', 'GO_Cellular_Component_2018', 'GO_Molecular_Function_2018']
test.enrichR(dbs = goDbs)

In [None]:
# Shorthand name for the GO enrichment table. This is a plain assignment with
# no explicit copy, so edits made through eDf presumably reach
# test.goEnrichDf as well.
eDf = test.goEnrichDf

In [None]:
# Replace five of the up/down cluster labels with integer codes; every other
# value is kept as it is.
# NOTE(review): the cell that builds uppEDf / downEDf matches on the original
# string labels - confirm the intended run order of these two cells.
clusterCodes = {'1.0 up': 1, '1 down': 2, '2.0 up': 3, '2 down': 4, '3.0 up': 5}
eDf['cluster'] = [clusterCodes.get(label, label) for label in eDf['cluster'].tolist()]

In [None]:
import pandas as pd

# Cluster labels as they are matched against eDf['cluster'] for clusters 1-6.
ups = [f'{i}.0 up' for i in range(1, 7)]
downs = [f'{i} down' for i in range(1, 7)]


def _topRowsPerCluster(frame, labels, n=5):
    """Return the first n rows of frame for each cluster label.

    frame  -- DataFrame with a 'cluster' column
    labels -- cluster labels to pick rows for
    n      -- number of leading rows kept per label

    The pieces are stacked with the last label first, and all pieces are
    joined in a single pd.concat call instead of growing a frame inside a
    loop from an empty seed DataFrame.
    """
    pieces = [
        frame[frame['cluster'] == label].iloc[:n, :]
        for label in reversed(labels)
    ]
    return pd.concat(pieces, axis=0)


# Top five rows per up-regulated and per down-regulated cluster label.
uppEDf = _topRowsPerCluster(eDf[eDf['cluster'].isin(ups)], ups)
downEDf = _topRowsPerCluster(eDf[eDf['cluster'].isin(downs)], downs)


In [None]:
import seaborn as sns

c = sns.dark_palette("red")

# Wide table: one row per cluster, one column per value of column 1, cells
# taken from column 2; missing combinations are filled with 0.
termsByCluster = eDf.pivot(index=1, columns='cluster', values=2).fillna(0).T
g = sns.clustermap(termsByCluster, vmax=5, col_cluster=False, cmap=c)

# g.ax_heatmap.set_xticks([1,2,3,4,5])

In [None]:
# Plot the enrichment results with u_d='down'.
# NOTE(review): presumably 'up' selects the up-regulated set - confirm in DE.
test.plotEnrichmentResults(u_d='down')

Survival analysis

## Visualizing genes

Takes a single list of genes as argument

In [None]:
# Plot the listed genes; the argument is one flat list of gene symbols.
test.plotSingleGene(['KRAS', 'EGFR'])

Takes a list of lists of genes as argument

In [None]:
# NOTE(review): the notebook text says this method takes a list of lists of
# genes, but a flat list is passed here - confirm whether
# [['KRAS', 'EGFR']] was intended.
test.plotMultipleGenes(['KRAS', 'EGFR'])

### TCGA data analysis

To run the following module, signature profiles need to be downloaded from http://tardis.cgu.edu.tw/msignaturedb/ into the clinicalData folder. If they are not available there, create your own using https://cancer.sanger.ac.uk/signatures/

In [None]:
import pandas as pd

# The 'Signature.4' row of the signature table.
sDf = test.sigDf.loc['Signature.4']

# nDf is another name for test.gexDf (no copy), so the relabelling below is
# visible through both names. Each column label is cut down to its first
# three '-'-separated fields.
nDf = test.gexDf
nDf.columns = ['-'.join(label.split('-', 3)[:3]) for label in nDf.columns]

# Put the signature values and the 'Clusters' row next to each other.
concDf = pd.concat([sDf, test.gexDf.loc['Clusters']], axis='columns')

In [None]:
# Swap cluster labels 3 <-> 5 and 2 <-> 4; any other value is left untouched.
clusterSwap = {5: 3.0, 3: 5.0, 4: 2.0, 2: 4.0}
concDf['Clusters'] = [clusterSwap.get(x, x) for x in concDf['Clusters']]

In [None]:
import seaborn as sns

# Box plot of the 'Signature.4' values grouped by the 'Clusters' column of concDf.
sns.boxplot(data = concDf, x = 'Clusters', y = 'Signature.4')

In [None]:
# Heat map of the whole signature table with test.col_colors as column
# colours; clustering is switched off on both axes.
sns.clustermap(test.sigDf, col_colors = test.col_colors, col_cluster = False, row_cluster = False)

In [None]:
# Plot the signatures; called without arguments.
test.plotSignatures()

To run the following module, data from https://gdc.cancer.gov/about-data/publications/panimmune needs to be imported into the clinicalData folder.

In [None]:
# Box plot of the external data.
# NOTE(review): the meaning of the argument 3 is not visible in this notebook - confirm in DE.
test.boxplotExternalData(3)

# Export to GSEA

Starts writing the .cls file

In [None]:
# Header of the .cls file: "<samples> <classes> 1", followed by a '#' line
# that names the classes 1..N.
sampleCount = len(test.gexDf.columns.tolist())
classCount = len(test.sortedClusterList)
classNames = ' '.join(str(i) for i in range(1, classCount + 1))
output = f'{sampleCount} {classCount} 1' + f'\n# {classNames}'

# One integer class label per sample, stored as text.
lista = [str(int(label)) for label in test.transposedGexDf['Clusters']]

Run ONLY if you want to combine clusters together

In [None]:
# Merge every cluster except '4' into class '1'; cluster '4' becomes class '2'.
lista = ['1' if x != '4' else '2' for x in lista]

# The header built in the previous cell still declares the original number of
# classes, so rebuild it for the merged labels. Class names are listed in
# order of first appearance in the label line.
mergedClasses = list(dict.fromkeys(lista))
output = f'{len(lista)} {len(mergedClasses)} 1\n# {" ".join(mergedClasses)}'

Writes the .cls file

In [None]:
# Append the label line to the header and write the finished .cls file.
output = f'{output}\n{" ".join(lista)}'
with open('data/gsea/data.cls', 'w') as clsFile:
    clsFile.write(output)

Fetches the symbol names to be used in gsea

In [None]:
# Fetch the symbol names used for GSEA (per the notebook text).
# NOTE(review): presumably this fills test.symbolDf, which the next cell reads - confirm.
test.ensembl2symbol()

Creates expression file for gsea

In [None]:
import numpy as np

# df = test.symbolDf.filter(items = allGenes, axis = 'index')
# df is another name for test.symbolDf (no copy), so both columns are added to
# that frame in place.
df = test.symbolDf
rowCount = len(df.index)

# Leading columns of the GSEA table: NAME from the index, DESCRIPTION all NaN.
df.insert(0, 'NAME', value=df.index.tolist())
df.insert(1, 'DESCRIPTION', [np.nan] * rowCount)

In [None]:
import itertools

# Flatten the nested lists in test.eList into one flat list.
allGenes = list(itertools.chain(*test.eList))

In [None]:
# Keep only the rows whose NAME occurs in allGenes.
inGeneList = df['NAME'].isin(allGenes)
df = df[inGeneList]

In [None]:
# Write the GSEA expression table as tab-separated text without the index.
# The path is relative to the notebook's working directory.
df.to_csv('data/gsea/genes.txt', sep = '\t', index = False)