In [1]:
import os
import re
import numpy as np
import pandas as pd
from scipy.stats import gmean
from sklearn.decomposition import PCA
#Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

In [2]:
sample_info = pd.read_csv("./data_external/melanoma_data/sample.csv")
sample_info.columns = ['File_Name', 'Case_ID','Sample_ID','Sample_Type']

sample = sample_info.loc[~(
    (sample_info.Sample_Type == 'Solid Tissue Normal') |
    (sample_info.Sample_Type == 'Additional Metastatic')
    )]

print(sample.Sample_Type.value_counts())

Metastatic       367
Primary Tumor    103
Name: Sample_Type, dtype: int64


In [3]:
df_genes = pd.read_csv("./data_generated/ensemble.txt",sep='\t',header=None).iloc[:,0]
df_genes.shape

(60483,)

In [4]:
# create empty list to store expression data from files
data_list = []
# filling the Array files data in datos21 folder
for file in sample.File_Name:
    data_read = pd.read_csv(f'./data_external/melanoma_data/datos21/{file}', sep='\t', 
                                  header=None, usecols=[1])
    data_list.append(data_read.values)

In [5]:
# transform in a numpy array
data_stored = np.concatenate(data_list, axis=1).T
# sum 0.1 to avoid 0 divisions
data_stored += 0.1
print(data_stored.shape)

(470, 60483)


In [6]:
# primary tumor samples
primary = data_stored[sample.Sample_Type == 'Primary Tumor']
# the geometric mean of primary tumor samples taken as reference
ref = gmean(primary)

data = data_stored/ref
data = np.log2(data)

df = pd.DataFrame(data=data,columns=df_genes)
df.head()

Unnamed: 0,ENSG00000242268,ENSG00000270112,ENSG00000167578,ENSG00000273842,ENSG00000078237,ENSG00000146083,ENSG00000225275,ENSG00000158486,ENSG00000198242,ENSG00000259883,...,ENSG00000238244,ENSG00000186115,ENSG00000216352,ENSG00000267117,ENSG00000273233,ENSG00000105063,ENSG00000231119,ENSG00000280861,ENSG00000123685,ENSG00000181518
0,-0.667594,0.024722,-0.13841,-0.037171,0.120447,0.525614,-0.019284,-0.255458,0.711044,-0.513289,...,-4.80514e-15,0.569558,-0.015357,0.719398,0.337172,0.035513,0.041954,-4.80514e-15,0.673189,-0.010566
1,-0.667594,-0.067033,0.151962,-0.037171,-0.542366,-0.54684,-0.019284,-0.085034,0.44245,-0.513289,...,-4.80514e-15,-0.479669,-0.015357,-0.996845,-0.501789,-0.229711,0.262515,-4.80514e-15,-1.867977,-0.010566
2,-0.667594,-0.067033,-0.883102,-0.037171,0.833558,-1.604637,-0.019284,-0.372856,-0.929227,0.019839,...,-4.80514e-15,-0.479669,0.312919,-0.996845,-0.011669,0.563455,0.378923,-4.80514e-15,-0.811724,-0.010566
3,-0.667594,-0.025698,-1.964326,-0.037171,-0.030969,0.818633,-0.019284,0.085646,0.424491,-0.258813,...,-4.80514e-15,-0.412945,-0.015357,-0.253111,0.262264,-0.166382,-0.749621,-4.80514e-15,-0.224312,-0.010566
4,-0.667594,-0.067033,-0.292996,-0.037171,-0.00114,0.814253,-0.019284,1.121704,0.149756,0.29473,...,-4.80514e-15,-0.393684,-0.015357,0.46268,0.989708,-0.219318,0.91056,-4.80514e-15,0.468016,-0.010566


In [7]:
# Read the Top 28 pathways excel list
pathways = pd.read_excel("./data_external/data_pathways/pathways_All_ordered.xlsx",header=None)
# Name the columns
pathways.columns = ["Ensemble","Pathways"]

# Count pathways
pathways_names = pd.unique(pathways.Pathways)
# Count genes in pathways
genes_annotated = pd.unique(pathways.Ensemble)

# Print ranges
print(" Top Pathways:",
      pathways_names.shape[0],"\n",
      "Annotated Genes:",
      genes_annotated.shape[0]
      )

 Top Pathways: 2234 
 Annotated Genes: 10785


In [8]:
# Genes in pathways
genes_path = set(df_genes) & set(genes_annotated)

# Pathways with annotated genes
pathways_annotated = pathways.query('Ensemble in @genes_path')

In [9]:
X = []
for pathway in pathways_names:
    # genes evaluated in pathway (variable to operate)
    g_pathways = df[
        pathways_annotated[pathways_annotated.Pathways == pathway].Ensemble
    ]
    # Ecuation above
    X.append(np.sum(abs(g_pathways),axis=1)/g_pathways.shape[1])

X = pd.DataFrame(data=X, columns=pathways_names)
X.shape

(2234, 2234)

In [10]:
normal_file = sample_info[sample_info.Sample_Type == 'Solid Tissue Normal'].File_Name.values[0]
normal_file

'7b98f1f0-ec44-4414-8e1c-0d29a386e45c.FPKM.txt'

In [11]:
normal_sample = pd.read_csv(f'./data_external/melanoma_data/datos21/{normal_file}', sep='\t', 
                                  header=None, usecols=[1]) +0.1

In [12]:
normal_mean = gmean(normal_sample[1])

In [13]:
import pandas as pd
import numpy as np
from pathway_modules import PathwayAnalyzer

In [31]:
class PathwayAnalyzer:
    def __init__(self, path, ensemble_ids):
        self.pathways = pd.read_excel(path, header=None)
        self.pathways.columns = ["Ensemble", "Pathways"]
        self.pathway_names = pd.unique(self.pathways.Pathways)
        self.ensemble_ids = ensemble_ids

    def pathways_query(self):
        genes_annotated = pd.unique(self.pathways.Ensemble)
        genes_path = set(self.ensemble_ids) & set(genes_annotated)
        return self.pathways.query('Ensemble in @genes_path')
    
    def calculate_matrix(self, data_frame, col=False, col_name=None):
        X = []
        for p in self.pathway_names:
            g_pathways = self.pathways_query(self.pathways[self.pathways.Pathways == p].Ensemble)
            # Modify the following calculation as needed
            matrix_values = np.sum(abs(data_frame[g_pathways]), axis=1) / g_pathways.shape[1]
            X.append(matrix_values)
        
        if col:
            return pd.DataFrame(data=X, columns=col_name, index=self.pathway_names)
        else:
            return pd.DataFrame(data=X, index=self.pathway_names)

# Usage
#pathway_analyzer = PathwayAnalyzer('./data_external/data_pathways/pathways2_sorted.xls')
#ensemble_ids = df_genes.values
#data_frame = df  # Provide your data frame here
#result_matrix = pathway_analyzer.calculate_matrix(df)
#print(result_matrix)

In [32]:
pa = PathwayAnalyzer('./data_external/data_pathways/pathways2_sorted.xls',ensemble_ids=df_genes)

In [33]:
pa.calculate_matrix(df)

TypeError: pathways_query() takes 1 positional argument but 2 were given