In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from os import path
from pathlib import Path  

#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [2]:
# Input files. All four live in the same project folder, so the folder is
# named once here; point DATA_DIR at your own checkout to run this elsewhere.
DATA_DIR = '/Users/louxuwen/Desktop/Documents/GitHub/SP23-BENG213/Project/Map Gene to KEGG Pathway'
data_file = path.join(DATA_DIR, 'down_data_matrix.csv')  # Data matrix
gene_file = path.join(DATA_DIR, 'Homo_sapiens.tsv')  # Gene Symbol and ID
pathway_file = path.join(DATA_DIR, 'Pathway_Down.csv')  # all the pathways used in the heatmap
gene_to_pathway_file = path.join(DATA_DIR, 'Gene_to_Pathway.csv')  # every pathway and the relevant genes

# Load the tables.
data = pd.read_csv(data_file, index_col=0)      # first CSV column becomes the index
gene_info = pd.read_csv(gene_file, sep="\t")    # tab-separated; read below via 'Symbol' / 'GeneID'
# Missing cells become the string "0", which the later cells treat as
# "no pathway in this slot".
pathway = pd.read_csv(pathway_file, index_col=0).fillna("0")
gene_to_pathway = pd.read_csv(gene_to_pathway_file)  # read below via 'Gene' / 'Pathway'

In [3]:
# Collect the pathway identifiers behind every heatmap function.
# Each row of `pathway` is one function; its first four columns are scanned
# for pathway identifiers, with "0" marking an empty slot.
n_pathway_columns = 4
pathways = pathway.index.tolist()
pathway_name = []    # function name, repeated once per pathway found in its row
pathway_number = []  # the pathway identifiers themselves
for i, function_name in enumerate(pathways):
    row = pathway.iloc[i]
    for j in range(n_pathway_columns):
        # .iloc keeps the lookup positional; plain row[j] on a label-indexed
        # Series is deprecated in pandas and will be read as a label instead.
        pathway_id = row.iloc[j]
        if pathway_id != "0":
            pathway_name.append(function_name)
            pathway_number.append(pathway_id)
# Collapse to the unique identifiers; the notebook uses this for membership tests.
pathway_number = set(pathway_number)

In [4]:
# get gene IDs
# note that ~7 percent (~252/3418) of the genes have no matching Symbol -> GeneID entry

# Symbol -> GeneID lookup, built once instead of rescanning gene_info for every
# gene. setdefault keeps the first GeneID listed for a symbol, which is the
# entry a row-by-row search would return.
symbol_to_id = {}
for symbol, gene_id in zip(gene_info['Symbol'].tolist(), gene_info['GeneID'].tolist()):
    symbol_to_id.setdefault(symbol, gene_id)

Gene_IDs = []
# The gene name is the first column of the data matrix (taken by position).
for Gene_name in data.iloc[:, 0].tolist():

    # get gene symbol: for a comma-separated entry use the first item, or the
    # second one (leading whitespace removed) when the first is empty
    if "," in Gene_name:
        parts = Gene_name.split(",")
        if parts[0] != "":
            Gene_name = parts[0]
        else:
            Gene_name = parts[1].lstrip()

    # gene symbol to ID; the string "None" marks a gene without a match so the
    # row can be filtered out once the IDs are attached to the table
    Gene_IDs.append(symbol_to_id.get(Gene_name, "None"))

In [5]:
# add the ID column and drop the genes that don't have an ID
data["ID"] = Gene_IDs
# .copy() makes the filtered table an independent frame, so adding columns to
# it afterwards does not raise pandas' SettingWithCopyWarning.
data = data[data['ID'] != "None"].copy()

In [6]:
# get the pathways
# only 1402/3166 genes belong to a functional pathway

# Gene ID -> its pathways in file order, built in one pass instead of filtering
# gene_to_pathway once per gene.
pathways_of_gene = {}
for gene_id, pathway_id in zip(gene_to_pathway['Gene'].tolist(), gene_to_pathway['Pathway'].tolist()):
    pathways_of_gene.setdefault(gene_id, []).append(pathway_id)

pathways_by_gene = []         # one list per gene that has pathways (genes without any are skipped)
pathways_by_gene_column = []  # one entry per row of `data`: joined pathways, or "none"
# Read the gene ID by column name rather than by the magic position 28.
# NOTE(review): assumes "ID" is the column that sat at position 28 - confirm.
for gene_id in data["ID"].tolist():
    # get corresponding pathways (copied so rows sharing a gene ID do not share a list)
    temp_pathways = list(pathways_of_gene.get(gene_id, []))

    if len(temp_pathways) == 0:
        pathways_by_gene_column.append("none")
    else:
        pathways_by_gene.append(temp_pathways)
        pathways_by_gene_column.append(" ".join(temp_pathways))

In [7]:
# Attach the joined pathway list to every gene, then keep only the genes that
# belong to at least one pathway.
data["All Pathways"] = pathways_by_gene_column
# .copy() makes the filtered table an independent frame, so adding columns to
# it afterwards does not raise pandas' SettingWithCopyWarning.
data = data[data['All Pathways'] != "none"].copy()

## Output the data file, keeping every gene that is involved in at least one pathway ##

In [8]:
# Save the pathway-annotated table as CSV, creating the target folder first if
# it does not exist yet.
filepath = Path('/Users/louxuwen/Desktop/Documents/GitHub/SP23-BENG213/Project/Map Gene to KEGG Pathway/Data_All_Pathways.csv')
filepath.parent.mkdir(exist_ok=True, parents=True)
data.to_csv(path_or_buf=filepath)

## Continue to shrink the data: keep a gene only if it is involved in a pathway actually used in the heatmap ##

In [55]:
# For every remaining gene, keep only those of its pathways that the heatmap uses.
pathways_involved = []         # one list per gene that has at least one heatmap pathway
pathways_involved_column = []  # one entry per row of `data`: joined pathways, or "none"
for row_number in range(len(data.index)):
    # pathways_by_gene is indexed by row position in `data`
    heatmap_hits = [
        candidate
        for candidate in pathways_by_gene[row_number]
        if candidate in pathway_number
    ]

    if heatmap_hits:
        pathways_involved.append(heatmap_hits)
        pathways_involved_column.append(" ".join(heatmap_hits))
    else:
        pathways_involved_column.append("none")

In [56]:
# Attach the heatmap pathways to every gene and drop the genes that have none.
data["Involved Pathways"] = pathways_involved_column
has_heatmap_pathway = data["Involved Pathways"].ne("none")
data = data[has_heatmap_pathway]

In [57]:
# Labels for the result matrix: columns 1..27 of `data` are used as the drug
# names, and the index of `pathway` as the function names.
drugs = list(data.columns[1:28])
functions = list(pathway.index)

In [58]:
# create the final matrix: one row per function, one column per drug, each cell
# the sum of the drug values of every gene assigned to that function

# Pathway identifier -> function name. Columns are scanned in priority order
# (pathway_1 first) and rows top to bottom; setdefault keeps the first hit, so
# an identifier listed under several functions resolves to the earliest one.
pathway_columns = ["pathway_1", "pathway_2", "pathway_3", "pathway_4"]
function_of_pathway = {}
for column in pathway_columns:
    for function_name, pathway_id in zip(pathway.index.tolist(), pathway[column].tolist()):
        function_of_pathway.setdefault(pathway_id, function_name)

# One independent row per function. `[[0.0] * n] * m` would repeat a single
# inner list m times, so an in-place update of one row would show up in all.
final_matrix = [[0.0] * len(drugs) for _ in functions]
counter = 0  # number of (gene, function) contributions added
for i in range(len(data.index)):
    temp_pathways = pathways_involved[i]         # indexed by row position in `data`
    temp_values = data.iloc[i].tolist()[1:28]    # the gene's values in the drug columns

    # Resolve all of the gene's pathways to functions before adding anything.
    temp_functions = [
        function_of_pathway[j] for j in temp_pathways if j in function_of_pathway
    ]

    # NOTE(review): a gene whose pathways resolve to the same function more
    # than once is added once per pathway - confirm that this is intended.
    for k in temp_functions:
        counter = counter + 1
        row = final_matrix[functions.index(k)]
        for n, value in enumerate(temp_values):
            row[n] = row[n] + value


In [59]:
# Label the accumulated matrix (rows: functions, columns: drugs) and display it.
final_data_functions_by_drugs = pd.DataFrame(
    data=final_matrix,
    index=functions,
    columns=drugs,
)
final_data_functions_by_drugs

Unnamed: 0,Trastuzumab,Tideglusib,Lapatinib,Mirdametinib,MK-2206,Nintedanib,Pertuzumab,Pictilisib,Refametinib,Rituximab,...,AZD-8055,Bortezomib,Carfilzomib,CUDC-101,Curcumin,Cytarabine,Dactolisib,Dasatinib,Gefitinib,Imatinib
Apelin,0.0,0.0,6.572377,0.0,0.0,0.0,0.0,0.0,8.434019,-23.683169,...,0.0,67.571468,78.799767,0.0,9.453918,0.0,0.0,28.146337,6.731208,0.0
Insulin,-9.216824,0.0,39.282873,17.632998,67.14163,5.538034,-9.541298,24.086317,24.395173,-57.997108,...,8.297748,176.673142,188.825612,0.0,40.541472,0.0,19.884511,126.600083,39.582835,13.258105
PI3K-AKT,-4.630123,0.0,47.539405,9.004597,32.525836,0.0,-14.223007,6.509195,16.297334,-33.126814,...,8.286896,131.901355,108.256232,13.389727,22.033432,0.0,14.621413,99.077675,32.909213,6.503637
mTOR,0.0,0.0,49.070156,0.0,24.449587,0.0,-13.840524,7.003734,8.434019,-29.068604,...,0.0,133.40127,140.77052,10.201032,18.499744,0.0,0.0,61.257559,28.839408,6.503637
HIF-1,0.0,0.0,20.172436,0.0,0.0,0.0,-4.909324,6.509195,8.434019,-21.750208,...,8.286896,75.056658,84.948408,0.0,9.025755,0.0,8.19792,39.185191,11.77296,6.616509
AMPK,-9.216824,0.0,33.627426,8.628401,39.638972,10.84722,-13.712845,29.85027,8.097839,-21.528648,...,16.110508,118.474846,99.610256,0.0,32.41349,0.0,15.55854,66.820248,35.614062,0.0
Hippo,0.0,0.0,26.354122,0.0,14.844044,5.550695,-9.168958,13.890178,0.0,-11.267384,...,17.054924,70.568565,40.227649,12.485414,21.266236,0.0,23.761408,25.55651,18.607903,7.831504
Rig-I like receptor,0.0,0.0,7.163843,0.0,8.712568,0.0,0.0,0.0,0.0,-2.889771,...,0.0,25.962151,16.8812,0.0,5.196384,0.0,0.0,7.343759,0.0,0.0
NF-kappa B,0.0,0.0,4.756414,0.0,12.378042,9.355389,0.0,6.945888,0.0,-2.889771,...,0.0,24.649071,30.794876,0.0,5.196384,0.0,0.0,41.370316,6.847358,0.0
Calcium,0.0,0.0,19.408772,0.0,8.457309,0.0,0.0,6.509195,0.0,-9.605182,...,8.286896,24.206169,33.401999,3.971397,12.415924,0.0,8.19792,25.986557,5.2878,0.0


In [60]:
# Save the functions-by-drugs table as CSV, creating the target folder first
# if it does not exist yet.
filepath = Path('/Users/louxuwen/Desktop/Documents/GitHub/SP23-BENG213/Project/Map Gene to KEGG Pathway/Final_data_functions_by_drugs.csv')
filepath.parent.mkdir(exist_ok=True, parents=True)
final_data_functions_by_drugs.to_csv(path_or_buf=filepath)

## establish matrix as pathway by drugs ##