This is the initial attempt to create a data table for the project.
Specifically, the following four phosphoproteome datasets are chosen:
1. afatinib
2. dasatinib
3. gefitinib
4. imatinib
These drugs are all kinase inhibitors, and all of them were tested on the A431 cell line.

The logic is that if the drugs are tested on the same cell line, the gene information should match across datasets.
Thus, the ddPTM datasets (aggregated by protein) were downloaded.

Important things to note:
1. Even within the same file, a gene name can appear more than once, because up-regulation and down-regulation can both be reported for the same gene.
2. Not all genes are present in all four files.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from os import path
from pathlib import Path  

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Locate the four aggregated ddPTM exports (one CSV per drug).
#
# The earlier calls were path.join('..', 'data', 'raw_data', '/home/...csv').
# os.path.join restarts from any absolute component, so the three leading
# parts were silently thrown away and the absolute path was what got loaded.
# The directory is now stated once, and the resulting paths are unchanged.
#
# NOTE(review): this is a machine-specific absolute path - edit RAW_DATA_DIR
# (ideally to a path relative to the repository) to run the notebook elsewhere.
RAW_DATA_DIR = '/home/amy/Documents/GitHub/SP23-BENG213/Project/Data Organization Attempt'

afatinib_file = path.join(RAW_DATA_DIR, 'Afatinib_ddPTM_aggregated.csv')
dasatinib_file = path.join(RAW_DATA_DIR, 'Dasatinib_ddPTM_aggregated.csv')
gefitinib_file = path.join(RAW_DATA_DIR, 'Gefitinib_ddPTM_aggregated.csv')
imatinib_file = path.join(RAW_DATA_DIR, 'Imatinib_ddPTM_aggregated.csv')

In [3]:
# Read each drug's aggregated ddPTM CSV into its own DataFrame
# (files are read in the order afatinib, dasatinib, gefitinib, imatinib).
afatinib_raw, dasatinib_raw, gefitinib_raw, imatinib_raw = (
    pd.read_csv(csv_path)
    for csv_path in (afatinib_file, dasatinib_file, gefitinib_file, imatinib_file)
)

In [4]:
# Discard rows whose 'Gene Name' is missing: these are PTMs that could not be
# mapped to a specific gene, so they cannot be placed in a per-gene table.
afatinib_raw, dasatinib_raw, gefitinib_raw, imatinib_raw = (
    drug_table[drug_table['Gene Name'].notna()]
    for drug_table in (afatinib_raw, dasatinib_raw, gefitinib_raw, imatinib_raw)
)

Regulation can be up or down, and both can be reported for the same gene.
Thus, it is important to split each dataset into an up-regulation part and a down-regulation part.
The processing pipeline, however, stays the same for both parts.

In [5]:
# Keep only the rows flagged as up-regulated, one filtered frame per drug.
afatinib_raw_up, dasatinib_raw_up, gefitinib_raw_up, imatinib_raw_up = (
    drug_table[drug_table['Regulation'] == "up"]
    for drug_table in (afatinib_raw, dasatinib_raw, gefitinib_raw, imatinib_raw)
)

In [6]:
# Union of every gene name that appears in at least one of the four "up" tables.
genes = set().union(
    afatinib_raw_up["Gene Name"],
    dasatinib_raw_up["Gene Name"],
    gefitinib_raw_up["Gene Name"],
    imatinib_raw_up["Gene Name"],
)
# Sort the names: iterating a set of strings yields a different order in every
# kernel session (string hashing is randomised), which made the row order of
# the exported matrix change from run to run.
genes_list = sorted(genes)

In [7]:
# Number of distinct gene names collected from the four "up" tables.
len(genes_list)

647

Making the list of average fold change

In [8]:
# Average fold change per gene for afatinib ("up" rows), aligned to genes_list.
# When a gene has several rows, only the first one is used (same rule as the
# earlier row-by-row loop); genes absent from this table get the placeholder 1.0.
# A single lookup table replaces one full-frame scan per gene.
afatinib_up_first_FC = (
    afatinib_raw_up
    .drop_duplicates(subset='Gene Name')  # keep='first' is the default
    .set_index('Gene Name')['Average Fold Change']
)
afatinib_up_ave_FC = afatinib_up_first_FC.reindex(genes_list, fill_value=1.0).tolist()

In [9]:
# Average fold change per gene for dasatinib ("up" rows), aligned to genes_list.
# When a gene has several rows, only the first one is used (same rule as the
# earlier row-by-row loop); genes absent from this table get the placeholder 1.0.
# A single lookup table replaces one full-frame scan per gene.
dasatinib_up_first_FC = (
    dasatinib_raw_up
    .drop_duplicates(subset='Gene Name')  # keep='first' is the default
    .set_index('Gene Name')['Average Fold Change']
)
dasatinib_up_ave_FC = dasatinib_up_first_FC.reindex(genes_list, fill_value=1.0).tolist()

In [10]:
# Average fold change per gene for gefitinib ("up" rows), aligned to genes_list.
# When a gene has several rows, only the first one is used (same rule as the
# earlier row-by-row loop); genes absent from this table get the placeholder 1.0.
# A single lookup table replaces one full-frame scan per gene.
gefitinib_up_first_FC = (
    gefitinib_raw_up
    .drop_duplicates(subset='Gene Name')  # keep='first' is the default
    .set_index('Gene Name')['Average Fold Change']
)
gefitinib_up_ave_FC = gefitinib_up_first_FC.reindex(genes_list, fill_value=1.0).tolist()

In [11]:
# Average fold change per gene for imatinib ("up" rows), aligned to genes_list.
# When a gene has several rows, only the first one is used (same rule as the
# earlier row-by-row loop); genes absent from this table get the placeholder 1.0.
# A single lookup table replaces one full-frame scan per gene.
imatinib_up_first_FC = (
    imatinib_raw_up
    .drop_duplicates(subset='Gene Name')  # keep='first' is the default
    .set_index('Gene Name')['Average Fold Change']
)
imatinib_up_ave_FC = imatinib_up_first_FC.reindex(genes_list, fill_value=1.0).tolist()

Make the data matrix

In [12]:
# Gene x drug table of "up" fold changes: one row per entry of genes_list,
# one column per drug.
up_column_names = ['afatinib_up', 'dasatinib_up', 'gefitinib_up', 'imatinib_up']
up_column_values = [
    afatinib_up_ave_FC,
    dasatinib_up_ave_FC,
    gefitinib_up_ave_FC,
    imatinib_up_ave_FC,
]
data_up = dict(zip(up_column_names, up_column_values))
up_matrix = pd.DataFrame(data_up, index=genes_list)

In [13]:
# Share of cells in the "up" table whose value is exactly 1.0, i.e. the value
# that was inserted for genes missing from a drug's table.
count = sum(
    fold_changes.count(1.0)
    for fold_changes in (
        afatinib_up_ave_FC,
        dasatinib_up_ave_FC,
        gefitinib_up_ave_FC,
        imatinib_up_ave_FC,
    )
)
count = count / (4 * len(genes_list))  # four drug columns per gene
percent_filled = round(count * 100, 2)
print(str(percent_filled) + "% of the data is filled with 1")

64.49% of the data is filled with 1


In [14]:
# Write the "up" matrix to CSV, creating the destination folder when needed.
# NOTE(review): the destination is a machine-specific absolute path.
filepath = Path('/home/amy/Documents/GitHub/SP23-BENG213/Project/Data Organization Attempt/0_up_matrix.csv')
output_dir = filepath.parent
output_dir.mkdir(exist_ok=True, parents=True)
up_matrix.to_csv(filepath)

Now making the down matrix

In [15]:
# Keep only the rows flagged as down-regulated, one filtered frame per drug.
afatinib_raw_down, dasatinib_raw_down, gefitinib_raw_down, imatinib_raw_down = (
    drug_table[drug_table['Regulation'] == "down"]
    for drug_table in (afatinib_raw, dasatinib_raw, gefitinib_raw, imatinib_raw)
)

In [16]:
# Union of every gene name that appears in at least one of the four "down" tables.
# This rebinds genes / genes_list, which previously held the "up" gene names.
genes = set().union(
    afatinib_raw_down["Gene Name"],
    dasatinib_raw_down["Gene Name"],
    gefitinib_raw_down["Gene Name"],
    imatinib_raw_down["Gene Name"],
)
# Sort the names: iterating a set of strings yields a different order in every
# kernel session (string hashing is randomised), which made the row order of
# the exported matrix change from run to run.
genes_list = sorted(genes)

In [17]:
# Number of distinct gene names collected from the four "down" tables.
len(genes_list)

1235

Making the list of average fold change

In [18]:
# Average fold change per gene for afatinib ("down" rows), aligned to genes_list.
# When a gene has several rows, only the first one is used (same rule as the
# earlier row-by-row loop); genes absent from this table get the placeholder 1.0.
# A single lookup table replaces one full-frame scan per gene.
afatinib_down_first_FC = (
    afatinib_raw_down
    .drop_duplicates(subset='Gene Name')  # keep='first' is the default
    .set_index('Gene Name')['Average Fold Change']
)
afatinib_down_ave_FC = afatinib_down_first_FC.reindex(genes_list, fill_value=1.0).tolist()

In [19]:
# Average fold change per gene for dasatinib ("down" rows), aligned to genes_list.
# When a gene has several rows, only the first one is used (same rule as the
# earlier row-by-row loop); genes absent from this table get the placeholder 1.0.
# A single lookup table replaces one full-frame scan per gene.
dasatinib_down_first_FC = (
    dasatinib_raw_down
    .drop_duplicates(subset='Gene Name')  # keep='first' is the default
    .set_index('Gene Name')['Average Fold Change']
)
dasatinib_down_ave_FC = dasatinib_down_first_FC.reindex(genes_list, fill_value=1.0).tolist()

In [20]:
# Average fold change per gene for gefitinib ("down" rows), aligned to genes_list.
# When a gene has several rows, only the first one is used (same rule as the
# earlier row-by-row loop); genes absent from this table get the placeholder 1.0.
# A single lookup table replaces one full-frame scan per gene.
gefitinib_down_first_FC = (
    gefitinib_raw_down
    .drop_duplicates(subset='Gene Name')  # keep='first' is the default
    .set_index('Gene Name')['Average Fold Change']
)
gefitinib_down_ave_FC = gefitinib_down_first_FC.reindex(genes_list, fill_value=1.0).tolist()

In [21]:
# Average fold change per gene for imatinib ("down" rows), aligned to genes_list.
# When a gene has several rows, only the first one is used (same rule as the
# earlier row-by-row loop); genes absent from this table get the placeholder 1.0.
# A single lookup table replaces one full-frame scan per gene.
imatinib_down_first_FC = (
    imatinib_raw_down
    .drop_duplicates(subset='Gene Name')  # keep='first' is the default
    .set_index('Gene Name')['Average Fold Change']
)
imatinib_down_ave_FC = imatinib_down_first_FC.reindex(genes_list, fill_value=1.0).tolist()

Make the data matrix

In [23]:
# Gene x drug table of "down" fold changes: one row per entry of genes_list,
# one column per drug.
# The columns used to be labelled '<drug>_up' (copied from the "up" matrix cell)
# even though they hold down-regulation values; they are now '<drug>_down'.
# NOTE(review): anything that reads 0_down_matrix.csv by the old '_up' headers
# must be updated accordingly.
data_down = {'afatinib_down': afatinib_down_ave_FC,
        'dasatinib_down': dasatinib_down_ave_FC,
        'gefitinib_down': gefitinib_down_ave_FC,
        'imatinib_down': imatinib_down_ave_FC,
       }
down_matrix = pd.DataFrame(data_down, index=genes_list)

In [24]:
# Share of cells in the "down" table whose value is exactly 1.0, i.e. the value
# that was inserted for genes missing from a drug's table.
count = sum(
    fold_changes.count(1.0)
    for fold_changes in (
        afatinib_down_ave_FC,
        dasatinib_down_ave_FC,
        gefitinib_down_ave_FC,
        imatinib_down_ave_FC,
    )
)
count = count / (4 * len(genes_list))  # four drug columns per gene
percent_filled = round(count * 100, 2)
print(str(percent_filled) + "% of the data is filled with 1")

55.59% of the data is filled with 1


In [25]:
# Write the "down" matrix to CSV, creating the destination folder when needed.
# NOTE(review): the destination is a machine-specific absolute path.
filepath = Path('/home/amy/Documents/GitHub/SP23-BENG213/Project/Data Organization Attempt/0_down_matrix.csv')
output_dir = filepath.parent
output_dir.mkdir(exist_ok=True, parents=True)
down_matrix.to_csv(filepath)