Data Type: microarray data with fluorescence intensity 

# Step 1: discard the mean values and use the median values for fluorescence intensity

In [33]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from os import path
from pathlib import Path  

**STEP 1: Take median data**
* only needs to be done once; skip this step on later runs
* keep only the median values, which are less affected by extreme values than the mean

In [None]:
# Location of the raw microarray spreadsheet (macOS checkout).
# Written as a plain absolute path: prefixing it with relative components via
# os.path.join has no effect, because join discards everything that precedes
# an absolute component.
rawdata_file = '/Users/louxuwen/Desktop/Documents/GitHub/modulome-C_Glutamicum_Microarray_clean/1_metadata_file/raw_data.xlsx'
# Linux checkout: use this path instead
# rawdata_file = '/home/amy/Documents/GitHub/modulome-C_Glutamicum_Microarray_clean/1_metadata_file/raw_data.xlsx'

In [None]:
# Load the raw spreadsheet with its first column as the row index, then
# replace every empty cell with 0.
raw_data = pd.read_excel(rawdata_file, index_col=0)
raw_data = raw_data.fillna(0)
# Columns are reported as genes and rows as experiments.
# NOTE(review): the column count also includes annotation columns such as
# 'Mean/Median', so the "genes" figure is an upper bound — confirm.
print('Number of genes:', len(raw_data.columns))
print('Number of experiments:', len(raw_data.index))

In [None]:
# Keep only the rows whose 'Mean/Median' column reads "Median".
median_data = raw_data[raw_data['Mean/Median'].eq("Median")]

In [None]:
# Report the size of the median-only table (columns as genes, rows as experiments).
print('Number of genes:', len(median_data.columns))
print('Number of experiments:', len(median_data.index))

In [None]:
# Write the median-only table to Excel, creating the target folder if needed.
# Path is already imported in the import cell at the top of the notebook.
filepath = Path('/Users/louxuwen/Desktop/Documents/GitHub/modulome-C_Glutamicum_Microarray_clean/1_metadata_file/median_data.xlsx')
filepath.parent.mkdir(parents=True, exist_ok=True)
median_data.to_excel(excel_writer=filepath)

**STEP 2: take the difference between foreground and background data**
* only done once, ignore for later steps
* thanks to Zack, Griff and Josh for figuring out dataframe w me :)

In [None]:
# Location of the median-only spreadsheet (Linux checkout). Written as a plain
# absolute path because os.path.join discards any components that precede an
# absolute one.
mediandata_file = '/home/amy/Documents/GitHub/modulome-C_Glutamicum_Microarray_clean/1_metadata_file/median_data.xlsx'
# Read the sheet with the default integer row index and replace empty cells with 0.
median_data = pd.read_excel(mediandata_file)
median_data = median_data.fillna(0)
print('Number of genes:', len(median_data.columns))
print('Number of experiments:', len(median_data.index))

In [None]:
# Split the table by the 'Fluorescence' label: "B635" rows go to b_data and
# "F635" rows to f_data. Each subset is renumbered from 0 so the two frames
# share the same row index.
b_data = median_data[median_data['Fluorescence'].eq("B635")].reset_index(drop=True)
f_data = median_data[median_data['Fluorescence'].eq("F635")].reset_index(drop=True)

In [None]:
# Start from a copy of the F635 rows so the first four columns are carried
# over unchanged.
net_data = f_data.copy()
# From the fifth column onwards, store F635 minus B635.
# NOTE(review): assumes f_data and b_data have the same number of rows in the
# same order, so that row k of each belongs to the same array — confirm.
# NOTE(review): assumes exactly the first four columns are annotations — confirm.
net_data.iloc[:, 4:] = f_data.iloc[:, 4:].sub(b_data.iloc[:, 4:])

In [None]:
# DataFrame.drop returns a new frame and leaves net_data untouched, so the
# result is assigned back; otherwise 'Mean/Median' and 'Fluorescence' would
# still be present when net_data is written to Excel in the next cell.
# errors='ignore' keeps the cell safe to re-run once the columns are gone.
net_data = net_data.drop(columns=['Mean/Median', 'Fluorescence'], errors='ignore')
# Show the resulting table as the cell output.
net_data

In [None]:
# Write the net-intensity table to Excel, creating the target folder if needed.
# Path is already imported in the import cell at the top of the notebook.
filepath = Path('/home/amy/Documents/GitHub/modulome-C_Glutamicum_Microarray_clean/1_metadata_file/net_data.xlsx')
filepath.parent.mkdir(parents=True, exist_ok=True)
net_data.to_excel(excel_writer=filepath)

**STEP 3: Dye Swap (interrupted)**

In [2]:
# Locations of the net-intensity table and the dye-label sheet (Linux checkout).
# Written as plain absolute paths because os.path.join discards any components
# that precede an absolute one.
net_data_file = '/home/amy/Documents/GitHub/modulome-C_Glutamicum_Microarray_clean/1_metadata_file/net_data.xlsx'
dye_file = '/home/amy/Documents/GitHub/modulome-C_Glutamicum_Microarray_clean/1_metadata_file/dye_label.xlsx'
# Both sheets are read with the default integer row index.
net_data = pd.read_excel(net_data_file)
dye_label = pd.read_excel(dye_file)

In [4]:
# Remove the 'Unnamed: 2' column (pandas gives header-less columns names of
# the form 'Unnamed: N'); presumably a stray column in the sheet — confirm.
dye_label = dye_label.drop('Unnamed: 2', axis='columns')

In [5]:
# One row per experiment in the dye-label sheet.
print('Number of experiments in the dye list:', len(dye_label))

Number of experiments in the dye list: 927


In [6]:
# Display the dye-label table ('Sample' and 'Label' columns) for a visual check.
dye_label

Unnamed: 0,Sample,Label
0,"WT (stationary phase, OD 30) vs. WT (exponenti...",Cy5/Cy3
1,"WT (stationary phase, OD 30) vs. WT (exponenti...",Cy5/Cy3
2,"WT (stationary phase, OD 30) vs. WT (exponenti...",Cy3/Cy5
3,"WT (stationary phase, OD 30) vs. WT (exponenti...",Cy5/Cy3
4,"WT (stationary phase, OD 75) vs. WT (exponenti...",Cy5/Cy3
...,...,...
922,WT PctaD::Ptuf PctaE::Ptuf PctaC::Ptuf vs. WT-...,Cy3/Cy5
923,WT PctaD::Ptuf PctaE::Ptuf PctaC::Ptuf vs. WT-...,Cy5/Cy3
924,WT PnrdH::Ptuf PctaE::Ptuf PctaC::Ptuf vs. WT-...,Cy5/Cy3
925,WT PnrdH::Ptuf PctaE::Ptuf PctaC::Ptuf vs. WT-...,Cy5/Cy3


In [17]:
# Rows labelled "Cy3/Cy5" are the experiments that need a dye swap.
swap = dye_label[dye_label['Label'].eq("Cy3/Cy5")]
swap_list = swap['Sample'].tolist()
# All sample names in the dye-label sheet, for the comparisons in later cells.
dye_label_list = dye_label['Sample'].tolist()
print('Number of experiments that need to be swapped:', len(swap_list))

Number of experiments that need to be swapped: 330


In [15]:
# Sample names as they appear in the net-intensity table.
sample_list = net_data["Sample name"].tolist()
# Count the to-be-swapped experiments whose name also occurs in sample_list.
c = sum(1 for name in swap_list if name in sample_list)
print('Out of the experiments that need to be swapped, ', c, ' has can be found in the sample list')

Out of the experiments that need to be swapped,  186  has can be found in the sample list


In [18]:
# Count the dye-label sample names that also occur in sample_list.
d = sum(1 for name in dye_label_list if name in sample_list)
# Both totals are taken from the data instead of being typed into the message,
# so the report stays correct if either input file changes. The spacing
# matches the previous comma-separated print output.
print(f'Out of the {len(dye_label_list)} experiments in the dye_label file,  {d}  has can be found in the sample list ({len(sample_list)} total)')

Out of the 927 experiments in the dye_label file,  521  has can be found in the sample list (1050 total)


Here we noticed an issue: the dye_label file uses different sample names from those in the metadata sheet.
Thus, we need to reconcile the names manually first:
* reduce the 1050 experiments to 927
* make sure the sample names of all 927 experiments match

In [30]:
# Dye-label sample names with no exact match in sample_list; these are the
# names that have to be reconciled by hand.
Rename_list = [name for name in dye_label_list if name not in sample_list]
# Single unnamed column (labelled 0), one row per unmatched name.
Rename_df = pd.DataFrame(Rename_list)
print(Rename_df)

                                                     0
0    WT (70 mM glutamine, w/o (NH4)2SO4 and urea) v...
1    WT (70 mM glutamine, w/o (NH4)2SO4 and urea) v...
2    WT (100 mM pyruvate) vs. WT (100 mM lactate)-rep1
3    WT (100 mM pyruvate) vs. WT (100 mM lactate)-rep2
4    WT (100 mM pyruvate) vs. WT (100 mM lactate)-rep3
..                                                 ...
401  ∆cgtSR5 (w/o CuSO4, OD 20) vs. ∆cgtSR5 (OD 20)...
402  ∆cgtSR5 (w/o CuSO4, OD 20) vs. ∆cgtSR5 (OD 20)...
403  ∆cgtSR5 (w/o CuSO4, OD 20) vs. ∆cgtSR5 (OD 20)...
404  ∆cgtSR5 (w/o CuSO4, OD 20) vs. ∆cgtSR5 (OD 20)...
405  DelAro4-4clpc mufasO pMKEX2-STSah (OD 5, 5 mM ...

[406 rows x 1 columns]


In [34]:
# Write the list of unmatched sample names to Excel, creating the target
# folder if needed.
filepath = Path('/home/amy/Documents/GitHub/modulome-C_Glutamicum_Microarray_clean/1_metadata_file/rename_list.xlsx')
filepath.parent.mkdir(parents=True, exist_ok=True)
Rename_df.to_excel(excel_writer=filepath)