# Pathway Analysis

### Load imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sys

source_dir = '../'
sys.path.append(source_dir)

from functions.filter_and_clean_dataframe import filter_and_clean_dataframe

# Configure the seaborn theme in a single call.
# `sns.set` re-applies a complete theme (including the base style), so a
# separate `sns.set_style("whitegrid")` issued before it would be discarded.
# Passing `style=` here keeps the intended whitegrid base style.
# The `rc` overrides force white backgrounds and black text, ticks, axes
# edges and grid lines.
sns.set(
    style="whitegrid",
    rc={
        "axes.facecolor": "white",
        "axes.edgecolor": "black",
        "grid.color": "black",
        "text.color": "black",
        "xtick.color": "black",
        "ytick.color": "black",
        "axes.labelcolor": "black",
        "figure.facecolor": "white",
    },
)


### Load data

In [2]:
# Map each dataframe name used in this notebook to the processed CSV it is read from.
csv_by_name = {
    'intracellular_df': '../processed_data/intra_gcms.csv',
    'extracellular_1_df': '../processed_data/extra_gcms_1.csv',
    'extracellular_4_df': '../processed_data/extra_gcms_4.csv',
    'intracellular_pos_df': '../processed_data/intra_lcms_pos.csv',
    'intracellular_neg_df': '../processed_data/intra_lcms_neg.csv',
    'extracellular_pos_df': '../processed_data/extra_lcms_pos.csv',
    'extracellular_neg_df': '../processed_data/extra_lcms_neg.csv',
}

# Read every file, using its first column as the row index.
loaded = {name: pd.read_csv(path, index_col=0) for name, path in csv_by_name.items()}

# Bind the individual names that the rest of the notebook refers to.
intracellular_df = loaded['intracellular_df']
extracellular_1_df = loaded['extracellular_1_df']
extracellular_4_df = loaded['extracellular_4_df']
intracellular_pos_df = loaded['intracellular_pos_df']
intracellular_neg_df = loaded['intracellular_neg_df']
extracellular_pos_df = loaded['extracellular_pos_df']
extracellular_neg_df = loaded['extracellular_neg_df']

# Report the (rows, columns) shape of each table as loaded.
for name, frame in loaded.items():
    print(f'{name} has shape {frame.shape}')

# Preview the first rows of the intracellular GC-MS table.
intracellular_df.head()

intracellular_df has shape (25, 320)
extracellular_1_df has shape (26, 247)
extracellular_4_df has shape (53, 279)
intracellular_pos_df has shape (27, 721)
intracellular_neg_df has shape (27, 665)
extracellular_pos_df has shape (80, 520)
extracellular_neg_df has shape (80, 661)


Unnamed: 0,2-aminoadipic acid,2'-Deoxycytidine 5'-monophosphoric acid,3-phosphoglycerate,4-aminobutyric acid (GABA),Adenine,Adipic acid,Arachidic acid,Beta-alanine,Carbonate ion,Citric acid,...,Unknown 220,Unknown 221,Unknown 222,Unknown 223,Unknown 224,Unknown 225,Unknown 226,Unknown 227,Unknown 228,Unknown 229
Tags,High Confidence,High Confidence,High Confidence,High Confidence,High Confidence,High Confidence,High Confidence,High Confidence,High Confidence,High Confidence,...,,,,,,,,,,
in_switchgrass_pvhg6_early_stat_1,206675.073240236,265628.2073572,398155.546965357,461693.997122307,553917.012625364,0.0,39884.14605205,1751763.14357969,1766960.21177728,98325.766858502,...,429928.1,49456.507927,371788.05341,234280.995486,5832.703213,1645217.0,351925.677818,24385.393103,217630.729539,198363.596018
in_switchgrass_pvhg6_early_stat_3,341996.103780714,358143.914807286,448290.918266103,496714.999168787,638854.84035566,0.0,84423.2394990384,2040960.80587233,2032543.136365,115405.400973157,...,523340.0,54438.918937,417617.364083,370183.713935,30622.454946,2054061.0,492993.409911,61659.348813,214347.074793,200325.541072
in_switchgrass_pvhg6_early_stat_2,263361.804510689,316901.937516122,382225.895834572,549619.810167144,802743.275250288,0.0,65523.7689777048,2144767.05673166,2175518.31265378,157969.941265746,...,522363.2,78126.743977,411800.779492,501043.347306,528.570981,2814759.0,305358.590677,0.0,267134.158139,243310.544402
in_switchgrass_wt_early_stat_1,361520.397329639,158448.34341112,322605.073303065,677195.048082466,710902.55546792,0.0,21587.08362939,1742924.76926073,2187317.70629786,73643.828000748,...,1094674.0,68781.500733,389899.251774,53068.346347,41841.431914,459118.5,0.0,0.0,0.0,0.0


### Define a list of labels related to early stationary phase wildtype

In [3]:
# All row labels of the intracellular GC-MS table.
intracellular_labels = list(intracellular_df.index)

# Keep only the wild-type sample rows: drop the 'Tags' row and every row
# whose label mentions the pvhg6 strain.
labels_to_keep = [
    label
    for label in intracellular_labels
    if label != 'Tags' and 'pvhg6' not in label
]

# Build the matching extracellular labels by swapping the leading 'in_' for 'ex_'.
# Only the prefix is rewritten: a plain str.replace would also change any
# 'in_' that happens to appear later inside a label.
in_prefix, ex_prefix = 'in_', 'ex_'
extra_labels_to_keep = [
    ex_prefix + label[len(in_prefix):] if label.startswith(in_prefix) else label
    for label in labels_to_keep
]

display(labels_to_keep)

extra_labels_to_keep

['in_switchgrass_wt_early_stat_1',
 'in_switchgrass_wt_early_stat_2',
 'in_switchgrass_wt_early_stat_3',
 'in_pvhg_wt_early_stat_1a',
 'in_pvhg_wt_early_stat_2a',
 'in_pvhg_wt_early_stat_3a',
 'in_glc_wt_early_stat_1',
 'in_glc_wt_early_stat_2',
 'in_glc_wt_early_stat_3',
 'in_glc/phe_wt_early_stat_1',
 'in_glc/phe_wt_early_stat_2',
 'in_glc/phe_wt_early_stat_3',
 'in_phe_wt_early_stat_1',
 'in_phe_wt_early_stat_2',
 'in_phe_wt_early_stat_3',
 'in_pvhg_wt_early_stat_1b',
 'in_pvhg_wt_early_stat_2b',
 'in_pvhg_wt_early_stat_3b']

['ex_switchgrass_wt_early_stat_1',
 'ex_switchgrass_wt_early_stat_2',
 'ex_switchgrass_wt_early_stat_3',
 'ex_pvhg_wt_early_stat_1a',
 'ex_pvhg_wt_early_stat_2a',
 'ex_pvhg_wt_early_stat_3a',
 'ex_glc_wt_early_stat_1',
 'ex_glc_wt_early_stat_2',
 'ex_glc_wt_early_stat_3',
 'ex_glc/phe_wt_early_stat_1',
 'ex_glc/phe_wt_early_stat_2',
 'ex_glc/phe_wt_early_stat_3',
 'ex_phe_wt_early_stat_1',
 'ex_phe_wt_early_stat_2',
 'ex_phe_wt_early_stat_3',
 'ex_pvhg_wt_early_stat_1b',
 'ex_pvhg_wt_early_stat_2b',
 'ex_pvhg_wt_early_stat_3b']

### Keep only the early stationary phase wild-type rows (unknown metabolites are also filtered out)

In [4]:
# Pass every table through filter_and_clean_dataframe with filter_unknowns=True.
# Intracellular tables are paired with `labels_to_keep`, extracellular tables
# with `extra_labels_to_keep`. The calls run in the listed order and the
# results are bound back to the same names.
(
    intracellular_df,
    extracellular_1_df,
    extracellular_4_df,
    intracellular_pos_df,
    intracellular_neg_df,
    extracellular_pos_df,
    extracellular_neg_df,
) = [
    filter_and_clean_dataframe(frame, labels, filter_unknowns=True)
    for frame, labels in [
        (intracellular_df, labels_to_keep),
        (extracellular_1_df, extra_labels_to_keep),
        (extracellular_4_df, extra_labels_to_keep),
        (intracellular_pos_df, labels_to_keep),
        (intracellular_neg_df, labels_to_keep),
        (extracellular_pos_df, extra_labels_to_keep),
        (extracellular_neg_df, extra_labels_to_keep),
    ]
]

# Report the shape of each table after filtering.
for name, frame in [
    ('intracellular_df', intracellular_df),
    ('extracellular_1_df', extracellular_1_df),
    ('extracellular_4_df', extracellular_4_df),
    ('intracellular_pos_df', intracellular_pos_df),
    ('intracellular_neg_df', intracellular_neg_df),
    ('extracellular_pos_df', extracellular_pos_df),
    ('extracellular_neg_df', extracellular_neg_df),
]:
    print(f'{name} has shape {frame.shape}')

# Preview the filtered intracellular GC-MS table.
intracellular_df.head()

intracellular_df has shape (18, 84)
extracellular_1_df has shape (3, 45)
extracellular_4_df has shape (12, 40)
intracellular_pos_df has shape (18, 209)
intracellular_neg_df has shape (18, 665)
extracellular_pos_df has shape (15, 59)
extracellular_neg_df has shape (15, 76)


Unnamed: 0,2-aminoadipic acid,2'-Deoxycytidine 5'-monophosphoric acid,3-phosphoglycerate,4-aminobutyric acid (GABA),Adenine,Arachidic acid,Beta-alanine,Carbonate ion,Citric acid,Dehydroalanine,...,L-methionine sulfoxide,Maltotriitol,Maltotriose,Melibiose,O-phospho-L-serine,Phlorobenzophenone,Phytanic acid,Porphine,Shikimate-3-phosphate,Turanose
in_switchgrass_wt_early_stat_1,361520.397329639,158448.34341112,322605.073303065,677195.048082466,710902.55546792,21587.08362939,1742924.76926073,2187317.70629786,73643.828000748,317216.41938428,...,57650.95129084,0.0,197109.09808694,154515.482330361,912327.96567334,54886.784330598,53255.6038503064,1567848.0586232,79592.744504499,41592.877786195
in_switchgrass_wt_early_stat_2,227309.55760657,213583.36610502,312837.42938902,575822.106374466,510742.474927335,30589.6316662972,1362157.06455961,2078919.83564604,85094.117589131,336929.76297088,...,33682.73078276,0.0,132211.94520795,178537.388821497,546466.68582922,13966.873772049,54527.47671824,942871.57758861,101591.311751851,125701.00546245
in_switchgrass_wt_early_stat_3,398074.885557321,211939.49668843,296180.63097077,819996.231224398,618736.890582027,43765.31575552,1604466.57278453,2008970.44657791,110175.160063256,515591.82996428,...,68076.39291200001,19900.1574488,130384.73448493,92173.058229043,656821.37515376,26902.583198962,56933.25212629,1269753.69603953,74901.178091305,93064.52323487
in_pvhg_wt_early_stat_1a,15162280.8838296,415540.715236955,271752.376247778,72902.08496758,594252.444514168,30943.57955535,2485640.0470855,1821222.8634703,228090.297121264,220103.450019034,...,65791.95675730999,0.0,46002.15191532,45797.01792878,244917.939423229,30514.717408854,41711.03618778,1191269.79205348,74966.129693052,115508.27512833
in_pvhg_wt_early_stat_2a,18852341.7192045,414698.828302764,282297.43522253,2048534.51426727,631378.749461895,34596.4930274828,3083610.04949007,2065605.09371182,267275.475767908,440350.185467694,...,192791.27122674,87602.9049696,0.0,152281.830320128,42215.306175899,38377.218959318,46684.49842109,1857530.45155866,105883.846141951,0.0


### Get PCA and LDA data

In [None]:
# NOTE(review): get_pca_coordinates / get_lda_coordinates are not imported in
# the visible import cell — confirm they are in scope before running.

# Tables analysed here: the intracellular GC-MS table, `extracellular_4_df`,
# and the two `_pos` LC-MS tables. `extracellular_1_df` and the `_neg` tables
# are not used in this cell.
analysis_frames = (
    intracellular_df,
    extracellular_4_df,
    intracellular_pos_df,
    extracellular_pos_df,
)

# Each call returns a pair, unpacked into `<name>_pca` and `<name>_pca_ls`.
(
    (intracellular_gc_ms_pca, intracellular_gc_ms_pca_ls),
    (extracellular_gc_ms_pca, extracellular_gc_ms_pca_ls),
    (intracellular_lc_ms_pos_pca, intracellular_lc_ms_pos_pca_ls),
    (extracellular_lc_ms_pos_pca, extracellular_lc_ms_pos_pca_ls),
) = [get_pca_coordinates(frame) for frame in analysis_frames]

# Same tables, same order, through the LDA helper.
(
    (intracellular_gc_ms_lda, intracellular_gc_ms_lda_ls),
    (extracellular_gc_ms_lda, extracellular_gc_ms_lda_ls),
    (intracellular_lc_ms_pos_lda, intracellular_lc_ms_pos_lda_ls),
    (extracellular_lc_ms_pos_lda, extracellular_lc_ms_pos_lda_ls),
) = [get_lda_coordinates(frame) for frame in analysis_frames]

# Show both PCA results for the intracellular GC-MS table as a sanity check.
display(intracellular_gc_ms_pca)
intracellular_gc_ms_pca_ls

### Make PCA plots

In [None]:
# Create a 2x2 grid of subplots
fig, axs = plt.subplots(2, 2, figsize=(18, 18))


# Plot each PCA plot in a separate subplot
plot_pca_lda(axs[0, 0], intracellular_gc_ms_pca, intracellular_gc_ms_pca_ls,'Intracellular GC-MS')
plot_pca_lda(axs[0, 1], extracellular_gc_ms_pca, extracellular_gc_ms_pca_ls,'Extracellular GC-MS')
plot_pca_lda(axs[1, 0], intracellular_lc_ms_pos_pca, intracellular_lc_ms_pos_pca_ls,'Intracellular LC-MS')
plot_pca_lda(axs[1, 1], extracellular_lc_ms_pos_pca, extracellular_lc_ms_pos_pca_ls, 'Extracellular LC-MS')

# Add an overall title for the entire figure
fig.suptitle('PCA Plots', fontsize=24)

# Adjust layout to make space for the overall title
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Display the plots
plt.show()

display(intracellular_gc_ms_pca_ls.head(10))
display(extracellular_gc_ms_pca_ls.head(10))
display(intracellular_lc_ms_pos_pca_ls.head(10))
display(intracellular_lc_ms_pos_pca_ls.head(10))

### Make LDA plots

In [None]:
# Create a 2x2 grid of subplots
fig, axs = plt.subplots(2, 2, figsize=(18, 18))

# Plot each PCA plot in a separate subplot
plot_pca_lda(axs[0, 0], intracellular_gc_ms_lda, intracellular_gc_ms_lda_ls, 'Intracellular GC-MS')
plot_pca_lda(axs[0, 1], extracellular_gc_ms_lda, extracellular_gc_ms_lda_ls, 'Extracellular GC-MS')
plot_pca_lda(axs[1, 0], intracellular_lc_ms_pos_lda, intracellular_lc_ms_pos_lda_ls, 'Intracellular LC-MS')
plot_pca_lda(axs[1, 1], extracellular_lc_ms_pos_lda, extracellular_lc_ms_pos_lda_ls, 'Extracellular LC-MS')

# Add an overall title for the entire figure
fig.suptitle('LDA Plots', fontsize=24)

# Adjust layout to make space for the overall title
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Display the plots
plt.show()

display(intracellular_gc_ms_lda_ls.head(10))
display(extracellular_gc_ms_lda_ls.head(10))
display(intracellular_lc_ms_pos_lda_ls.head(10))
display(intracellular_lc_ms_pos_lda_ls.head(10))

## Make PCA and LDA with only known metabolites

### Make dataframes without the unknown metabolites

In [None]:
# NOTE(review): the same filter_and_clean_dataframe calls, with the same label
# lists and filter_unknowns=True, already run on these frames earlier in the
# notebook, so this cell may be a no-op — confirm whether it is still needed.
def _known_only(frame, labels):
    """Apply filter_and_clean_dataframe to `frame` for `labels` with filter_unknowns=True."""
    return filter_and_clean_dataframe(frame, labels, filter_unknowns=True)


# Intracellular tables use `labels_to_keep`; extracellular tables use
# `extra_labels_to_keep`.
intracellular_df = _known_only(intracellular_df, labels_to_keep)
extracellular_1_df = _known_only(extracellular_1_df, extra_labels_to_keep)
extracellular_4_df = _known_only(extracellular_4_df, extra_labels_to_keep)
intracellular_pos_df = _known_only(intracellular_pos_df, labels_to_keep)
intracellular_neg_df = _known_only(intracellular_neg_df, labels_to_keep)
extracellular_pos_df = _known_only(extracellular_pos_df, extra_labels_to_keep)
extracellular_neg_df = _known_only(extracellular_neg_df, extra_labels_to_keep)

# Report the resulting shape of every table.
shape_report = {
    'intracellular_df': intracellular_df,
    'extracellular_1_df': extracellular_1_df,
    'extracellular_4_df': extracellular_4_df,
    'intracellular_pos_df': intracellular_pos_df,
    'intracellular_neg_df': intracellular_neg_df,
    'extracellular_pos_df': extracellular_pos_df,
    'extracellular_neg_df': extracellular_neg_df,
}
for name, frame in shape_report.items():
    print(f'{name} has shape {frame.shape}')

# Preview the intracellular GC-MS table.
intracellular_df.head()

### Get LDA and PCA data for filtered dataframes

In [None]:
# NOTE(review): get_pca_coordinates / get_lda_coordinates are not imported in
# the visible import cell — confirm they are in scope before running.

# Recompute the PCA results for the four plotted tables.
# Each call returns a pair, bound to `<name>_pca` and `<name>_pca_ls`.
intracellular_gc_ms_pca, intracellular_gc_ms_pca_ls = get_pca_coordinates(intracellular_df)
# The second target must be `extracellular_gc_ms_pca_ls`: the plotting cell
# reads that name, and the earlier misspelling (`..._pca_s`) left it stale.
extracellular_gc_ms_pca, extracellular_gc_ms_pca_ls = get_pca_coordinates(extracellular_4_df)
intracellular_lc_ms_pos_pca, intracellular_lc_ms_pos_pca_ls = get_pca_coordinates(intracellular_pos_df)
extracellular_lc_ms_pos_pca, extracellular_lc_ms_pos_pca_ls = get_pca_coordinates(extracellular_pos_df)

# Recompute the LDA results for the same four tables.
intracellular_gc_ms_lda, intracellular_gc_ms_lda_ls = get_lda_coordinates(intracellular_df)
extracellular_gc_ms_lda, extracellular_gc_ms_lda_ls = get_lda_coordinates(extracellular_4_df)
intracellular_lc_ms_pos_lda, intracellular_lc_ms_pos_lda_ls = get_lda_coordinates(intracellular_pos_df)
extracellular_lc_ms_pos_lda, extracellular_lc_ms_pos_lda_ls = get_lda_coordinates(extracellular_pos_df)

# Show both PCA results for the intracellular GC-MS table as a sanity check.
display(intracellular_gc_ms_pca)
intracellular_gc_ms_pca_ls

### Make PCA Plots

In [None]:
# Create a 2x2 grid of subplots
fig, axs = plt.subplots(2, 2, figsize=(18, 18))


# Plot each PCA plot in a separate subplot
plot_pca_lda(axs[0, 0], intracellular_gc_ms_pca, intracellular_gc_ms_pca_ls, 'Intracellular GC-MS')
plot_pca_lda(axs[0, 1], extracellular_gc_ms_pca, extracellular_gc_ms_pca_ls, 'Extracellular GC-MS')
plot_pca_lda(axs[1, 0], intracellular_lc_ms_pos_pca, intracellular_lc_ms_pos_pca_ls, 'Intracellular LC-MS')
plot_pca_lda(axs[1, 1], extracellular_lc_ms_pos_pca, extracellular_lc_ms_pos_pca_ls, 'Extracellular LC-MS')

# Add an overall title for the entire figure
fig.suptitle('PCA Plots', fontsize=24)

# Adjust layout to make space for the overall title
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Display the plots
plt.show()

### Make LDA Plots

In [None]:
# Create a 2x2 grid of subplots
fig, axs = plt.subplots(2, 2, figsize=(18, 18))

# Plot each PCA plot in a separate subplot
plot_pca_lda(axs[0, 0], intracellular_gc_ms_lda, intracellular_gc_ms_lda_ls, 'Intracellular GC-MS')
plot_pca_lda(axs[0, 1], extracellular_gc_ms_lda, extracellular_gc_ms_lda_ls, 'Extracellular GC-MS')
plot_pca_lda(axs[1, 0], intracellular_lc_ms_pos_lda, intracellular_lc_ms_pos_lda_ls, 'Intracellular LC-MS')
plot_pca_lda(axs[1, 1], extracellular_lc_ms_pos_lda, extracellular_lc_ms_pos_lda_ls, 'Extracellular LC-MS')

# Add an overall title for the entire figure
fig.suptitle('LDA Plots', fontsize=24)

# Adjust layout to make space for the overall title
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Display the plots
plt.show()

display(intracellular_gc_ms_lda_ls.head(10))
display(extracellular_gc_ms_lda_ls.head(10))
display(intracellular_lc_ms_pos_lda_ls.head(10))
display(intracellular_lc_ms_pos_lda_ls.head(10))