In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# Load the first sheet of each workbook into a DataFrame.
# Both workbooks are expected to list their samples in the same order.
predictors_df = pd.read_excel('../Predictors_Cleaned.xlsx', sheet_name=0)  # rows: samples, columns: predictors
outcomes_df = pd.read_excel('../Outcomes_Cleaned.xlsx', sheet_name=0)  # rows: samples, columns: mechanisms

# Strip the metadata columns so only model inputs and targets remain
X = predictors_df.drop(columns=['SAMPLE NAME'])
Y = outcomes_df.drop(columns=['MECHANISM'])  # complete outcome frame, meant to be iterated column by column
feature_names, gene_names = X.columns, Y.columns

# Plain Python list of the outcome column labels, used for iteration
columns = list(Y.columns)

In [None]:
# Hold out a test set; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=0,
)

# Principal component regression: standardize -> 2-component PCA -> linear
# regression, fitted on the training split (Pipeline.fit returns the pipeline)
pcr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LinearRegression(),
).fit(x_train, y_train)

# Keep a handle on the fitted PCA step of the pipeline
pca = pcr.named_steps["pca"]

In [None]:
# Visualization
# Project the test samples with every pipeline step before the regressor
# (scaler + PCA). The PCA step was fitted on standardized data, so calling
# pca.transform on the raw frame would give wrong coordinates.
projected_test = pcr[:-1].transform(x_test)

# Force 2-D (n_samples, n_outcomes) arrays so a single outcome column can be
# selected whether there is one outcome or many.
y_true = np.asarray(y_test).reshape(len(y_test), -1)
y_pred = np.asarray(pcr.predict(x_test)).reshape(len(y_test), -1)

outcome_idx = 0  # outcome column to display; change to inspect another one
outcome_label = str(y_test.columns[outcome_idx])

# One panel per PCA component. squeeze=False always yields a 2-D array of
# Axes, so each panel is drawn on its own Axes object rather than calling
# plotting methods on the array itself.
n_components = projected_test.shape[1]
fig, axes = plt.subplots(1, n_components, figsize=(10, 3), squeeze=False)
for component_idx, ax in enumerate(axes[0]):
    component_scores = projected_test[:, component_idx]
    ax.scatter(
        component_scores, y_true[:, outcome_idx], alpha=0.3, label="PCA ground truth"
    )
    ax.scatter(
        component_scores, y_pred[:, outcome_idx], alpha=0.3, label=" PCR predictions"
    )
    ax.set(
        xlabel=f"Projected data onto PCA component {component_idx + 1}",
        ylabel=outcome_label,
        title="PCR / PCA",
    )
    ax.legend()
fig.tight_layout()