In [None]:
%pip install pyarrow

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Reset both names up front so that a failed load never leaves data from a
# previous run of this cell bound to them.
vst_transformed_counts_df = None
merged_metadata = None

# --- Load VST transformed counts data ---
# Reads the counts table and the sample metadata from parquet files under
# data/. Both are required by the PCA steps that follow.
try:
    vst_transformed_counts_df = pd.read_parquet("data/vst_transformed_counts.pq")
    merged_metadata = pd.read_parquet("data/merged_metadata.pq")
except FileNotFoundError:
    print("Error: VST transformed counts or metadata file not found. Please check the file paths.")
    # Re-raise instead of carrying on: with either frame still None the next
    # step would fail with an unrelated AttributeError, hiding the real cause.
    # The original exception also names the path that could not be found.
    raise
else:
    print("VST transformed counts loaded successfully.")

# --- Transpose the VST data for PCA ---
# PCA.fit_transform treats every row as one observation, so the table is
# flipped first; the message below labels the result as samples x genes.
data_for_pca = vst_transformed_counts_df.T
print(f"Shape of data for PCA (samples x genes): {data_for_pca.shape}")


# --- Perform PCA ---
# Two components are enough for the 2D scatter plot produced afterwards.
pca = PCA(n_components=2)

# Fit PCA to the data and project every row onto the two components
principal_components = pca.fit_transform(data_for_pca)

# Wrap the projection in a DataFrame that keeps the row labels of the
# transposed table as its index, so it can be joined to the metadata.
pca_df = pd.DataFrame(data=principal_components,
                      columns=['PC1', 'PC2'],
                      index=data_for_pca.index)

# --- Attach sample metadata ---
# The merge below is an inner join on the index, so any sample whose label is
# absent from merged_metadata would disappear without notice. Report those
# samples explicitly before they are dropped.
samples_without_metadata = pca_df.index[~pca_df.index.isin(merged_metadata.index)]
if not samples_without_metadata.empty:
    print(
        f"Warning: {len(samples_without_metadata)} sample(s) have no metadata "
        f"and will be dropped: {list(samples_without_metadata)}"
    )

pca_df_with_metadata = pca_df.merge(merged_metadata, left_index=True, right_index=True)

# An empty result means the two indexes share no labels at all; fail here with
# a clear message rather than drawing a blank plot later.
if pca_df_with_metadata.empty:
    raise ValueError(
        "No PCA sample matched the index of merged_metadata; "
        "check that merged_metadata is indexed by sample name."
    )

# --- Plot the PCA results ---
# Create the figure and its axes explicitly and draw everything through them.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=pca_df_with_metadata,
    x='PC1',
    y='PC2',
    hue='condition',  # one colour per value of the 'condition' column
    s=100,            # marker size
    alpha=0.8,        # slight transparency so overlapping points stay visible
    ax=ax,
)

# Each axis label reports the share of variance its component explains
explained_variance_ratio = pca.explained_variance_ratio_
ax.set(
    xlabel=f'Principal Component 1 ({explained_variance_ratio[0]*100:.2f}%)',
    ylabel=f'Principal Component 2 ({explained_variance_ratio[1]*100:.2f}%)',
    title='PCA Plot of VST Transformed Counts',
)
ax.grid(True)
ax.legend(title='Condition')
fig.tight_layout()
plt.show()