In [None]:
%pip install numpy
%pip install matplotlib
%pip install pandas
%pip install seaborn
%pip install scikit-learn

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Read the mmc1 workbook as-is: no row is promoted to column names and no
# column is used as the index.
df = pd.read_excel(
    io='./data/mmc1.xlsx',
    header=None,
    index_col=None,
)
df

In [None]:
# Labels of the populations to keep; the row filter below matches them
# against column 1 of df.
cell_types = [
    # NK.* labels ending in .BM
    "NK.27+11b-.BM",
    "NK.27+11b+.BM",
    "NK.27-11b+.BM",
    # NK.* labels ending in .Sp
    "NK.27+11b-.Sp",
    "NK.27+11b+.Sp",
    "NK.27-11b+.Sp",
    # ILC2 / ILC3 labels ending in .SI
    "ILC2.SI",
    "ILC3.NKp46-CCR6-.SI",
    "ILC3.NKp46+.SI",
    "ILC3.CCR6+.SI",
]

# Keep only the rows whose value in column 1 is one of the selected cell types.
# .copy() makes filtered_df an independent frame rather than a slice of df, so
# later cells can add columns to it without a SettingWithCopyWarning.
filtered_df = df[df[1].isin(cell_types)].copy()
filtered_df

In [None]:
# 3. Build the numeric feature matrix used for scaling, clustering and PCA.
# Leave out the 'Cluster' label column if a later cell has already added it to
# filtered_df; otherwise re-running this cell would feed the labels back in as
# a feature.
feature_source = filtered_df.drop(columns='Cluster', errors='ignore')
# Remove the first seven columns, then coerce what is left to numbers
# (anything that cannot be parsed becomes NaN).
numeric_df = feature_source.iloc[:, 7:].apply(pd.to_numeric, errors='coerce')
# Drop rows with no numeric value in any remaining column. Doing this after
# the first seven columns are gone means a numeric value in one of those
# columns cannot keep an otherwise empty row alive.
numeric_df = numeric_df.dropna(how='all')
numeric_df

In [None]:
# 4. Standardize the data: fit the scaler on numeric_df, then transform that
# same frame with the fitted scaler.
scaler = StandardScaler()
scaler.fit(numeric_df)
scaled_data = scaler.transform(numeric_df)

In [None]:
# K-Means clustering (3 clusters, fixed seed) on the standardized matrix.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(scaled_data)
# Attach the labels to filtered_df by row label rather than by position:
# scaled_data has one row per row of numeric_df, and numeric_df may hold fewer
# rows than filtered_df after its dropna step. Rows without a label get NaN.
# assign() returns a new frame, so nothing is written into a slice of df and
# re-running this cell simply replaces the column.
cluster_labels = pd.Series(kmeans.labels_, index=numeric_df.index, name='Cluster')
filtered_df = filtered_df.assign(Cluster=cluster_labels)

filtered_df['Cluster']

In [None]:
# 5. Apply PCA: project the standardized data onto two principal components.
# random_state is pinned to the same seed the K-Means cells use. scikit-learn's
# randomized and arpack SVD solvers are stochastic, and the default
# svd_solver='auto' can select the randomized one for large inputs, so without
# a seed the coordinates could differ between runs.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(scaled_data)

In [None]:
# Wrap the two PCA coordinate columns in a DataFrame for plotting.
pca_columns = ['PC1', 'PC2']
pca_df = pd.DataFrame(pca_result, columns=pca_columns)

In [None]:
# Display the PCA coordinate table built in the previous cell.
pca_df

In [None]:

# Scatter the rows of pca_df in PC1/PC2 space; each axis label reports the
# share of variance explained by that component.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=pca_df, x="PC1", y="PC2", ax=ax)
ax.set_title("PCA of Gene Expression")
ax.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
ax.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
fig.tight_layout()
plt.show()

In [None]:
# Refit K-Means with five clusters (same seed) on the standardized data.
# This rebinds `kmeans`, so from here on it refers to the 5-cluster model.
kmeans = KMeans(n_clusters=5, random_state=42).fit(scaled_data)
# Store the cluster label of each row next to its PCA coordinates.
pca_df['Cluster'] = kmeans.labels_
# Same PC1/PC2 scatter as before, now coloured by cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=pca_df,
    x="PC1",
    y="PC2",
    hue="Cluster",
    palette="viridis",
    ax=ax,
)
ax.set_title("PCA of Gene Expression with K-Means Clustering")
ax.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
ax.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
ax.legend(title='Cluster')
fig.tight_layout()
plt.show()

In [None]:
# Display the K-Means cluster label stored for each row of pca_df.
pca_df['Cluster']