In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import anndata as ad
import numpy as np
import pandas as pd
import umap
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

### Load Data

In [4]:
input_dir = "/Users/apple/Desktop/KB/data"
adata_train = ad.read_h5ad(input_dir+'/BiddyData/Biddy_train.h5ad')
adata_test = ad.read_h5ad(input_dir+'/BiddyData/Biddy_test.h5ad')

train_labels = adata_train.obs["clone_id"].to_numpy()
test_labels = adata_test.obs["clone_id"].to_numpy()

print(train_labels.shape, test_labels.shape)

(5893,) (641,)


## supUMAP embedding

In [5]:
# Extract the data matrix and labels
train_data = adata_train.X
test_data = adata_test.X


# labels = adata_train.obs['clone_id'].values

# Initialize UMAP with a higher number of neighbors for supervised learning
reducer = umap.UMAP(n_neighbors=15, n_components=10)



In [6]:
# Fit and transform the data with the labels
X_train = reducer.fit_transform(train_data, y=train_labels)

In [7]:
# get the test embeddings
X_test = reducer.transform(test_data)

In [8]:
X_train.shape, X_test.shape

((5893, 10), (641, 10))

### Generate the Composition pair

In [9]:
adata_train.obs["reprogramming_day"].unique()

['15', '28', '21', '12', '9', '6']
Categories (6, object): ['6', '9', '12', '15', '21', '28']

In [10]:
adata_train.obs["cell_type"].unique()

['Fibroblast', 'iEP', 'Ambiguous']
Categories (3, object): ['Ambiguous', 'Fibroblast', 'iEP']

In [11]:
def composit_pair_gen(X_train, adata_train):
    # train_labels = adata_train.obs["clone_id"].to_numpy()
    # print("train_labels.shape:", train_labels.shape)

    ### generate the labels
    adata_28 = adata_train[adata_train.obs["reprogramming_day"] == "28"]
    print("adata_28.shape:", adata_28.shape)

    # Initialize an empty dictionary to store the cell type distributions
    clone_cell_type_distribution = {}

    # Get the unique lineage
    unique_clone_ids = adata_28.obs["clone_id"].unique()

    # Loop through each unique lineage
    for clone_id in unique_clone_ids:
        # Filter the data to get only rows with the current clone_id
        clone_data = adata_28.obs[adata_28.obs["clone_id"] == clone_id]
        
        # Get the distribution of cell types in the current clone_id
        cell_type_distribution = clone_data["cell_type"].value_counts(normalize=True)
        
        # Round each percentage to 4 decimal places and convert to a dictionary
        cell_type_distribution = cell_type_distribution.round(4).to_dict()
        
        # Store the rounded distribution in the main dictionary
        clone_cell_type_distribution[clone_id] = cell_type_distribution

    # Print the resulting dictionary for verification
    i = 0
    for clone_id, distribution in clone_cell_type_distribution.items():
        print(f"Clone ID: {clone_id}, Cell Type Distribution: {distribution}")
        i+=1
        if i ==3:
            break


    # Step 1: Get embeddings for Day 12 cells
    day12_mask = adata_train.obs["reprogramming_day"] == "12" 
    X_train_day12 = X_train[day12_mask.values] 
    print(f"Day 12 embeddings shape: {X_train_day12.shape}")

    # Step 2: Get the clone labels for Day 12 cells
    clone_labels_day12 = adata_train.obs.loc[day12_mask, "clone_id"].to_numpy()

    # Step 3: Initialize y_train_prob matrix to store the probabilities
    n_classes = len(adata_train.obs["cell_type"].unique())
    y_train_prob = np.zeros((X_train_day12.shape[0], n_classes))

    # Step 4: Assign the distributions from clone_cell_type_distribution to each cell based on its clone_id
    for i, clone_id in enumerate(clone_labels_day12):
        if clone_id in clone_cell_type_distribution:
            # Get the distribution for the clone
            distribution = clone_cell_type_distribution[clone_id]
            
            # Ensure the order of cell types matches ['iEP', 'undifferentiated', 'Ambiguous', 'Fibroblast']
            y_train_prob[i, 0] = distribution.get('iEP', 0)  # Default to 0 if not present
            y_train_prob[i, 1] = distribution.get('Ambiguous', 0)  # Default to 0 if not present
            y_train_prob[i, 2] = distribution.get('Fibroblast', 0)  # Default to 0 if not present

    # Print the shape and first few examples of y_train_prob
    print(f"y_train_prob shape: {y_train_prob.shape}")
    print(f"First 5 rows of y_train_prob:\n{y_train_prob[:5]}")


    X_train_day12 = torch.tensor(X_train_day12, dtype=torch.float32)

    # Example soft labels: 5 samples, each with a probability distribution over 3 classes
    y_train_prob = torch.tensor(y_train_prob, dtype=torch.float32)

    return X_train_day12, y_train_prob

In [12]:
X_train_day12, y_train_prob = composit_pair_gen(X_train, adata_train)
X_test_day12, y_test_prob = composit_pair_gen(X_test, adata_test)

adata_28.shape: (2196, 2000)
Clone ID: 493.0, Cell Type Distribution: {'iEP': 0.8739, 'Ambiguous': 0.0943, 'Fibroblast': 0.0318}
Clone ID: 2352.0, Cell Type Distribution: {'Fibroblast': 0.5761, 'Ambiguous': 0.3043, 'iEP': 0.1196}
Clone ID: 487.0, Cell Type Distribution: {'iEP': 0.6833, 'Ambiguous': 0.3, 'Fibroblast': 0.0167}
Day 12 embeddings shape: (876, 10)
y_train_prob shape: (876, 3)
First 5 rows of y_train_prob:
[[0.8739 0.0943 0.0318]
 [0.8739 0.0943 0.0318]
 [0.8739 0.0943 0.0318]
 [0.8739 0.0943 0.0318]
 [0.8739 0.0943 0.0318]]
adata_28.shape: (255, 2000)
Clone ID: 493.0, Cell Type Distribution: {'iEP': 0.9565, 'Ambiguous': 0.0326, 'Fibroblast': 0.0109}
Clone ID: 2352.0, Cell Type Distribution: {'Ambiguous': 0.5333, 'Fibroblast': 0.4667, 'iEP': 0.0}
Clone ID: 487.0, Cell Type Distribution: {'Ambiguous': 1.0, 'Fibroblast': 0.0, 'iEP': 0.0}
Day 12 embeddings shape: (91, 10)
y_train_prob shape: (91, 3)
First 5 rows of y_train_prob:
[[0.9565 0.0326 0.0109]
 [0.9565 0.0326 0.0109]
 

In [13]:
import torch
import torch.nn as nn

class SoftLabelNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SoftLabelNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)  # Raw output before softmax
        return out


class Trainer:
    def __init__(self, model, optimizer, criterion, X_train, y_train_prob, num_epochs=10000, lr=0.01):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.X_train = X_train
        self.y_train_prob = y_train_prob
        self.num_epochs = num_epochs
        self.lr = lr

    def train(self):
        for epoch in range(self.num_epochs):
            # Forward pass
            outputs = self.model(self.X_train)
            
            # Apply log_softmax to get log probabilities
            outputs_log_prob = torch.log_softmax(outputs, dim=1)
            
            # Calculate the KL divergence loss
            loss = self.criterion(outputs_log_prob, self.y_train_prob)
            
            # Backward pass and optimization
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            
            # Print loss every 50 epochs
            if (epoch+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{self.num_epochs}], Loss: {loss.item():.4f}')
    
    def predict(self, X_test):
        # Set the model to evaluation mode
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(X_test)
            # Apply softmax to get predicted probabilities
            probabilities = torch.softmax(outputs, dim=1)
        return probabilities

    def evaluate_kl_divergence(self, X_test, y_test_prob):
        # Get the predicted log probabilities
        predicted_probabilities_log = torch.log_softmax(self.model(X_test), dim=1)
        
        # Calculate KL divergence between predicted and true probabilities
        kl_divergence = self.criterion(predicted_probabilities_log, y_test_prob)
        return kl_divergence.item()



In [16]:
# Initialize the model, optimizer, and KLDivLoss function
input_size = X_train_day12.shape[1]
hidden_size = 10
output_size = y_train_prob.shape[1]

model = SoftLabelNN(input_size, hidden_size, output_size)
criterion = nn.KLDivLoss(reduction='batchmean')  # KLDivLoss for comparing distributions
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Instantiate the Trainer class and start training
trainer = Trainer(model, optimizer, criterion, X_train_day12, y_train_prob, num_epochs=500)
trainer.train()

kl_divergence = trainer.evaluate_kl_divergence(X_test_day12, y_test_prob)
print(f"KL Divergence on test set: {kl_divergence:.4f}")

Epoch [100/500], Loss: 0.2502
Epoch [200/500], Loss: 0.2046
Epoch [300/500], Loss: 0.1994
Epoch [400/500], Loss: 0.1980
Epoch [500/500], Loss: 0.1967
KL Divergence on test set: 0.3217
