In [None]:
import numpy as np
import pandas as pd
import torch

# Assumes `subject_data_full` is a DataFrame that already holds both embedding columns.

# Pull each embedding column out as a list of NumPy arrays (one array per row).
cell_type_embeddings = subject_data_full["Cell_Type_Embeddings_Outlier"].apply(np.array).tolist()
gene_marker_embeddings = subject_data_full["Gene_Marker_Embeddings_Outlier"].apply(np.array).tolist()

# Join the cell-type and gene-marker embedding of every row into one vector.
concat_embeddings = [
    np.concatenate([cell_emb, gene_emb])
    for cell_emb, gene_emb in zip(cell_type_embeddings, gene_marker_embeddings)
]

# Verify the shapes before stacking: np.stack needs at least one array and every
# array must have the same shape. Failing here gives a message that names the
# offending shapes instead of NumPy's generic stacking error.
if not concat_embeddings:
    raise ValueError("No embeddings to concatenate: 'subject_data_full' has no rows.")
concat_shapes = {emb.shape for emb in concat_embeddings}
if len(concat_shapes) != 1:
    raise ValueError(
        f"Concatenated embeddings have inconsistent shapes: {sorted(concat_shapes)}"
    )

# Stack into a single ndarray first and only then build the tensor.
# NOTE(review): presumably this is what avoids PyTorch's "creating a tensor from a
# list of numpy.ndarrays is extremely slow" warning mentioned by the original author.
concat_embeddings_matrix = np.stack(concat_embeddings)
concat_embeddings_tensor = torch.tensor(concat_embeddings_matrix).float()


In [None]:
# Autoencoder combined dropout
    @staticmethod
    def combined_dropout(
        target_matrix,
        model_type="combined_dropout",
        variable_lr=[1e-3, 1e-4],
        batch_size=128,
        num_epochs=5,
        noise_factor=[0.3, 0.4],
        dropout_rate=[0.1, 0.2],
        input_outer_layer=1536,
        inner_layer=64,
        output_outer_layer=100,
    ):
        """Run one combined-dropout autoencoder pass over ``target_matrix``.

        Builds an ``AutoencoderManager``, loads ``target_matrix`` into it, calls
        ``train_combined_dropout()`` and returns whatever
        ``get_reconstructed_data_combined_dropout(target_matrix)`` produces.
        A "Control Point" line is printed before and after every step.

        Parameters
        ----------
        target_matrix
            Data handed to ``load_data`` and to the reconstruction call.
        model_type
            Forwarded unchanged to ``AutoencoderManager``.
        variable_lr, noise_factor, dropout_rate
            Sequences of candidate values; only the element at index 0 of each
            is used here. The list defaults are never mutated.
        batch_size, num_epochs, inner_layer, output_outer_layer
            Forwarded unchanged to ``AutoencoderManager``.
        input_outer_layer
            Passed to ``AutoencoderManager`` as ``outer_layer``.

        Returns
        -------
        The return value of ``get_reconstructed_data_combined_dropout``.
        """
        print("Control Point | staticmethod combined_dropout | 4.0")
        # Collected in the same order the constructor receives them.
        manager_settings = {
            "model_type": model_type,
            "lr": variable_lr[0],
            "batch_size": batch_size,
            "num_epochs": num_epochs,
            "noise_factor": noise_factor[0],
            "dropout_rate": dropout_rate[0],
            "outer_layer": input_outer_layer,
            "inner_layer": inner_layer,
            "output_outer_layer": output_outer_layer,
        }
        manager = AutoencoderManager(**manager_settings)
        print("Control Point | staticmethod combined_dropout | 4.1")

        manager.load_data(data_matrix=target_matrix)
        print("Control Point | staticmethod combined_dropout | 4.2")

        manager.train_combined_dropout()
        print("Control Point | staticmethod combined_dropout | 4.3")

        reconstructed = manager.get_reconstructed_data_combined_dropout(target_matrix)
        print("Control Point | staticmethod combined_dropout | 4.4")

        return reconstructed

In [None]:
## Good one 

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Each 'Expression_Embeddings' entry is read as a pair: index 0 is taken as the
# gene-expression value and index 1 as the text-embedding vector.
features = subject_data_full['Expression_Embeddings'].tolist()
expressions = np.array([pair[0] for pair in features])
embeddings = np.array([pair[1] for pair in features])

# Labels: cell-type names -> integer codes -> one-hot rows.
cell_types = subject_data_full['Cell_Type'].values
encoder = LabelEncoder()
cell_types_encoded = encoder.fit_transform(cell_types)
cell_types_categorical = to_categorical(cell_types_encoded)

# Hold out 20% of the rows; all three arrays are split with the same shuffle.
split_arrays = train_test_split(
    expressions,
    embeddings,
    cell_types_categorical,
    test_size=0.2,
    random_state=42,
)
X_train_expr, X_test_expr, X_train_emb, X_test_emb, y_train, y_test = split_arrays

embedding_dim = X_train_emb.shape[1]
num_classes = cell_types_categorical.shape[1]

# Two inputs: a single expression value and the embedding vector.
input_expr = Input(shape=(1,), name='input_expr')
input_emb = Input(shape=(embedding_dim,), name='input_emb')

# One dense pathway per input, merged before the softmax classifier.
pathway_expr = Dense(32, activation='relu')(input_expr)
pathway_emb = Dense(128, activation='relu')(input_emb)
concatenated = Concatenate()([pathway_expr, pathway_emb])
output = Dense(num_classes, activation='softmax')(concatenated)

model = Model(inputs=[input_expr, input_emb], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Fit on the training split; Keras carves a further 20% off it for validation.
model.fit(
    x=[X_train_expr, X_train_emb],
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.2,
)

# Score on the held-out test split.
loss, accuracy = model.evaluate([X_test_expr, X_test_emb], y_test)
print(f"Test accuracy: {accuracy}")


In [None]:
def flatten_features(row):
    """Flatten an ``(expression, embeddings)`` pair into one flat feature list.

    Parameters
    ----------
    row : sequence
        Indexable pair; ``row[0]`` is the expression value and ``row[1]`` is an
        iterable of embedding values (list, tuple or NumPy array).

    Returns
    -------
    list
        ``[expression, emb_0, emb_1, ...]`` with the expression first.
    """
    expression, embeddings = row[0], row[1]
    # Unpack rather than ``[expression] + embeddings``: with a NumPy array that
    # expression broadcasts an element-wise addition instead of concatenating,
    # and with a tuple it raises TypeError. For plain lists the result is the same.
    return [expression, *embeddings]

# Imports for this cell, gathered ahead of their first use.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

# One flat feature matrix per column variant; every row becomes
# [expression, emb_0, emb_1, ...] via flatten_features.
# NOTE(review): the Out/Nor/Aut/DR suffixes presumably flag outlier handling,
# normalization, autoencoder and dimension reduction - confirm with the author.
Subject_Out_1_Nor_0_Aut_0_DR_0 = np.array(subject_data_full['Expression_Embeddings'].apply(flatten_features).tolist())
Subject_Out_1_Nor_1_Aut_0_DR_0 = np.array(subject_data_full['Expression_Embeddings_Normalized'].apply(flatten_features).tolist())
Subject_Out_1_Nor_1_Aut_1_DR_0 = np.array(subject_data_full['Expression_Embeddings_Autoencoder'].apply(flatten_features).tolist())
Subject_Out_1_Nor_1_Aut_1_DR_1 = np.array(subject_data_full['Expression_Embeddings_Dimension'].apply(flatten_features).tolist())

# Model input: the raw 'Expression_Embeddings' column, flattened with the same
# helper as above (built as its own array so later edits to X cannot alias
# Subject_Out_1_Nor_0_Aut_0_DR_0).
X = np.array(subject_data_full['Expression_Embeddings'].apply(flatten_features).tolist())

# Encode Cell_Type labels to integers and then to one-hot vectors.
le = LabelEncoder()
y_integers = le.fit_transform(subject_data_full['Cell_Type'])
y_onehot = to_categorical(y_integers)

# Fully connected classifier: 128 -> 64 -> one softmax unit per class.
model = Sequential()
model.add(Dense(128, input_shape=(X.shape[1],), activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(y_onehot.shape[1], activation='softmax'))  # Output layer

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Hold out 20% for testing; a further 20% of the training part is used for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.2, random_state=42)

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f"Test accuracy: {test_accuracy:.4f}")

# Displayed as the cell's output.
Subject_Out_1_Nor_0_Aut_0_DR_0

In [None]:
# Everything this cell needs, so it also runs on a fresh kernel.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Concatenate, Dense, Dropout, GaussianNoise, Input
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

# Each 'Expression_Embeddings' entry is read as a pair: index 0 feeds the
# expression input, index 1 feeds the embedding input.
features = subject_data_full['Expression_Embeddings'].tolist()
expressions = np.array([feature[0] for feature in features])
embeddings = np.array([feature[1] for feature in features])

# Labels: cell-type names -> integer codes -> one-hot rows.
cell_types = subject_data_full['Cell_Type'].values
encoder = LabelEncoder()
cell_types_encoded = encoder.fit_transform(cell_types)
cell_types_categorical = to_categorical(cell_types_encoded)

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

scores = []  # test accuracy of every fold

for fold, (train_index, test_index) in enumerate(kf.split(expressions), start=1):
    # A new model is built for every fold; clearing Keras' global state first
    # keeps memory use and auto-generated layer names from growing per fold.
    K.clear_session()

    X_train_expr, X_test_expr = expressions[train_index], expressions[test_index]
    X_train_emb, X_test_emb = embeddings[train_index], embeddings[test_index]
    y_train, y_test = cell_types_categorical[train_index], cell_types_categorical[test_index]

    input_expr = Input(shape=(1,), name='input_expr')
    input_emb = Input(shape=(X_train_emb.shape[1],), name='input_emb')

    # Gaussian noise on both inputs (active only during training).
    noisy_expr = GaussianNoise(0.1)(input_expr)
    noisy_emb = GaussianNoise(0.1)(input_emb)

    # Expression pathway: 64 -> dropout -> 32.
    pathway_expr = Dense(64, activation='relu')(noisy_expr)
    pathway_expr = Dropout(0.3)(pathway_expr)
    pathway_expr = Dense(32, activation='relu')(pathway_expr)

    # Embedding pathway: 256 -> dropout -> 128.
    pathway_emb = Dense(256, activation='relu')(noisy_emb)
    pathway_emb = Dropout(0.3)(pathway_emb)
    pathway_emb = Dense(128, activation='relu')(pathway_emb)

    concatenated = Concatenate()([pathway_expr, pathway_emb])

    output = Dense(cell_types_categorical.shape[1], activation='softmax')(concatenated)

    model = Model(inputs=[input_expr, input_emb], outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    if fold == 1:
        # Only print the model summary for the first fold
        model.summary()

    print(f"\nTraining on fold {fold}...")
    # 20% of this fold's training rows are set aside by Keras for validation.
    model.fit([X_train_expr, X_train_emb], y_train, validation_split=0.2, epochs=30, batch_size=32, verbose=1)

    loss, accuracy = model.evaluate([X_test_expr, X_test_emb], y_test, verbose=1)
    print(f"Fold {fold} Test Accuracy: {accuracy}\n")
    scores.append(accuracy)

average_score = np.mean(scores)
print(f"\nAverage Test Accuracy across {n_splits} folds: {average_score}")


In [None]:
# NOTE(review): this cell runs the same 5-fold experiment as the preceding cell,
# with the same data, model and seed; keep only one of the two.

# Everything this cell needs, so it also runs on a fresh kernel.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Concatenate, Dense, Dropout, GaussianNoise, Input
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

# Each 'Expression_Embeddings' entry is read as a pair: index 0 feeds the
# expression input, index 1 feeds the embedding input.
features = subject_data_full['Expression_Embeddings'].tolist()
expressions = np.array([feature[0] for feature in features])
embeddings = np.array([feature[1] for feature in features])

# Labels: cell-type names -> integer codes -> one-hot rows.
cell_types = subject_data_full['Cell_Type'].values
encoder = LabelEncoder()
cell_types_encoded = encoder.fit_transform(cell_types)
cell_types_categorical = to_categorical(cell_types_encoded)

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

scores = []  # test accuracy of every fold

for fold, (train_index, test_index) in enumerate(kf.split(expressions), start=1):
    # A new model is built for every fold; clearing Keras' global state first
    # keeps memory use and auto-generated layer names from growing per fold.
    K.clear_session()

    X_train_expr, X_test_expr = expressions[train_index], expressions[test_index]
    X_train_emb, X_test_emb = embeddings[train_index], embeddings[test_index]
    y_train, y_test = cell_types_categorical[train_index], cell_types_categorical[test_index]

    input_expr = Input(shape=(1,), name='input_expr')
    input_emb = Input(shape=(X_train_emb.shape[1],), name='input_emb')

    # Gaussian noise on both inputs (active only during training).
    noisy_expr = GaussianNoise(0.1)(input_expr)
    noisy_emb = GaussianNoise(0.1)(input_emb)

    # Expression pathway: 64 -> dropout -> 32.
    pathway_expr = Dense(64, activation='relu')(noisy_expr)
    pathway_expr = Dropout(0.3)(pathway_expr)
    pathway_expr = Dense(32, activation='relu')(pathway_expr)

    # Embedding pathway: 256 -> dropout -> 128.
    pathway_emb = Dense(256, activation='relu')(noisy_emb)
    pathway_emb = Dropout(0.3)(pathway_emb)
    pathway_emb = Dense(128, activation='relu')(pathway_emb)

    concatenated = Concatenate()([pathway_expr, pathway_emb])

    output = Dense(cell_types_categorical.shape[1], activation='softmax')(concatenated)

    model = Model(inputs=[input_expr, input_emb], outputs=output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    if fold == 1:
        # Only print the model summary for the first fold
        model.summary()

    print(f"\nTraining on fold {fold}...")
    # 20% of this fold's training rows are set aside by Keras for validation.
    model.fit([X_train_expr, X_train_emb], y_train, validation_split=0.2, epochs=30, batch_size=32, verbose=1)

    loss, accuracy = model.evaluate([X_test_expr, X_test_emb], y_test, verbose=1)
    print(f"Fold {fold} Test Accuracy: {accuracy}\n")
    scores.append(accuracy)

average_score = np.mean(scores)
print(f"\nAverage Test Accuracy across {n_splits} folds: {average_score}")
