In [None]:
# Start-of-notebook marker: printed as soon as this notebook begins executing.
print('operationnn.ipynb working')

In [None]:
# Flatten a dict into one underscore-separated string
def dict_to_str(d):
    """Join every key/value pair of ``d`` as ``key_value`` with underscores.

    Pairs are emitted in the dict's iteration order. The resulting string is
    printed and also returned; an empty dict yields an empty string.
    """
    pieces = []
    for key, value in d.items():
        pieces.append(f'{key}_{value}')
    result = '_'.join(pieces)
    print(result)
    return result

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.backend import clear_session
from tensorflow.keras.layers import Input, Dense, Concatenate, GaussianNoise, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical


In [None]:
# Pull the model inputs and the labels out of `subject_data`. That frame is not
# defined in the cells visible here, so it must already exist in the kernel.
# To run on the full dataset instead, read the same two columns from
# `subject_data_full`.
features = subject_data['Expression_Embeddings'].tolist()
cell_types = subject_data['Cell_Type'].values


In [None]:
# Train and evaluate a two-input dense classifier with K-fold cross-validation.
#
# Reads:  `features` (each item is indexed as [0] -> expression, [1] -> embedding)
#         and `cell_types` (one class label per item).
# Leaves: `scores` (per-fold test accuracy), `average_score` (their mean), and
#         `model`, `X_test_expr`, `X_test_emb`, `y_test` as bound by the LAST
#         fold, which the prediction/statistics cell further down relies on.

# Unpack expression levels and embeddings into separate arrays
expressions = np.array([feature[0] for feature in features])
embeddings = np.array([feature[1] for feature in features])

# Input widths are taken from the data for both branches. A 1-D `expressions`
# array (one scalar per sample) keeps the width of 1 that was used before; a
# 2-D array uses its column count instead of failing on a hard-coded shape.
expr_dim = 1 if expressions.ndim == 1 else expressions.shape[1]
emb_dim = embeddings.shape[1]

# Encode categorical data: labels -> integer ids -> one-hot rows
encoder = LabelEncoder()
cell_types_encoded = encoder.fit_transform(cell_types)
cell_types_categorical = to_categorical(cell_types_encoded)

# Step 2: Setting up cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
scores = []

# Parameters for model layers (expression branch)
exp_p0 = {"GaussianNoise": 0.1}
exp_p1 = {"Dense": 64}
exp_p2 = {"Dropout": 0.3}
exp_p3 = {"Dense": 32}

# Parameters for model layers (embedding branch)
emb_p0 = {"GaussianNoise": 0.1}
emb_p1 = {"Dense": 256}
emb_p2 = {"Dropout": 0.3}
emb_p3 = {"Dense": 128}
l2_reg = 0.001  # L2 penalty applied to the kernels of the hidden Dense layers

# Model compilation parameters
compliers = {"optimizer": "adam", "loss": "categorical_crossentropy", "metrics": ["accuracy"]}

# Step 3: Model building and training in K-fold cross-validation
for fold, (train_index, test_index) in enumerate(kf.split(expressions), start=1):
    # A fresh model is built on every fold. Release the global Keras state
    # left by earlier models first, otherwise it keeps growing across folds.
    clear_session()

    # Split data into training and testing sets for this fold
    X_train_expr, X_test_expr = expressions[train_index], expressions[test_index]
    X_train_emb, X_test_emb = embeddings[train_index], embeddings[test_index]
    y_train, y_test = cell_types_categorical[train_index], cell_types_categorical[test_index]

    # Define model inputs
    input_expr = Input(shape=(expr_dim,), name='input_expr')
    input_emb = Input(shape=(emb_dim,), name='input_emb')

    # Build pathways for expression and embedding inputs
    noisy_expr = GaussianNoise(exp_p0["GaussianNoise"])(input_expr)
    noisy_emb = GaussianNoise(emb_p0["GaussianNoise"])(input_emb)

    pathway_expr = Dense(exp_p1["Dense"], activation='relu', kernel_regularizer=l2(l2_reg))(noisy_expr)
    pathway_expr = Dropout(exp_p2["Dropout"])(pathway_expr)
    pathway_expr = Dense(exp_p3["Dense"], activation='relu', kernel_regularizer=l2(l2_reg))(pathway_expr)

    pathway_emb = Dense(emb_p1["Dense"], activation='relu', kernel_regularizer=l2(l2_reg))(noisy_emb)
    pathway_emb = Dropout(emb_p2["Dropout"])(pathway_emb)
    pathway_emb = Dense(emb_p3["Dense"], activation='relu', kernel_regularizer=l2(l2_reg))(pathway_emb)

    # Concatenate pathways and define output layer (one softmax unit per class)
    concatenated = Concatenate()([pathway_expr, pathway_emb])
    output = Dense(cell_types_categorical.shape[1], activation='softmax')(concatenated)

    # Compile model
    model = Model(inputs=[input_expr, input_emb], outputs=output)
    model.compile(optimizer=compliers['optimizer'], loss=compliers['loss'], metrics=compliers['metrics'])

    if fold == 1:
        # Only print the model summary for the first fold
        model.summary()

    print(f"\nTraining on fold {fold}...")
    # NOTE(review): Keras takes `validation_split` from the END of the arrays it
    # is given, and KFold returns the training indices in ascending order, so
    # the validation slice is the tail of the original row order. Confirm this
    # is acceptable if the rows are ordered (e.g. grouped by cell type).
    model.fit([X_train_expr, X_train_emb], y_train, validation_split=0.2, epochs=30, batch_size=32, verbose=1)

    # Step 4: Evaluate the model on the held-out fold
    loss, accuracy = model.evaluate([X_test_expr, X_test_emb], y_test, verbose=1)
    print(f"Fold {fold} Test Accuracy: {accuracy}\n")
    scores.append(accuracy)

# Step 5: Reporting results
average_score = np.mean(scores)
print(f"\nAverage Test Accuracy across {n_splits} folds: {average_score}")


In [None]:
# Descriptive record of the network hyperparameters, grouped by branch
# ("exp_path", "emb_path") plus the compile settings ("compliers").
# This is a hand-maintained copy of the values set in the cross-validation
# cell: the training code does not read it, and `l2_reg` is not recorded here,
# so keep the two in sync manually.
nn_configurations = dict(
    exp_path=dict(
        exp_p0=dict(GaussianNoise=0.1),
        exp_p1=dict(Dense=64, activation="relu"),
        exp_p2=dict(Dropout=0.3),
        exp_p3=dict(Dense=32, activation="relu"),
    ),
    emb_path=dict(
        emb_p0=dict(GaussianNoise=0.1),
        emb_p1=dict(Dense=256, activation="relu"),
        emb_p2=dict(Dropout=0.3),
        emb_p3=dict(Dense=128, activation="relu"),
    ),
    compliers=dict(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    ),
)

# Flatten a two-level configuration dict into one comma-separated string
def config_to_str(config_dict):
    """Render the entries of every nested dict as ``key: value``, joined by ", ".

    ``config_dict`` maps group names to dicts. The group names themselves are
    not part of the output; only the inner key/value pairs are, in iteration
    order. An empty ``config_dict`` yields an empty string.
    """
    return ', '.join(
        f"{key}: {value}"
        for _group, entries in config_dict.items()
        for key, value in entries.items()
    )

# Build the flat parameter string for `nn_configurations` and show it as the
# cell output; `config_str` is stored in the statistics frames later on.
config_str = config_to_str(config_dict=nn_configurations)
config_str

In [None]:
# Step 1: Model prediction
# `model`, `X_test_expr`, `X_test_emb` and `y_test` are the values left bound
# by the final iteration of the cross-validation loop, so the predictions and
# every metric derived from them below describe the LAST fold only.
predictions = model.predict([X_test_expr, X_test_emb])

# Step 2: Convert softmax outputs / one-hot targets to class indices
y_pred = np.argmax(predictions, axis=1)  # Predicted class indices
y_true = np.argmax(y_test, axis=1)       # True class indices

# `y_true_indices` is a second name for the same array; it is kept so that
# anything referring to it keeps working, but it is no longer recomputed.
y_true_indices = y_true

# Step 3: Classification report
# `zero_division=0` matches the precision/recall/F1 calls below: classes with
# no predicted (or no true) samples score 0 without emitting a warning.
report = classification_report(y_true_indices, y_pred, output_dict=True, zero_division=0)

# Step 4: Convert the classification report to a DataFrame (one row per report entry)
statistics_NeNe = pd.DataFrame(report).transpose()

# Step 5: Annotate the DataFrame with additional metadata
# The label is assembled from the outlier / autoencoder / normalization /
# dimension settings; the `subject_*` names are not defined in the cells
# visible here and must already exist in the kernel namespace.
applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"

# Add model parameters and application settings to the DataFrame
statistics_NeNe['Parameters'] = config_str
statistics_NeNe['Applications'] = applications_label
statistics_NeNe['Applications_Condition'] = f"{subject_label}_NeNe"


# Step 6: Summary DataFrame (single row)
# NOTE(review): "Accuracy" is the mean test accuracy over all folds
# (`average_score`), whereas "Precision", "Recall" and "F1" are computed from
# the last fold's predictions only, so the columns do not cover the same
# samples. Confirm this mix is intended.
statistics_NeNe_DF = pd.DataFrame({
    "Applications": [applications_label],
    "Applications_Condition": [f"{subject_label}"],
    "Model": ["NeNe"],  # Model name or identifier
    "Parameters": [config_str],
    "Accuracy": [average_score],
    "Precision": [precision_score(y_true_indices, y_pred, average='weighted', zero_division=0)],
    "Recall": [recall_score(y_true_indices, y_pred, average='weighted', zero_division=0)],
    "F1": [f1_score(y_true_indices, y_pred, average='weighted', zero_division=0)]
})

# Step 7: Package and store results
# Bundle the last-fold model, its predicted class indices and both statistics
# frames under one name for later use.
Subject_Process_Dict = {
    "Model": model,
    "Predictions": y_pred,
    "Statistics": statistics_NeNe,
    "Statistics_DF": statistics_NeNe_DF
}


In [None]:
# End-of-notebook marker: printed once every cell above has run.
print('operationnn.ipynb has finished')