In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, GaussianNoise, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2 

# Utility function to convert dictionary to string (for logging purposes)
def dict_to_str(d):
    """Render a mapping as one ``key=value, key=value`` string.

    Every item is formatted as ``key=value`` using the default string
    conversion of both parts; items are joined with ``", "`` in the
    mapping's iteration order. An empty mapping gives an empty string.
    Nested containers are not expanded, they appear via their own ``str``.
    """
    pieces = []
    for key, value in d.items():
        pieces.append(f"{key}={value}")
    return ", ".join(pieces)

In [None]:
# Encode the class labels for the neural network.
# `y` is not defined in this section; it must already exist in the kernel.
# LabelEncoder maps each distinct label to an integer id, and to_categorical
# expands those ids into a one-hot matrix (one column per encoded class).
encoder = LabelEncoder()
cell_types_encoded = encoder.fit_transform(y)
# One-hot targets; the training loop indexes this row-wise per fold and uses
# its column count as the width of the softmax output layer.
cell_types_categorical = to_categorical(cell_types_encoded)


In [None]:
# Cross-validation: shuffled K-fold with a fixed seed, plus the list that
# the training loop fills with one test accuracy per fold.
n_splits = 5
kf = KFold(shuffle=True, random_state=42, n_splits=n_splits)
scores = list()

# L2 penalty strength applied to the kernels of the hidden Dense layers
l2_reg = 0.001

# Expression pathway, in layer order: noise -> dense -> dropout -> dense
exp_p0 = dict(GaussianNoise=0.1)
exp_p1 = dict(Dense=64)
exp_p2 = dict(Dropout=0.3)
exp_p3 = dict(Dense=32)

# Embedding pathway, same layer order with wider Dense layers
emb_p0 = dict(GaussianNoise=0.1)
emb_p1 = dict(Dense=256)
emb_p2 = dict(Dropout=0.3)
emb_p3 = dict(Dense=128)

# Arguments forwarded to model.compile() in the training loop
compliers = dict(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)


In [None]:
# Train and evaluate a freshly built two-input network on every K-fold split.
#
# Reads from the surrounding notebook: `kf`, `X_transformed_NN`,
# `cell_types_categorical`, the `exp_p*` / `emb_p*` layer settings, `l2_reg`
# and `compliers`. Leaves behind `scores` (one test accuracy per fold) and,
# from the LAST fold only, `model`, `X_test_expr`, `X_test_emb` and `y_test`.
#
# Assumes X_transformed_NN is a dense 2-D NumPy array (it is indexed with
# integer arrays and `[:, -1]`) -- TODO confirm against the cell that builds it.

# Start from an empty list so that re-running this cell does not append a
# second set of fold accuracies to values left over from an earlier run,
# which would silently skew the average computed afterwards.
scores = []

# Width of the softmax layer; constant across folds, so computed once.
n_classes = cell_types_categorical.shape[1]

# Training-loop settings (unchanged values, named for readability).
n_epochs = 30
batch_size = 32

for fold, (train_index, test_index) in enumerate(kf.split(X_transformed_NN), start=1):
    # Split data into training and testing sets for this fold
    X_train, X_test = X_transformed_NN[train_index], X_transformed_NN[test_index]
    y_train, y_test = cell_types_categorical[train_index], cell_types_categorical[test_index]

    # Separate expression and embeddings: the last column is fed to the
    # expression input (as a single-feature column), all others to the
    # embedding input.
    X_train_expr = X_train[:, -1].reshape(-1, 1)
    X_test_expr = X_test[:, -1].reshape(-1, 1)
    X_train_emb = X_train[:, :-1]
    X_test_emb = X_test[:, :-1]

    # Define model inputs
    input_expr = Input(shape=(1,), name='input_expr')
    input_emb = Input(shape=(X_train_emb.shape[1],), name='input_emb')

    # Build pathways for expression and embedding inputs
    noisy_expr = GaussianNoise(exp_p0["GaussianNoise"])(input_expr)
    noisy_emb = GaussianNoise(emb_p0["GaussianNoise"])(input_emb)

    pathway_expr = Dense(exp_p1["Dense"], activation='relu', kernel_regularizer=l2(l2_reg))(noisy_expr)
    pathway_expr = Dropout(exp_p2["Dropout"])(pathway_expr)
    pathway_expr = Dense(exp_p3["Dense"], activation='relu', kernel_regularizer=l2(l2_reg))(pathway_expr)

    pathway_emb = Dense(emb_p1["Dense"], activation='relu', kernel_regularizer=l2(l2_reg))(noisy_emb)
    pathway_emb = Dropout(emb_p2["Dropout"])(pathway_emb)
    pathway_emb = Dense(emb_p3["Dense"], activation='relu', kernel_regularizer=l2(l2_reg))(pathway_emb)

    # Concatenate pathways and define output layer
    concatenated = Concatenate()([pathway_expr, pathway_emb])
    output = Dense(n_classes, activation='softmax')(concatenated)

    # Compile model (a new model per fold, so no weights carry over)
    model = Model(inputs=[input_expr, input_emb], outputs=output)
    model.compile(optimizer=compliers['optimizer'], loss=compliers['loss'], metrics=compliers['metrics'])

    if fold == 1:
        # Only print the model summary for the first fold
        model.summary()

    print(f"\nTraining on fold {fold}...")
    # validation_split holds out part of the fold's training rows for
    # monitoring only; the fold's test rows are never seen during fit.
    model.fit(
        [X_train_expr, X_train_emb],
        y_train,
        validation_split=0.2,
        epochs=n_epochs,
        batch_size=batch_size,
        verbose=1,
    )

    # Evaluate the model on the held-out fold; with a single compiled metric
    # evaluate() returns [loss, accuracy].
    loss, accuracy = model.evaluate([X_test_expr, X_test_emb], y_test, verbose=1)
    print(f"Fold {fold} Test Accuracy: {accuracy}\n")
    scores.append(accuracy)


In [None]:
# Summarise the cross-validation run and package the results.
#
# Reads `scores`, `n_splits`, `model`, `X_test_expr`, `X_test_emb`, `y_test`
# and the `subject_*` labels from the notebook namespace, and produces
# `statistics_NeNe`, `statistics_NeNe_DF` and `Subject_Process_Dict`.
#
# NOTE(review): "Accuracy" below is the mean over all folds, whereas the
# classification report, Precision/Recall/F1 and "Predictions" are computed
# with the model and test split left over from the LAST fold only. Confirm
# this mix is intended before comparing these columns with each other.

# Calculate average accuracy
average_score = np.mean(scores)
print(f"\nAverage Test Accuracy across {n_splits} folds: {average_score}")

# Model configuration recorded alongside the results. It is assembled from
# the hyperparameter objects the training loop actually read, so the logged
# values cannot drift from the ones used to build the model. The 'relu'
# activation is passed as a literal when the Dense layers are built, so it
# is added here explicitly.
# NOTE(review): l2_reg, the epoch count and the batch size are not part of
# this record.
nn_configurations = {
    "exp_path": {
        "exp_p0": dict(exp_p0),
        "exp_p1": {**exp_p1, "activation": "relu"},
        "exp_p2": dict(exp_p2),
        "exp_p3": {**exp_p3, "activation": "relu"},
    },
    "emb_path": {
        "emb_p0": dict(emb_p0),
        "emb_p1": {**emb_p1, "activation": "relu"},
        "emb_p2": dict(emb_p2),
        "emb_p3": {**emb_p3, "activation": "relu"},
    },
    "compliers": dict(compliers),
}

# Convert configuration dictionary to string representation
config_str = dict_to_str(nn_configurations)

# Generate predictions for the test dataset (last fold's held-out rows)
predictions = model.predict([X_test_expr, X_test_emb])

# Convert predictions from one-hot encoded to class indices
y_pred = np.argmax(predictions, axis=1)
y_true = np.argmax(y_test, axis=1)

# Generate classification report. zero_division=0 matches the scalar metric
# calls below: classes with no predicted/true samples score 0 without
# emitting an UndefinedMetricWarning.
report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
statistics_NeNe = pd.DataFrame(report).transpose()

# Annotate the DataFrame with additional metadata
applications_label = f"{subject_outlier}_{subject_autoencoder}_{subject_normalization}_{subject_dimension}"
statistics_NeNe['Parameters'] = config_str
statistics_NeNe['Applications'] = applications_label
statistics_NeNe['Applications_Condition'] = f"{subject_label}_NeNe"

# Create a one-row summary DataFrame; precision/recall/F1 are weighted by
# class support.
statistics_NeNe_DF = pd.DataFrame({
    "Applications": [applications_label],
    "Applications_Condition": [subject_label],
    "Model": ["NeNe"],
    "Parameters": [config_str],
    "Accuracy": [average_score],
    "Precision": [precision_score(y_true, y_pred, average='weighted', zero_division=0)],
    "Recall": [recall_score(y_true, y_pred, average='weighted', zero_division=0)],
    "F1": [f1_score(y_true, y_pred, average='weighted', zero_division=0)]
})

# Package and store results
Subject_Process_Dict = {
    "Model": model,
    "Predictions": y_pred,
    "Statistics": statistics_NeNe,
    "Statistics_DF": statistics_NeNe_DF
}

print("Neural network process has finished")
