In [None]:
# Baseline multi-label classifier: two ReLU hidden layers (512 -> 256), each
# followed by 30% dropout, on top of 1024-dimensional embeddings.
model = models.Sequential()
model.add(layers.Input(shape=(1024,)))
for hidden_units in (512, 256):
    model.add(layers.Dense(hidden_units, activation='relu'))
    model.add(layers.Dropout(0.3))
# One independent sigmoid unit per class known to the label binarizer.
model.add(layers.Dense(len(mlb.classes_), activation='sigmoid'))

# Binary cross-entropy treats every label as its own yes/no problem; the
# multi-label AUC metric is reported during training.
auc_metric = tf.keras.metrics.AUC(name="AUC", multi_label=True)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc_metric])

# Fit for 10 epochs, evaluating on the validation split after each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_val, y_val),
)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, regularizers

# Define the model: same topology as the baseline, plus L2 weight decay on the
# hidden layers.
# The output width is taken from the label binarizer instead of a hard-coded
# 1400, so it always matches the label matrix and the later decoding step that
# indexes `mlb.classes_` by output column.
num_labels = len(mlb.classes_)
model = tf.keras.Sequential([
    layers.Input(shape=(1024,)),  # Input layer for 1024-dimensional embeddings
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001)),  # First dense layer with L2 regularization
    layers.Dropout(0.3),  # Dropout layer to prevent overfitting
    layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)),  # Second dense layer with L2 regularization
    layers.Dropout(0.3),  # Dropout layer
    layers.Dense(num_labels, activation='sigmoid')  # One sigmoid unit per label (multi-label output)
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

# Train the model
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=128,
                    validation_data=(X_val, y_val))


In [None]:
# Read the held-out test embeddings from disk.
test_data = np.load('test_data.npy')

# Score them with the current model, then binarise: a label is switched on
# when its score reaches the cutoff (inclusive comparison).
preds = model.predict(test_data)
threshold = 0.5  # Adjust threshold if needed based on validation performance
pred_labels = np.greater_equal(preds, threshold).astype(int)


In [None]:
# Decode multi-hot predictions back to ICD10 codes.
# Each row becomes one ';'-separated string of upper-cased codes.
submission = []
for pred in pred_labels:
    # Upper-case BEFORE sorting so the order is lexicographic in the form that
    # is actually written out (sorting first could leave mixed-case codes out
    # of order after upper-casing).
    codes = sorted(mlb.classes_[i].upper() for i, val in enumerate(pred) if val == 1)
    submission.append(';'.join(codes))

import pandas as pd

# Generate sequential 1-based IDs (1 .. number of test samples)
num_test_samples = len(pred_labels)  # Length of the test predictions
ids = range(1, num_test_samples + 1)

# Create the submission DataFrame
submission_df = pd.DataFrame({'id': ids, 'labels': submission})

# Save the clean submission file (no pandas index column)
submission_df.to_csv('submission.csv', index=False)



In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers

# Custom F2 loss function
def f2_loss(y_true, y_pred):
    """Differentiable ("soft") F2 loss, averaged over output columns.

    The predicted probabilities are used directly as soft counts instead of
    being thresholded at 0.5. A hard threshold followed by a cast has no
    gradient, so a loss built on it cannot update the network's weights.

    Args:
        y_true: Multi-hot targets; cast to the dtype of ``y_pred`` so integer
            label matrices can be multiplied with float predictions.
        y_pred: Predicted probabilities with the same shape as ``y_true``.

    Returns:
        Scalar ``1 - mean(F2)``, where F2 is computed per column from sums
        taken over axis 0.
    """
    y_true = tf.cast(y_true, y_pred.dtype)
    # Soft confusion-matrix entries, one value per output column.
    tp = tf.reduce_sum(y_true * y_pred, axis=0)
    fp = tf.reduce_sum((1 - y_true) * y_pred, axis=0)
    fn = tf.reduce_sum(y_true * (1 - y_pred), axis=0)
    # F-beta with beta=2: (1 + 4) * tp / ((1 + 4) * tp + 4 * fn + fp).
    # The epsilon keeps columns without any positives from dividing by zero.
    f2 = (5 * tp) / (5 * tp + 4 * fn + fp + 1e-8)
    return 1 - tf.reduce_mean(f2)  # 1 - F2 to minimize loss

# Model architecture with increased complexity
def create_model(num_labels=1400, input_dim=1024):
    """Build and compile the deeper multi-label classifier.

    Three L2-regularised ReLU blocks (1024 -> 512 -> 256 units), each followed
    by batch normalisation and dropout, feed a sigmoid output layer.

    Args:
        num_labels: Number of sigmoid output units. Defaults to 1400 (the
            previously hard-coded value); pass the width of the label matrix
            (e.g. ``len(label_index)``) to keep the two in sync.
        input_dim: Size of each input vector. Defaults to 1024.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer, ``f2_loss``
        as the loss and ``binary_accuracy`` as the reported metric.
    """
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.Dropout(0.5),

        layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.Dropout(0.5),

        layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.Dropout(0.3),  # lighter dropout on the narrowest hidden layer

        layers.Dense(num_labels, activation='sigmoid')  # Multi-label output
    ])
    model.compile(optimizer='adam', loss=f2_loss, metrics=['binary_accuracy'])
    return model

# Read the two training shards (embeddings plus their code files) and the
# test embeddings. The code files have no header row.
embeddings_1, embeddings_2 = (
    np.load(npy_path) for npy_path in ('embeddings_1.npy', 'embeddings_2.npy')
)
labels_1, labels_2 = (
    pd.read_csv(txt_path, header=None) for txt_path in ('icd_codes_1.txt', 'icd_codes_2.txt')
)
test_embeddings = np.load('test_data.npy')

# Stack both shards row-wise; the label rows are concatenated in the same
# order and re-indexed from zero.
X_train = np.vstack((embeddings_1, embeddings_2))
y_train = pd.concat([labels_1, labels_2], ignore_index=True)

# Label vocabulary: every distinct ';'-separated token found in column 0,
# sorted, and mapped to its position (which becomes its multi-hot column).
all_codes = ";".join(y_train[0].values).split(";")
unique_labels = sorted(set(all_codes))
label_index = dict(zip(unique_labels, range(len(unique_labels))))

def labels_to_multi_hot(labels, label_index):
    """Encode ';'-separated label strings as a multi-hot integer matrix.

    Args:
        labels: Sized iterable of strings, each holding labels joined by ';'.
        label_index: Mapping from label to column position.

    Returns:
        Integer array of shape ``(len(labels), len(label_index))`` with a 1
        wherever a row mentions a label present in ``label_index``. Labels
        missing from the mapping are ignored.
    """
    encoded = np.zeros((len(labels), len(label_index)), dtype=int)
    for row, joined in enumerate(labels):
        # Columns of the known labels mentioned in this row.
        columns = [label_index[token] for token in joined.split(";") if token in label_index]
        if columns:
            encoded[row, columns] = 1
    return encoded

# Encode every training row's label string as a multi-hot vector.
y_train_multi_hot = labels_to_multi_hot(y_train[0], label_index)

# Split data for validation (80-20 split).
# The split is positional: the last 20% of the stacked rows become the
# validation set and nothing is shuffled.
# NOTE(review): if the two shards differ in distribution, the validation set
# is not representative - confirm whether a shuffled split is wanted.
split_idx = int(0.8 * len(X_train))
X_val, y_val = X_train[split_idx:], y_train_multi_hot[split_idx:]
X_train, y_train = X_train[:split_idx], y_train_multi_hot[:split_idx]

# Create the model and fail fast, with a readable message, if its output layer
# does not have one unit per encoded label; otherwise the mismatch only
# surfaces as a shape error from inside the training step.
model = create_model()
if model.output_shape[-1] != y_train.shape[1]:
    raise ValueError(
        f"Model has {model.output_shape[-1]} output units but the label matrix "
        f"has {y_train.shape[1]} columns; size the output layer to the number of labels."
    )

# Train for 20 epochs, evaluating on the validation split after each epoch.
history = model.fit(X_train, y_train, epochs=20, batch_size=128, validation_data=(X_val, y_val))

In [None]:
from sklearn.metrics import f1_score
import random

# Select a random sample from the validation set.
# The size is clamped to the number of validation rows because random.sample
# raises ValueError when asked for more items than the population holds.
sample_size = min(1000, len(X_val))  # Adjust based on memory capacity
# A dedicated, seeded generator keeps the sample (and every threshold tuned on
# it) reproducible across kernel restarts without touching the global RNG.
sample_rng = random.Random(0)
sample_indices = sample_rng.sample(range(len(X_val)), sample_size)
X_val_sample = X_val[sample_indices]
y_val_sample = y_val[sample_indices]

# Predict on the sample
val_preds_sample = model.predict(X_val_sample)

import numpy as np
from sklearn.metrics import fbeta_score

# Per-label threshold search: penalize higher thresholds and limit the search range
thresholds = np.arange(0.1, 0.5, 0.05)  # Restrict to lower values
best_thresholds = []

for i in range(y_val_sample.shape[1]):
    f2_scores = []
    for thresh in thresholds:
        preds = (val_preds_sample[:, i] > thresh).astype(int)

        # F2 of the positive class for this single label.
        # average='binary' is required here: with a 1-D binary target,
        # average='micro' pools both classes (0 and 1) and collapses to plain
        # accuracy. zero_division=0 scores labels without any positive (true
        # or predicted) as 0 without emitting a warning per call.
        f2 = fbeta_score(y_val_sample[:, i], preds, beta=2, average='binary', zero_division=0)

        # Apply a small linear penalty to thresholds above 0.3
        penalty = 0.01 * (thresh - 0.3) if thresh > 0.3 else 0
        f2_scores.append(f2 - penalty)

    # argmax returns the first maximum, so ties resolve to the lowest threshold.
    best_thresh = thresholds[np.argmax(f2_scores)]
    best_thresholds.append(best_thresh)


In [None]:
# Raw model outputs for the test embeddings. No threshold is applied in this
# cell; the scores are only stored in `test_preds`.
test_preds = model.predict(test_embeddings)



In [None]:
# Show the list of selected thresholds (one entry per label column).
print(best_thresholds)

In [None]:
from skopt import gp_minimize
from skopt.space import Real
from sklearn.metrics import fbeta_score

# Objective for the threshold search: negated micro-F2 on the validation sample
def f2_threshold_objective(thresh_values):
    """Return the negated micro-averaged F2 for one threshold vector.

    Args:
        thresh_values: Sequence of thresholds, compared against the columns of
            ``val_preds_sample`` by broadcasting.

    Returns:
        ``-micro_F2`` of the binarised predictions against ``y_val_sample``;
        negated because the optimiser minimises its objective.
    """
    cutoffs = np.array(thresh_values)
    # Strictly-greater comparison: a score equal to its cutoff stays 0.
    binarized = np.greater(val_preds_sample, cutoffs).astype(int)
    score = fbeta_score(y_val_sample, binarized, beta=2, average='micro')
    return -score

# Search space: one real-valued dimension per label, each bounded to [0.1, 0.5].
n_labels = y_val_sample.shape[1]
space = [Real(0.1, 0.5, name=f'thresh_{label_pos}') for label_pos in range(n_labels)]

# Bayesian optimisation of the negated micro-F2 with a fixed seed.
# NOTE(review): only 20 evaluations are spent on a space with one dimension
# per label - confirm this budget is intended.
opt_result = gp_minimize(
    func=f2_threshold_objective,
    dimensions=space,
    n_calls=20,
    random_state=0,
)

# The best point found is a list holding one threshold per label.
best_thresholds = opt_result.x
print(best_thresholds)

In [None]:
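# Disabled alternative: decode with the per-label cutoffs in `best_thresholds`
# instead of a single global cutoff.
# test_labels = []
# for i in range(test_preds.shape[0]):
#     labels = [unique_labels[j] for j in range(test_preds.shape[1]) if test_preds[i, j] > best_thresholds[j]]
#     test_labels.append(";".join(sorted(labels)))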

# Score the test embeddings and keep, per row, every label whose score is
# strictly above the single global cutoff of 0.49.
test_preds = model.predict(test_embeddings)
test_labels = []
for score_row in test_preds:
    chosen = [unique_labels[col] for col, score in enumerate(score_row) if score > 0.49]
    chosen.sort()  # labels are written in sorted order
    test_labels.append(";".join(chosen))

# Assemble the submission table with 1-based sequential ids.
submission_ids = range(1, len(test_labels) + 1)
submission_df = pd.DataFrame({'id': submission_ids, 'labels': test_labels})

# Write the CSV without the pandas index column.
submission_df.to_csv('submission.csv', index=False)