# In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import pandas as pd
from tensorflow.keras.layers import LeakyReLU

# Input locations (two training chunks plus the test set) and the output CSV
embedding_file_1 = '/kaggle/input/ml-chall/embeddings_1.npy'
label_file_1 = '/kaggle/input/ml-chall/icd_codes_1.txt'
embedding_file_2 = '/kaggle/input/ml-chall/embeddings_2.npy'
label_file_2 = '/kaggle/input/ml-chall/icd_codes_2.txt'
test_embedding_file = '/kaggle/input/ml-chall/test_data.npy'
output_file = 'predictions.csv'

# Read both embedding chunks from disk, chunk 1 first
embeddings_1, embeddings_2 = (
    np.load(chunk_path) for chunk_path in (embedding_file_1, embedding_file_2)
)

# Stack the chunks row-wise so chunk 1 rows precede chunk 2 rows
embeddings = np.vstack((embeddings_1, embeddings_2))

# Label file parsing
def load_labels(file_path):
    """Parse a label file into one list of codes per line.

    Each line has its surrounding whitespace stripped, every single-quote
    character removed, and is then split on ';'. A blank line therefore
    yields ``['']``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of code strings found on that line.
    """
    parsed = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip().replace("'", "")
            parsed.append(cleaned.split(';'))
    return parsed

# Parse both label files and concatenate them, chunk 1 first
labels_1, labels_2 = load_labels(label_file_1), load_labels(label_file_2)
labels = [*labels_1, *labels_2]

# Turn the per-sample code lists into a multi-hot matrix;
# `mlb` is kept so predictions can be mapped back to codes later
mlb = MultiLabelBinarizer()
multi_hot_labels = mlb.fit_transform(labels)

# Hold out 0.8% of the samples for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    embeddings,
    multi_hot_labels,
    test_size=0.008,
    random_state=42,
)

# Custom F2 Score Metric
class F2Score(tf.keras.metrics.Metric):
    """Streaming F2 score computed from thresholded predictions.

    True positives, false positives and false negatives are summed over
    every element of every batch seen since the last reset, so the result
    is a single score pooled across all samples and label columns.

    Args:
        name: Name under which the metric is reported.
        threshold: Predictions strictly greater than this value count as
            positive.
        **kwargs: Forwarded to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, name="f2_score", threshold=0.5, **kwargs):
        super().__init__(name=name, **kwargs)
        self.threshold = threshold
        self.true_positives = self.add_weight(name="tp", initializer="zeros")
        self.false_positives = self.add_weight(name="fp", initializer="zeros")
        self.false_negatives = self.add_weight(name="fn", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate TP/FP/FN counts for one batch.

        ``sample_weight`` is accepted for API compatibility but is ignored.
        """
        y_pred = tf.cast(y_pred > self.threshold, tf.float32)
        y_true = tf.cast(y_true, tf.float32)

        tp = tf.reduce_sum(y_true * y_pred)
        fp = tf.reduce_sum((1 - y_true) * y_pred)
        fn = tf.reduce_sum(y_true * (1 - y_pred))

        self.true_positives.assign_add(tp)
        self.false_positives.assign_add(fp)
        self.false_negatives.assign_add(fn)

    def result(self):
        """Return the F2 score for the counts accumulated so far."""
        eps = tf.keras.backend.epsilon()  # guards every division against 0/0
        tp = self.true_positives
        precision = tp / (tp + self.false_positives + eps)
        recall = tp / (tp + self.false_negatives + eps)

        # F-beta = (1 + beta^2) * P * R / (beta^2 * P + R); beta = 2 gives
        # the coefficients 5 and 4.
        return (5.0 * precision * recall) / (4.0 * precision + recall + eps)

    def reset_state(self):
        """Zero the accumulated counts."""
        for counter in (
            self.true_positives,
            self.false_positives,
            self.false_negatives,
        ):
            counter.assign(0.0)

    def reset_states(self):
        """Alias of ``reset_state`` kept for callers using the older name."""
        self.reset_state()

    def get_config(self):
        """Return the constructor arguments, including ``threshold``."""
        config = super().get_config()
        config["threshold"] = self.threshold
        return config

# Base Model 1: a feed-forward network with two hidden blocks
# (Dense -> LeakyReLU -> BatchNormalization -> Dropout 0.5).
#
# The input and output widths are taken from the training arrays instead of
# being hard-coded, so the model always matches the embedding dimension and
# the number of label columns produced by the binarizer.
input_dim = int(X_train.shape[1])
num_labels = int(y_train.shape[1])

base_model_1 = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(input_dim,)),

    tf.keras.layers.Dense(1024),
    LeakyReLU(negative_slope=0.1),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),

    tf.keras.layers.Dense(700),
    LeakyReLU(negative_slope=0.1),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),

    # One independent sigmoid per label column (multi-label output)
    tf.keras.layers.Dense(num_labels, activation='sigmoid'),
])

# Compile the base model: per-label binary cross-entropy, F2 as the monitor
base_model_1.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                     loss='binary_crossentropy',
                     metrics=[F2Score()])

# Train the base model, evaluating on the held-out split after every epoch
base_model_1.fit(X_train, y_train, epochs=150, batch_size=128, validation_data=(X_val, y_val))

# Read the embeddings that need to be scored
test_embeddings = np.load(test_embedding_file)

# Run the trained base model over them
final_predictions = base_model_1.predict(test_embeddings)

# Threshold at 0.5, then map the resulting 0/1 matrix back to label codes
final_predictions_binary = (final_predictions > 0.5).astype(int)
predicted_labels = mlb.inverse_transform(final_predictions_binary)

# Build the submission: 1-based row id plus the ';'-joined codes of each row
predicted_codes_str = list(map(";".join, predicted_labels))
submission_df = pd.DataFrame({'id': range(1, len(predicted_codes_str) + 1)})
submission_df['labels'] = predicted_codes_str
submission_df.to_csv(output_file, index=False)

print("Predictions saved to", output_file)