In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import metrics  
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
from tensorflow.keras.optimizers import Adam, RMSprop, AdamW
from tensorflow.keras.optimizers.schedules import CosineDecay
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.metrics import fbeta_score

In [2]:
# Step 1: Load the data
# Load embeddings and labels
embeddings_1 = np.load('embeddings_1.npy')
embeddings_2 = np.load('embeddings_2.npy')
embeddings = np.vstack([embeddings_1, embeddings_2])  # Combine both embedding files

In [3]:
# Load labels and convert them to multi-hot encoding
with open('icd_codes_1.txt') as f1, open('icd_codes_2.txt') as f2:
    labels_1 = [line.strip().split(';') for line in f1]
    labels_2 = [line.strip().split(';') for line in f2]
    labels = labels_1 + labels_2

In [4]:
# Create a mapping for ICD10 codes to multi-hot encoding
unique_codes = sorted(set(code for sublist in labels for code in sublist))
code_to_index = {code: idx for idx, code in enumerate(unique_codes)}
num_classes = len(unique_codes)

In [5]:
# Convert labels to multi-hot vectors
y = np.zeros((len(labels), num_classes), dtype=int)
for i, label_list in enumerate(labels):
    for code in label_list:
        y[i, code_to_index[code]] = 1

In [6]:
# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(embeddings, y, test_size=0.2, random_state=42)

In [8]:

# Define model architecture with increased depth
model = Sequential([
    Dense(2048, input_shape=(1024,)),
    LeakyReLU(alpha=0.005),
    # BatchNormalization(),
    Dropout(0.4),
    Dense(1024),
    LeakyReLU(alpha=0.005),
    # BatchNormalization(),
    Dropout(0.4),
    Dense(512),
    LeakyReLU(alpha=0.005),
    # BatchNormalization(),
    Dropout(0.3),
    Dense(256),
    LeakyReLU(alpha=0.005),
    # BatchNormalization(),
    Dropout(0.3),
    Dense(num_classes, activation='sigmoid')  # Sigmoid for multi-label classification
])

# Compile model with Cosine Decay schedule for the learning rate
cosine_decay = CosineDecay(initial_learning_rate=0.0005, decay_steps=10000, alpha=0.1)
optimizer = Adam(learning_rate=cosine_decay)

In [None]:
# Compile with weighted binary crossentropy if there is class imbalance
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy', metrics.Precision(), metrics.Recall()])

In [None]:
# Train the model
history = model.fit(
    X_train, y_train,
    epochs=50,  # Increased from 30 to allow more time for training
    batch_size=128,
    validation_data=(X_val, y_val)
)

Epoch 1/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 79ms/step - accuracy: 0.4867 - loss: 0.0034 - precision: 0.8032 - recall: 0.4884 - val_accuracy: 0.5574 - val_loss: 0.0024 - val_precision: 0.7931 - val_recall: 0.7019
Epoch 2/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 92ms/step - accuracy: 0.5265 - loss: 0.0028 - precision: 0.8252 - recall: 0.5673 - val_accuracy: 0.5665 - val_loss: 0.0022 - val_precision: 0.8053 - val_recall: 0.7292
Epoch 3/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 83ms/step - accuracy: 0.5473 - loss: 0.0025 - precision: 0.8392 - recall: 0.6088 - val_accuracy: 0.5699 - val_loss: 0.0020 - val_precision: 0.8269 - val_recall: 0.7345
Epoch 4/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 82ms/step - accuracy: 0.5571 - loss: 0.0023 - precision: 0.8485 - recall: 0.6332 - val_accuracy: 0.5697 - val_loss: 0.0019 - val_precision: 0.8285 - val_recall: 0.7510
Epoch

In [None]:
# Train the model
history = model.fit(
    X_train, y_train,
    epochs=50,  # Increased from 30 to allow more time for training
    batch_size=128,
    validation_data=(X_val, y_val),
)

Epoch 1/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 72ms/step - accuracy: 0.5896 - loss: 0.0015 - precision: 0.8869 - recall: 0.7559 - val_accuracy: 0.5817 - val_loss: 0.0016 - val_precision: 0.8460 - val_recall: 0.7954
Epoch 2/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 70ms/step - accuracy: 0.5914 - loss: 0.0015 - precision: 0.8875 - recall: 0.7561 - val_accuracy: 0.5826 - val_loss: 0.0016 - val_precision: 0.8472 - val_recall: 0.7942
Epoch 3/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 72ms/step - accuracy: 0.5911 - loss: 0.0015 - precision: 0.8868 - recall: 0.7559 - val_accuracy: 0.5819 - val_loss: 0.0016 - val_precision: 0.8446 - val_recall: 0.7969
Epoch 4/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 73ms/step - accuracy: 0.5933 - loss: 0.0015 - precision: 0.8873 - recall: 0.7592 - val_accuracy: 0.5796 - val_loss: 0.0016 - val_precision: 0.8453 - val_recall: 0.7960
Epoch 5/

In [None]:
# Train the model
history = model.fit(
    X_train, y_train,
    epochs=50,  # Increased from 30 to allow more time for training
    batch_size=128,
    validation_data=(X_val, y_val)
)

Epoch 1/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 70ms/step - accuracy: 0.6054 - loss: 0.0012 - precision: 0.8986 - recall: 0.8004 - val_accuracy: 0.5776 - val_loss: 0.0017 - val_precision: 0.8432 - val_recall: 0.8046
Epoch 2/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 85ms/step - accuracy: 0.6048 - loss: 0.0012 - precision: 0.8990 - recall: 0.8005 - val_accuracy: 0.5814 - val_loss: 0.0017 - val_precision: 0.8445 - val_recall: 0.8042
Epoch 3/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 87ms/step - accuracy: 0.6046 - loss: 0.0012 - precision: 0.8996 - recall: 0.8007 - val_accuracy: 0.5798 - val_loss: 0.0017 - val_precision: 0.8425 - val_recall: 0.8046
Epoch 4/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 82ms/step - accuracy: 0.6050 - loss: 0.0012 - precision: 0.9004 - recall: 0.8031 - val_accuracy: 0.5786 - val_loss: 0.0017 - val_precision: 0.8409 - val_recall: 0.8065
Epoch

In [None]:
# Train the model
history = model.fit(
    X_train, y_train,
    epochs=50,  # Increased from 30 to allow more time for training
    batch_size=128,
    validation_data=(X_val, y_val)
)

Epoch 1/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 83ms/step - accuracy: 0.6162 - loss: 0.0010 - precision: 0.9086 - recall: 0.8299 - val_accuracy: 0.5768 - val_loss: 0.0017 - val_precision: 0.8388 - val_recall: 0.8080
Epoch 2/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 82ms/step - accuracy: 0.6142 - loss: 0.0010 - precision: 0.9085 - recall: 0.8299 - val_accuracy: 0.5829 - val_loss: 0.0017 - val_precision: 0.8403 - val_recall: 0.8074
Epoch 3/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 78ms/step - accuracy: 0.6141 - loss: 0.0010 - precision: 0.9084 - recall: 0.8308 - val_accuracy: 0.5793 - val_loss: 0.0017 - val_precision: 0.8403 - val_recall: 0.8077
Epoch 4/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 81ms/step - accuracy: 0.6144 - loss: 0.0010 - precision: 0.9090 - recall: 0.8314 - val_accuracy: 0.5777 - val_loss: 0.0017 - val_precision: 0.8387 - val_recall: 0.8078
Epoch

In [None]:
# Train the model
history = model.fit(
    X_train, y_train,
    epochs=50,  # Increased from 30 to allow more time for training
    batch_size=128,
    validation_data=(X_val, y_val)
)

Epoch 1/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 64ms/step - accuracy: 0.6229 - loss: 9.3642e-04 - precision: 0.9158 - recall: 0.8503 - val_accuracy: 0.5791 - val_loss: 0.0018 - val_precision: 0.8385 - val_recall: 0.8081
Epoch 2/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 72ms/step - accuracy: 0.6260 - loss: 9.3481e-04 - precision: 0.9149 - recall: 0.8499 - val_accuracy: 0.5771 - val_loss: 0.0018 - val_precision: 0.8368 - val_recall: 0.8085
Epoch 3/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 69ms/step - accuracy: 0.6247 - loss: 9.2759e-04 - precision: 0.9153 - recall: 0.8513 - val_accuracy: 0.5779 - val_loss: 0.0018 - val_precision: 0.8355 - val_recall: 0.8091
Epoch 4/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 74ms/step - accuracy: 0.6228 - loss: 9.2431e-04 - precision: 0.9152 - recall: 0.8515 - val_accuracy: 0.5838 - val_loss: 0.0018 - val_precision: 0.8361 - val_recall:

In [None]:
# Train the model
history = model.fit(
    X_train, y_train,
    epochs=50,  # Increased from 30 to allow more time for training
    batch_size=128,
    validation_data=(X_val, y_val)
)

Epoch 1/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 80ms/step - accuracy: 0.6292 - loss: 8.4504e-04 - precision: 0.9215 - recall: 0.8678 - val_accuracy: 0.5842 - val_loss: 0.0019 - val_precision: 0.8362 - val_recall: 0.8078
Epoch 2/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 81ms/step - accuracy: 0.6278 - loss: 8.5161e-04 - precision: 0.9212 - recall: 0.8659 - val_accuracy: 0.5815 - val_loss: 0.0019 - val_precision: 0.8351 - val_recall: 0.8100
Epoch 3/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 82ms/step - accuracy: 0.6297 - loss: 8.3822e-04 - precision: 0.9214 - recall: 0.8678 - val_accuracy: 0.5805 - val_loss: 0.0019 - val_precision: 0.8373 - val_recall: 0.8080
Epoch 4/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 90ms/step - accuracy: 0.6278 - loss: 8.4751e-04 - precision: 0.9216 - recall: 0.8663 - val_accuracy: 0.5813 - val_loss: 0.0019 - val_precision: 0.8338 - val_reca

In [None]:
# Train the model
history = model.fit(
    X_train, y_train,
    epochs=50,  # Increased from 30 to allow more time for training
    batch_size=128,
    validation_data=(X_val, y_val)
)

Epoch 1/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 110ms/step - accuracy: 0.6335 - loss: 7.7855e-04 - precision: 0.9251 - recall: 0.8790 - val_accuracy: 0.5782 - val_loss: 0.0020 - val_precision: 0.8342 - val_recall: 0.8088
Epoch 2/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 112ms/step - accuracy: 0.6320 - loss: 7.8248e-04 - precision: 0.9253 - recall: 0.8782 - val_accuracy: 0.5850 - val_loss: 0.0019 - val_precision: 0.8350 - val_recall: 0.8082
Epoch 3/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 105ms/step - accuracy: 0.6317 - loss: 7.8487e-04 - precision: 0.9252 - recall: 0.8779 - val_accuracy: 0.5806 - val_loss: 0.0020 - val_precision: 0.8341 - val_recall: 0.8083
Epoch 4/50
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 106ms/step - accuracy: 0.6350 - loss: 7.7674e-04 - precision: 0.9263 - recall: 0.8798 - val_accuracy: 0.5851 - val_loss: 0.0020 - val_precision: 0.8341 - val

In [None]:
# Train the model
history = model.fit(
    X_train, y_train,
    epochs=10,  # Increased from 30 to allow more time for training
    batch_size=128,
    validation_data=(X_val, y_val)
)

Epoch 1/10
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 111ms/step - accuracy: 0.6363 - loss: 7.3614e-04 - precision: 0.9290 - recall: 0.8876 - val_accuracy: 0.5853 - val_loss: 0.0020 - val_precision: 0.8341 - val_recall: 0.8079
Epoch 2/10
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 104ms/step - accuracy: 0.6380 - loss: 7.2807e-04 - precision: 0.9286 - recall: 0.8886 - val_accuracy: 0.5824 - val_loss: 0.0020 - val_precision: 0.8329 - val_recall: 0.8095
Epoch 3/10
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 107ms/step - accuracy: 0.6382 - loss: 7.1870e-04 - precision: 0.9311 - recall: 0.8890 - val_accuracy: 0.5834 - val_loss: 0.0020 - val_precision: 0.8342 - val_recall: 0.8081
Epoch 4/10
[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 106ms/step - accuracy: 0.6358 - loss: 7.2125e-04 - precision: 0.9305 - recall: 0.8886 - val_accuracy: 0.5843 - val_loss: 0.0020 - val_precision: 0.8335 - val

In [63]:
# Step 1: Load the test embeddings
test_embeddings = np.load('test_data.npy')

In [64]:
# Step 2: Make predictions on the test data
# Load the model (assuming you've already trained and saved it if needed)
y_test_pred = model.predict(test_embeddings)

[1m3110/3110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 17ms/step


In [65]:
for i in range(10000):
    if y_test_pred[i][938]>=0.3:
        print('yes')

yes
yes
yes
yes
yes
yes
yes
yes
yes
yes


In [66]:
# Step 4: Evaluate the model
# Predict on the validation set and compute micro F2 score
y_val_pred = model.predict(X_val) > 0.42 # Convert probabilities to binary predictions
micro_f2_score = f1_score(y_val, y_val_pred, average='micro')

print(f'Micro F2 Score on validation set: {micro_f2_score:.4f}')

[1m1244/1244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 14ms/step
Micro F2 Score on validation set: 0.8183


In [None]:
# Step 3: Convert predictions to ICD10 code labels
# Apply threshold to determine which codes to include (e.g., 0.5 threshold)
threshold = 0.49
test_labels_pred = (y_test_pred > threshold).astype(int)

In [59]:
# Step 4: Create a submission file in the specified format
# Map indices back to ICD10 codes
index_to_code = {v: k for k, v in code_to_index.items()}

submission_data = []
for idx, label_vector in enumerate(test_labels_pred, start=1):
    # Get codes with predictions above the threshold and sort lexicographically
    codes = [index_to_code[i] for i, val in enumerate(label_vector) if val == 1]
    codes = sorted(codes)  # Sort lexicographically
    label_string = ';'.join(codes) if codes else ''  # Stitch with ';' or leave blank if no label
    submission_data.append({'id': idx, 'labels': label_string})

In [60]:
# Convert to DataFrame and save as CSV
submission_df = pd.DataFrame(submission_data)
submission_df

Unnamed: 0,id,labels
0,1,G56.21
1,2,M65.9;S83.242A
2,3,G56.01
3,4,M65.312
4,5,S83.241A;S83.281A
...,...,...
99485,99486,D12.0;K57.30;K63.5;K64.9
99486,99487,K29.50;K31.89
99487,99488,D12.2;D12.5;K64.8;Z12.11
99488,99489,B96.81;K21.9;K29.50


In [61]:
submission_df.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully.")

Submission file 'submission.csv' created successfully.
