In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import regularizers

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# array1 = np.load('/content/drive/MyDrive/dal_endsem/embeddings_1.npy')
# array2 = np.load('/content/drive/MyDrive/dal_endsem/embeddings_2.npy')
# X_train = np.concatenate((array1, array2))
# X_test = np.load('/content/drive/MyDrive/dal_endsem/test_data.npy')

In [5]:
# Load the two training-embedding shards from the working directory and stack
# them along axis 0 (np.concatenate's default) into a single training matrix.
array1, array2 = (np.load(shard) for shard in ('embeddings_1.npy', 'embeddings_2.npy'))
X_train = np.concatenate([array1, array2], axis=0)

# Embeddings for the samples the trained model will be asked to label.
X_test = np.load('test_data.npy')

In [6]:
def parse_label_file(filename, delimiter=';'):
    """Read a label file and return one list of label strings per line.

    Each line is stripped of surrounding whitespace and then split on
    ``delimiter``. Line order is preserved, and a blank line yields ``['']``
    because that is what ``''.split(delimiter)`` returns.

    Args:
        filename: Path of the text file to read.
        delimiter: Separator between labels on a single line.

    Returns:
        A list with one entry per line; each entry is a list of strings.
    """
    with open(filename, 'r') as handle:
        # Iterating the handle yields the same lines as readlines().
        return [raw_line.strip().split(delimiter) for raw_line in handle]

def create_label_to_index(labels):
    """Map every distinct label to a stable, zero-based index.

    The distinct labels are sorted before being numbered. Enumerating a bare
    ``set`` of strings follows the interpreter's per-process hash
    randomization, so the label -> index assignment could differ between
    kernel restarts, making saved models or predictions impossible to decode
    consistently. Sorting makes the mapping depend only on the labels.

    Args:
        labels: Iterable of mutually comparable, hashable labels (duplicates
            are allowed and collapsed).

    Returns:
        dict mapping each distinct label to its position in sorted order.
    """
    return {label: index for index, label in enumerate(sorted(set(labels)))}

def to_multi_hot(labels, label_to_index):
    """Encode one sample's labels as a multi-hot vector.

    Args:
        labels: Iterable of labels present for the sample; each must be a key
            of ``label_to_index`` (an unknown label raises ``KeyError``).
        label_to_index: dict mapping label -> position in the output vector.

    Returns:
        1-D float array of length ``len(label_to_index)`` holding 1 at the
        position of every given label and 0 elsewhere.
    """
    positions = [label_to_index[name] for name in labels]
    encoded = np.zeros(len(label_to_index))
    # A single fancy-indexed assignment sets every listed position; an empty
    # list selects nothing and leaves the vector all zeros.
    encoded[positions] = 1
    return encoded

# Parse both label files and join them in file order (file 1 first).
# NOTE(review): presumably row i of `labels` belongs to row i of X_train,
# which is built from the two embedding shards in the same order — confirm.
labelsfile1, labelsfile2 = "icd_codes_1.txt", "icd_codes_2.txt"
labels1 = parse_label_file(labelsfile1)
labels2 = parse_label_file(labelsfile2)
labels = [*labels1, *labels2]

# Flatten every sample's codes into one list to build the label vocabulary.
all_labels = [code for sample_codes in labels for code in sample_codes]
label_to_index = create_label_to_index(all_labels)

# One multi-hot row per sample, stacked into the training target matrix.
multi_hot = [to_multi_hot(sample_codes, label_to_index) for sample_codes in labels]
y_train = np.array(multi_hot)

In [None]:
# Build, train and apply the multi-label classifier.
import matplotlib.pyplot as plt

# Seed NumPy and TensorFlow so weight initialisation, dropout masks and batch
# shuffling are repeatable (as far as the backend allows) on a fresh kernel.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Fail early with a clear message instead of deep inside model.fit.
assert len(X_train) == len(y_train), (
    f"X_train has {len(X_train)} rows but y_train has {len(y_train)}"
)

# Take layer widths from the data rather than hard-coding 1024 inputs and
# 1400 outputs, so the model always matches the embeddings and the vocabulary.
n_features = X_train.shape[1]
n_labels = y_train.shape[1]

model = Sequential([
    tf.keras.Input(shape=(n_features,)),
    Dense(1024, activation='relu'),
    Dense(2048, activation='relu'),
    Dropout(0.3),
    Dense(512, activation='relu'),
    Dropout(0.3),
    # One independent sigmoid per label: each output is that label's probability.
    Dense(n_labels, activation='sigmoid')
])

# Compile the model with Adam optimizer.
# The metric is spelled out explicitly: the 'accuracy' shortcut is resolved from
# the output shape and, for a multi-unit output, picks categorical (argmax)
# accuracy, which is not meaningful for multi-label targets. The metric keeps
# the name 'accuracy' so the history keys stay 'accuracy' / 'val_accuracy'.
optimizer = Adam(learning_rate=0.001)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5)],
)

# Callbacks for early stopping and learning rate reduction.
# The LR scheduler needs a shorter patience than early stopping; with equal
# patience, training stops at about the epoch the first reduction would fire,
# so the reduced learning rate would never be used.
early_stopping = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Reduce batch size for less memory usage.
# NOTE(review): validation_split holds out the LAST 20% of rows before any
# shuffling, so the validation set comes from the tail of the concatenated
# data — confirm that this tail is representative.
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.2,
                    callbacks=[early_stopping, lr_scheduler])

# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Model Loss over Epochs")
ax.legend()
plt.show()

# Predict the labels: a label is emitted when its probability exceeds the threshold.
y_pred_prob = model.predict(X_test)
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)

In [8]:
# Write the output file
################

def create_index_to_label(label_to_index):
    """Return the inverse of ``label_to_index`` (index -> label).

    Args:
        label_to_index: dict mapping label -> index.

    Returns:
        dict mapping each index back to its label. If two labels share an
        index, the one that appears later in ``label_to_index`` wins.
    """
    inverse = {}
    for name, position in label_to_index.items():
        inverse[position] = name
    return inverse

# Invert the label -> index vocabulary so predicted column positions can be
# turned back into label strings when the predictions are written out.
index_to_label = create_index_to_label(label_to_index)

def create_txt_file(y_pred, index_to_label, filename="predicts.csv"):
    """Write predictions to a two-column text file.

    The file starts with the header ``id,labels``. Each following row holds a
    1-based sample id and the sample's predicted labels, sorted and joined
    with ``;`` (empty when no label is predicted).

    Args:
        y_pred: Iterable of per-sample indicator sequences; position ``j``
            equal to 1 means the label at index ``j`` is predicted.
        index_to_label: dict mapping column index -> label string.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w') as out:
        out.write("id,labels\n")
        for sample_id, indicators in enumerate(y_pred, start=1):
            predicted = sorted(
                index_to_label[column]
                for column, flag in enumerate(indicators)
                if flag == 1
            )
            out.write(f"{sample_id},{';'.join(predicted)}\n")

# Write the thresholded test-set predictions to the default output file
# ("predicts.csv"), one row per test sample.
create_txt_file(y_pred, index_to_label)