In [1]:
import torch
import torch.nn as nn

import json

import pandas as pd

from tqdm import tqdm, trange

from torch.utils.data import TensorDataset

In [2]:
def read_json_file(file_path):
  """Load a JSON document from disk and return the decoded Python object.

  Args:
    file_path: Path of the JSON file to read.

  Returns:
    The object produced by ``json.load`` for the file's contents.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid UTF-8.
    json.JSONDecodeError: If the contents are not valid JSON.
  """
  # Decode explicitly as UTF-8 instead of relying on the platform's default
  # text encoding, so the same file parses identically on every machine.
  with open(file_path, encoding="utf-8") as file:
    return json.load(file)

In [3]:
# Dataset file locations; all paths are relative to the working directory.

# JSON metadata files.
conditions_file = "release_conditions.json"
evidences_file = "release_evidences.json"

# Patient CSVs, one per split.
train_file = "release_train_patients/release_train_patients.csv"
valid_file = "release_validate_patients/release_validate_patients.csv"
test_file = "release_test_patients/release_test_patients.csv"

In [4]:
# Lookup keyed by PATHOLOGY value; df_to_Tensors stores disease_dict[pathology]
# as each patient's label. Loaded through the notebook's read_json_file helper
# rather than repeating the open/read/parse sequence inline.
disease_dict = read_json_file("clean_dataset/diseases.json")

In [5]:
# Lookup keyed by evidence name; df_to_Tensors reads each entry's "index" (and
# "values" for categorical evidences) to place features in the tensor. Loaded
# through the notebook's read_json_file helper rather than repeating the
# open/read/parse sequence inline.
evidence_dict = read_json_file("clean_dataset/evidences.json")

In [6]:
# Read the three patient CSVs (train, test, validation) in that order.
df_train, df_test, df_valid = [
  pd.read_csv(csv_path)
  for csv_path in (train_file, test_file, valid_file)
]

In [7]:
def parse_evidence_string(evidence_string):
  """Swap the two quote characters in a string and decode it as JSON.

  Every single quote in ``evidence_string`` becomes a double quote and every
  double quote becomes a single quote; the swapped text is then parsed with
  ``json.loads``.

  Args:
    evidence_string: Text that becomes valid JSON once its quote characters
      are swapped (for example a single-quoted list of strings).

  Returns:
    The object decoded by ``json.loads``.

  Raises:
    json.JSONDecodeError: If the swapped text is not valid JSON.
  """
  # Swap both quote characters in one pass. Chaining str.replace through a
  # placeholder character corrupts any input that already contains that
  # placeholder; a translation table has no such collision.
  quote_swap = str.maketrans({'"': "'", "'": '"'})
  return json.loads(evidence_string.translate(quote_swap))

In [8]:
def evidences_string_to_dict(evidences):
  """Decode a quote-swapped evidences string with ``json.loads``.

  Single and double quotes in ``evidences`` are exchanged so that a
  single-quoted literal becomes valid JSON, which is then parsed. Despite the
  function's name, the result is whatever the JSON decodes to; the caller in
  this notebook iterates over it as a sequence of evidence strings.

  Args:
    evidences: Text that becomes valid JSON once its quote characters are
      swapped.

  Returns:
    The object decoded by ``json.loads``.

  Raises:
    json.JSONDecodeError: If the swapped text is not valid JSON.
  """
  # Swap " and ' in a single pass to turn the string into valid JSON. A
  # translation table is used instead of chained replaces through a placeholder
  # character, which would mangle any input already containing the placeholder.
  quote_swap = str.maketrans({"'": '"', '"': "'"})
  return json.loads(evidences.translate(quote_swap))

In [9]:
def df_to_Tensors(df, evidence_dict, disease_dict):
  """Encode a patient DataFrame as a feature tensor and a label tensor.

  Each row's ``EVIDENCES`` string is decoded with ``evidences_string_to_dict``
  and every evidence in it is written into that row of the feature tensor:

  * binary evidence ``name`` (no ``_@_``): column
    ``evidence_dict[name]["index"]`` is set to 1.
  * categorical evidence ``name_@_value`` where ``name`` is a key of
    ``evidence_dict``: column ``evidence_dict[name]["index"]`` is set to
    ``evidence_dict[name]["values"][value]["index"]``.
  * multi-choice evidence ``name_@_value`` where ``name`` is not a key: column
    ``evidence_dict[name + "%" + value]["index"]`` is set to 1.

  Args:
    df: DataFrame whose rows expose ``EVIDENCES`` and ``PATHOLOGY`` fields.
    evidence_dict: Evidence lookup as described above. Assumes every "index"
      is a valid column in ``range(len(evidence_dict))`` -- TODO confirm.
    disease_dict: Maps each ``PATHOLOGY`` value to the label stored for the
      row (presumably an integer class index -- verify against the JSON).

  Returns:
    ``(X_tensor, y_tensor)``: ``X_tensor`` has shape
    ``(len(df), len(evidence_dict))`` in torch's default dtype; ``y_tensor``
    is a 1-D ``torch.long`` tensor of length ``len(df)`` holding one label
    per patient.

  Raises:
    KeyError: If an evidence, evidence value, or pathology is missing from
      the corresponding lookup.
  """
  size = len(df)
  feature_count = len(evidence_dict)

  # NOTE(review): earlier comments described sex and age as the last two
  # features, but nothing in this function writes AGE or SEX into X_tensor and
  # the width is exactly len(evidence_dict). Confirm whether they should be
  # encoded; doing so would change the feature width seen by downstream code.
  X_tensor = torch.zeros(size, feature_count)

  # y_tensor[i] is the label of patient i (a single value per row, not one-hot)
  y_tensor = torch.zeros(size, dtype = torch.long)

  # enumerate() hides the length from tqdm, so pass it explicitly to get a
  # real progress bar with percentage and ETA.
  rows = df.itertuples(index = False)
  for index, row in tqdm(enumerate(rows), total = size):
    # Assign the label once per patient, independently of the evidences, so a
    # patient with an empty evidence list still receives the right label.
    y_tensor[index] = disease_dict[row.PATHOLOGY]

    for evidence in evidences_string_to_dict(row.EVIDENCES):
      if '_@_' not in evidence:
        # binary symptom
        X_tensor[index, evidence_dict[evidence]["index"]] = 1
        continue

      evidence_name, evidence_value = evidence.split("_@_")

      if evidence_name in evidence_dict:
        # categorical symptom: store the index of the selected value
        entry = evidence_dict[evidence_name]
        X_tensor[index, entry["index"]] = entry["values"][evidence_value]["index"]
      else:
        # multi symptom: each (name, value) pair has its own column
        feature_index = evidence_dict[evidence_name + "%" + evidence_value]["index"]
        X_tensor[index, feature_index] = 1

  return X_tensor, y_tensor

In [10]:
# Encode the training split into feature/label tensors.
X_train, y_train = df_to_Tensors(
  df = df_train,
  evidence_dict = evidence_dict,
  disease_dict = disease_dict,
)

1025602it [03:57, 4310.43it/s]


In [11]:
# Encode the test split into feature/label tensors.
X_test, y_test = df_to_Tensors(
  df = df_test,
  evidence_dict = evidence_dict,
  disease_dict = disease_dict,
)

0it [00:00, ?it/s]

134529it [00:31, 4267.09it/s]


In [12]:
# Encode the validation split into feature/label tensors.
X_valid, y_valid = df_to_Tensors(
  df = df_valid,
  evidence_dict = evidence_dict,
  disease_dict = disease_dict,
)

0it [00:00, ?it/s]

132448it [00:30, 4276.89it/s]


In [None]:
# Persist the encoded training tensors next to the cleaned lookup files.
# NOTE(review): this cell has no execution count in the saved notebook, so
# these files may not have been written by the recorded run -- confirm.
torch.save(X_train, "clean_dataset/X_train.pt")
torch.save(y_train, "clean_dataset/y_train.pt")

In [None]:
# Persist the encoded test tensors next to the cleaned lookup files.
# NOTE(review): this cell has no execution count in the saved notebook, so
# these files may not have been written by the recorded run -- confirm.
torch.save(X_test, "clean_dataset/X_test.pt")
torch.save(y_test, "clean_dataset/y_test.pt")

In [None]:
# Persist the encoded validation tensors next to the cleaned lookup files.
# NOTE(review): this cell has no execution count in the saved notebook, so
# these files may not have been written by the recorded run -- confirm.
torch.save(X_valid, "clean_dataset/X_valid.pt")
torch.save(y_valid, "clean_dataset/y_valid.pt")