In [1]:
import numpy as np 
import pandas as pd
import os

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 10})

In [2]:
df = pd.read_csv('/kaggle/input/morticd10/codes.csv')

#structuring
df.columns = ['prefix_codes', 'decimal', 'codes', 'description', 'long_description', 'label']

In [3]:
df.head()

Unnamed: 0,prefix_codes,decimal,codes,description,long_description,label
0,A00,1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor","Cholera due to Vibrio cholerae 01, biovar eltor",Cholera
1,A00,9,A009,"Cholera, unspecified","Cholera, unspecified",Cholera
2,A010,0,A0100,"Typhoid fever, unspecified","Typhoid fever, unspecified",Typhoid fever
3,A010,1,A0101,Typhoid meningitis,Typhoid meningitis,Typhoid fever
4,A010,2,A0102,Typhoid fever with heart involvement,Typhoid fever with heart involvement,Typhoid fever


In [4]:
len(df.prefix_codes.unique()), len(df)

(19927, 71703)

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
from transformers import AdamW
import torch


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
new_df = df.sample(frac=1, random_state=23)
print(f"Size: {len(new_df)}")
new_df.head()

Size: 71703


Unnamed: 0,prefix_codes,decimal,codes,description,long_description,label
13730,M2481,9,M24819,Oth specific joint derangements of unsp should...,Other specific joint derangements of unspecifi...,"Other specific joint derangements of shoulder,..."
22501,R197,,R197,"Diarrhea, unspecified","Diarrhea, unspecified","Diarrhea, unspecified"
20893,O891,,O891,Cardiac complications of anesthesia during the...,Cardiac complications of anesthesia during the...,Cardiac complications of anesthesia during the...
13975,M260,3,M2603,Mandibular hyperplasia,Mandibular hyperplasia,Major anomalies of jaw size
36830,S59032,S,S59032S,"Sltr-haris Type III physl fx lower end ulna, l...",Salter-Harris Type III physeal fracture of low...,Salter-Harris Type III physeal fracture of low...


In [9]:
def normalise_text (text):
    text = text.lower() # lowercase
    text = text.replace(r"\#","") # replaces hashtags
    text = text.replace(r"@","")
    text = text.replace(r"[^A-Za-z0-9()!?\'\`\"]", " ")
    text = text.replace("\s{2,}", " ")
    return text

new_df['long_description'] = new_df['long_description'].apply(normalise_text)
new_df.head()

Unnamed: 0,prefix_codes,decimal,codes,description,long_description,label
13730,M2481,9,M24819,Oth specific joint derangements of unsp should...,other specific joint derangements of unspecifi...,"Other specific joint derangements of shoulder,..."
22501,R197,,R197,"Diarrhea, unspecified","diarrhea, unspecified","Diarrhea, unspecified"
20893,O891,,O891,Cardiac complications of anesthesia during the...,cardiac complications of anesthesia during the...,Cardiac complications of anesthesia during the...
13975,M260,3,M2603,Mandibular hyperplasia,mandibular hyperplasia,Major anomalies of jaw size
36830,S59032,S,S59032S,"Sltr-haris Type III physl fx lower end ulna, l...",salter-harris type iii physeal fracture of low...,Salter-Harris Type III physeal fracture of low...


In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

X = new_df[['long_description']]
y = new_df['prefix_codes']


encoder = LabelEncoder()
y = encoder.fit_transform(y)

# Split the dataset into a train and temporary set
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

# Split the temporary set into validation and train set
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=23)

print(X_train.shape, X_val.shape, X_test.shape)

(43021, 1) (14341, 1) (14341, 1)


In [13]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_length):
        self.data = []
        for i in zip(X, y):
            text, target = i[0], i[1]
            self.data.append((text, int(target)))
            self.tokenizer = tokenizer
            self.max_length = max_length
            

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, target = self.data[idx]
        inputs = self.tokenizer(text, padding='max_length', truncation=True, 
                                max_length=self.max_length, return_tensors='pt')
        inputs_ids = inputs['input_ids'].squeeze(0)
        attention_masks = inputs['attention_mask'].squeeze(0)
        return inputs_ids, attention_masks, target 

In [14]:
sentences = new_df["long_description"]
sentences_tokenized = [tokenizer.tokenize(sentence) for sentence in sentences ]

print('Max sentence length: ', max([len(sen) for sen in sentences_tokenized]))
max_len = max([len(sen) for sen in sentences_tokenized])

Max sentence length:  43


In [15]:
train_dataset = CustomDataset(X_train, y_train, tokenizer, max_len)
val_dataset = CustomDataset(X_val, y_val, tokenizer, max_len)

batch_size = 4
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)