In [1]:
# metrics
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import multilabel_confusion_matrix

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import regex as re
import json
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import BCELoss
from collections import Counter

In [2]:
path_to_data = './data'

# create dataframe from sessions.json
df = pd.read_json(f'{path_to_data}/sessions.json')

# create dictionaries for switching between symptom and id
# The symptom names contain non-ASCII characters, so the file is decoded
# explicitly as UTF-8 instead of relying on the platform's default encoding
# (the same choice the notebook makes when it reads the embeddings file).
with open(f'{path_to_data}/symptoms.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# every entry of symptoms.json is expected to carry an 'id' and a 'name' key
id2sym = {sym['id']: sym['name'] for sym in data}
sym2id = {sym['name']: sym['id'] for sym in data}
        
        
# remove labels that have less than m occurrences
m = 0

# Flatten all per-row label lists in linear time. (sum(list_of_lists, [])
# re-copies the growing accumulator on every addition, which is quadratic.)
labels_list = [label for row_labels in df['confirmed'] for label in row_labels]
c = Counter(labels_list)

# Keep, per row, only the labels that occur at least m times overall.
# The relative order of the surviving labels is preserved; rows may end up
# with an empty list.
df['confirmed'] = df['confirmed'].map(
    lambda row_labels: [label for label in row_labels if c[label] >= m]
)
    
        
# add column with the symptom names
# (an empty id list simply yields an empty name list)
sym_names = [[id2sym[sym_id] for sym_id in syms] for syms in df['confirmed']]
df['labels'] = sym_names

# remove all rows with no confirmed labels and renumber the index from 0
has_confirmed = df['confirmed'].map(len) > 0
df = df[has_confirmed].reset_index(drop=True)

In [3]:
# drop the 'confirmed' and 'suggested' columns in place, then preview the result
df.drop(columns=['confirmed', 'suggested'], inplace=True)
df.head()

Unnamed: 0,text,labels
0,Slut på medicin.,"[Känd astma, Känd lungsjukdom]"
1,Behöver att prata med psykolog angående använd...,"[Nedstämdhet, Trötthet]"
2,Har fått besvärlig eksem på händerna,"[Hudbesvär, Synliga hudbesvär]"
3,Muskelsvaghet och trötthet känner mig skakig o...,"[Muskelsvaghet, Trötthet]"
4,Svår smärta i vänsterhanden/handleden precis n...,"[Smärta i handled eller fingrar, Förvärras av ..."


In [4]:
# a basic tokenizer to start off with
def basic_tokenization(text):
    """Lower-case ``text`` and split it into tokens that contain only letters.

    Every character that is neither a Unicode letter nor whitespace is
    deleted (not replaced), so punctuation inside a word joins its two halves.
    The remaining string is split on runs of whitespace.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        The lower-cased tokens, in order; an empty list if no letters remain.
    """
    lowered = text.lower()

    # NOTE: `re` is the third-party `regex` module here (imported as `re` at
    # the top of the notebook); the stdlib `re` does not understand \p{L}.
    # \s is kept rather than only ' ' so that tabs and newlines still act as
    # word separators instead of being deleted and fusing adjacent words.
    letters_only = re.sub(r'[^\p{L}\s]', '', lowered)

    return letters_only.split()


test = 'Hej, hur är läget?'
print(basic_tokenization(test))

['hej', 'hur', 'är', 'läget']


In [20]:
# create dictionary that assigns a unique integer id for each word in the embeddings
word2id = {}
id_count = 0

embedding_weights = None

with open('./embeddings/swectors-300dim.txt', encoding='utf-8') as file:
    # the first line holds two integers that are used as the shape of the
    # (zero-initialised) weight matrix
    dims = [int(field) for field in file.readline().split()]
    print(dims)
    embedding_weights = torch.zeros(dims[0], dims[1], dtype=torch.float64)

    # every following line is "<word> <value> <value> ..."; reading stops at
    # the first line that contains no fields (blank line or end of file)
    for raw_line in file:
        fields = raw_line.split()
        if not fields:
            break

        # rows are numbered in file order; the row number is the word's id
        word2id[fields[0]] = id_count
        embedding_weights[id_count, :] = torch.tensor([float(value) for value in fields[1:]])
        id_count += 1

        # coarse progress indicator
        if id_count % 100000 == 0:
            print(id_count)

print(embedding_weights)

[192250, 300]
100000
tensor([[-2.2332,  3.3888,  2.0338,  ...,  0.1474,  2.6168,  0.1402],
        [ 2.1269, -1.5451,  1.0750,  ..., -1.2130,  0.4903,  1.5822],
        [ 4.6350,  0.8348,  0.9662,  ..., -4.5444, -3.1763, -1.2721],
        ...,
        [ 0.0438,  0.6291, -0.9723,  ..., -0.5011, -0.1992, -0.6892],
        [-0.9830, -1.0080,  0.4821,  ..., -0.2133,  0.6264,  0.1886],
        [-0.2024, -0.7025, -0.3341,  ...,  0.1908,  0.1050,  0.7176]],
       dtype=torch.float64)


In [25]:
# fit a MultiLabelBinarizer on the label lists of every row in df
labels = df['labels'].tolist()
multilab_bin = MultiLabelBinarizer().fit(labels)


class CustomDataset(Dataset):
    """Map-style dataset that turns one dataframe row into model inputs.

    Each item is a dict with
      * ``'ids'``    - ``torch.long`` tensor of word ids for the row's text,
      * ``'labels'`` - ``torch.float`` multi-hot vector for the row's labels.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide a ``'text'`` and a ``'labels'`` column.
    tokenizer : callable
        Maps a string to a list of tokens.
    word2id : mapping
        Token -> integer id. Tokens missing from the mapping are skipped.
    multilab_bin : object
        Fitted binarizer; ``transform([label_list])`` must return one row
        per input label list.
    max_len : int or None
        When truthy, every ``'ids'`` tensor has exactly this length: longer
        sequences are truncated and shorter ones are right-padded with
        ``pad_id``. Fixed-length items are what allows the default DataLoader
        collation to stack a batch. When ``None`` or ``0`` the ids are
        returned unpadded.
    pad_id : int, optional
        Id used for padding (default ``0``).
        NOTE(review): if ``word2id`` assigns id 0 to a real word, padding is
        indistinguishable from that word - reserve a dedicated id if the
        model needs to tell them apart.
    """

    def __init__(self, dataframe, tokenizer, word2id, multilab_bin, max_len, pad_id=0):
        self.tokenizer = tokenizer
        self.word2id = word2id
        self.multilab_bin = multilab_bin
        self.data = dataframe
        self.text = self.data['text']
        self.labels = self.data['labels']
        self.max_len = max_len
        self.pad_id = pad_id

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the ``{'ids', 'labels'}`` dict for the row at position ``index``."""
        # positional lookup, so the dataset also works when the dataframe
        # index is not a fresh 0..n-1 range
        text = str(self.text.iloc[index])
        # collapse every run of whitespace into a single space
        text = ' '.join(text.split())

        tokens = self.tokenizer(text)
        # out-of-vocabulary tokens are dropped instead of raising KeyError
        ids = [self.word2id[w] for w in tokens if w in self.word2id]

        if self.max_len:
            ids = ids[:self.max_len]
            ids = ids + [self.pad_id] * (self.max_len - len(ids))

        # transform() yields one row for the single label list; summing over
        # axis 0 flattens it to a 1-D multi-hot vector
        label_vector = np.sum(self.multilab_bin.transform([self.labels.iloc[index]]), axis=0)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'labels': torch.tensor(label_vector, dtype=torch.float)
        }

In [29]:
batch_size = 16

# max number of tokens in text
# The count is taken on the same string CustomDataset.__getitem__ tokenizes
# (str() coercion followed by whitespace collapsing), so max_len matches what
# the dataset actually produces. default=0 covers an empty dataframe.
max_len = max(
    (len(basic_tokenization(' '.join(str(text).split()))) for text in df['text']),
    default=0,
)

print(max_len)

# 80/20 shuffled split with a fixed seed; each part gets a fresh 0..n-1 index
train_dataset, test_dataset = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
)

# wrap both parts with the same tokenizer, vocabulary, binarizer and max_len
train_set, test_set = (
    CustomDataset(part, basic_tokenization, word2id, multilab_bin, max_len)
    for part in (train_dataset, test_dataset)
)

# training batches are reshuffled every epoch
train_params = {'batch_size': batch_size,
                'shuffle': True,
                'num_workers': 0
               }
# evaluation gains nothing from shuffling; a fixed order keeps the batches
# (and any per-sample outputs) reproducible between runs
test_params = {'batch_size': batch_size,
               'shuffle': False,
               'num_workers': 0
              }

train_loader = DataLoader(train_set, **train_params)
test_loader = DataLoader(test_set, **test_params)

print(f'Train set: {len(train_dataset)} samples')
print(f'Test set: {len(test_dataset)} samples')

99
Train set: 3027 samples
Test set: 757 samples
