In [25]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import numpy as np
from torch import nn
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, confusion_matrix


In [3]:
def preprocessing_features():
    """Extract the report texts, lower-case them and cache them as CSV.

    Reads ``Dataset/Features.csv``, takes its ``RepText`` column, lower-cases
    every string and writes the resulting Series (row index included as the
    first CSV column) to ``selected_features.csv`` in the working directory.

    Returns:
        None. The only effect is the CSV file written to disk.
    """
    all_features = pd.read_csv(r'Dataset/Features.csv')
    # .str.lower() leaves missing values (NaN) untouched.
    features = all_features.RepText.str.lower()
    features.to_csv('selected_features.csv')
def preprocessing_labels():
    """Build the report-by-code indicator matrix and cache it as CSV.

    Reads ``Dataset/Labels.csv``, pivots it so that every ``RepID`` becomes a
    row and every distinct ``ICDCODE`` a column, then turns the cells into
    0/1 flags (1 where the report carries that code) and writes the table to
    ``selected_labels.csv``.
    """
    raw_labels = pd.read_csv(r'Dataset/Labels.csv')
    # After the pivot a cell holds the code itself when present and NaN
    # otherwise; the NaNs are replaced by 0 straight away.
    label_matrix = raw_labels.pivot(
        index='RepID', columns='ICDCODE', values='ICDCODE'
    ).fillna(0)
    # Whatever is still non-zero marks "code present".
    label_matrix[label_matrix != 0] = 1
    label_matrix.to_csv('selected_labels.csv')

In [4]:
def features_from_csv_():
    """Load the cached report texts.

    Returns:
        pandas.Series: the ``RepText`` column of ``selected_features.csv``.
    """
    return pd.read_csv(r'selected_features.csv')['RepText']
def labels_from_csv(n_rows=59000):
    """Load the cached label matrix without its ``RepID`` column.

    Args:
        n_rows: Maximum number of leading rows to keep. Defaults to 59000,
            the value previously hard-coded here. ``None`` keeps every row.

    Returns:
        pandas.DataFrame: the first ``n_rows`` rows of ``selected_labels.csv``
        with the ``RepID`` column removed. Fewer rows are returned when the
        file is shorter than ``n_rows``.
    """
    labels = pd.read_csv(r'selected_labels.csv').drop(columns=['RepID'])
    # Positional row slice, same semantics as the former ``labels[:59000]``.
    return labels.iloc[:n_rows]

In [5]:
# fit the tokenizer to features
def tokenize_features(features, n_samples=59000, output_path='token_features4.csv'):
    """Tokenize report texts, pad them to a common length and save them as CSV.

    A Keras ``Tokenizer`` is fitted on *all* of ``features``. The first
    ``n_samples`` texts are then converted to word-index sequences, padded at
    the end ("post") to the whitespace word count of the longest text in
    ``features``, and written to ``output_path`` (row index included).

    Args:
        features: Indexable collection of strings addressed by the integer
            labels ``0 .. n-1`` (e.g. the Series from ``features_from_csv_``).
        n_samples: Number of leading texts to encode. Defaults to 59000, the
            value previously hard-coded. It is clamped to ``len(features)``
            so shorter inputs no longer raise a lookup error.
        output_path: Destination CSV. Defaults to the previously hard-coded
            ``'token_features4.csv'``.

    Raises:
        ValueError: if ``features`` is empty (no maximum length exists).
    """
    tokenizer = Tokenizer()
    # The padding length is derived from a plain whitespace split.
    # NOTE(review): the Keras Tokenizer also splits on the punctuation it
    # filters, so a tokenized text can be longer than this count; with
    # pad_sequences' default truncating='pre' its beginning would then be
    # cut off silently. Confirm whether that matters for this data.
    max_length = max(len(feature.split()) for feature in features)
    tokenizer.fit_on_texts(features)

    n_rows = min(n_samples, len(features))
    texts = [features[i] for i in range(n_rows)]
    # One batched call instead of one tokenizer/padding call per row; the
    # resulting matrix is identical because maxlen is fixed.
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length, padding='post')
    pd.DataFrame(padded).to_csv(output_path)

In [6]:
def token_features_from_csv(paths=('token_features1.csv',
                                   'token_features2.csv',
                                   'token_features3.csv')):
    """Load tokenized-feature CSV chunks and stack them row-wise.

    Args:
        paths: CSV files to read, in the order their rows should appear.
            Defaults to the three chunk files that were previously hard-coded.

    Returns:
        pandas.DataFrame: all rows of all files, with fresh integer row and
        column labels.

    Raises:
        ValueError: if ``paths`` is empty or the files differ in column count
            (raised by ``np.concatenate``).
    """
    # NOTE(review): every CSV column is kept. If the chunk files were written
    # with DataFrame.to_csv's default index=True (as tokenize_features does),
    # the first column is the saved row index and ends up as a feature here.
    # Confirm, and read with index_col=0 if that is unintended.
    chunks = [pd.read_csv(path).to_numpy() for path in paths]
    return pd.DataFrame(np.concatenate(chunks))

In [7]:
# Load the tokenized feature matrix and the label matrix from their cached
# CSV files (both loaders read from the current working directory).
token_features = token_features_from_csv()
data_labels = labels_from_csv()

In [8]:
def preprocessing_train_test_data(token_features, data_labels):
    """Split features/labels 80/20 and convert the parts to NumPy arrays.

    The split is shuffled with a fixed seed (42) and not stratified. Feature
    arrays get an extra axis at position 1, i.e. (rows, cols) becomes
    (rows, 1, cols); label arrays keep their 2-D shape.

    Returns:
        tuple: ``(xtrain, xtest, ytrain, ytest)`` as NumPy arrays.
    """
    parts = train_test_split(
        token_features,
        data_labels,
        test_size=0.2,
        random_state=42,
        shuffle=True,
        stratify=None,
    )
    X_train, X_test, y_train, y_test = parts
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    # Insert the singleton axis that Conv1d expects as its channel dimension.
    xtrain, xtest = (frame.to_numpy()[:, np.newaxis, :] for frame in (X_train, X_test))
    ytrain, ytest = y_train.to_numpy(), y_test.to_numpy()
    print('xtrain=', xtrain.shape,'   xtest=', xtest.shape)
    return xtrain, xtest, ytrain, ytest

In [9]:
def preprocessing_torch_data(xtrain, xtest, ytrain, ytest, batch_size=3):
    """Wrap the NumPy splits in PyTorch ``DataLoader`` objects.

    All four arrays are copied into ``float32`` tensors. The training loader
    draws samples in random order; the validation loader iterates in order.

    Args:
        xtrain, xtest: Feature arrays for the training / validation split.
        ytrain, ytest: Label arrays aligned row-by-row with the features.
        batch_size: Samples per batch for both loaders. Defaults to 3, the
            value previously hard-coded here.

    Returns:
        tuple: ``(dataloader_train, dataloader_validation)``.
    """
    def as_float_tensor(array):
        # BCELoss needs floating-point targets, so labels are cast as well.
        return torch.tensor(array, dtype=torch.float32)

    dataset_train = TensorDataset(as_float_tensor(xtrain), as_float_tensor(ytrain))
    dataset_val = TensorDataset(as_float_tensor(xtest), as_float_tensor(ytest))

    dataloader_train = DataLoader(dataset_train,
                                  sampler=RandomSampler(dataset_train),
                                  batch_size=batch_size)
    dataloader_validation = DataLoader(dataset_val,
                                       sampler=SequentialSampler(dataset_val),
                                       batch_size=batch_size)
    return dataloader_train, dataloader_validation

In [10]:
# Split the loaded data into train/validation arrays, then wrap them in
# PyTorch DataLoaders for the training loop.
xtrain, xtest, ytrain, ytest =  preprocessing_train_test_data(token_features, data_labels)
dataloader_train, dataloader_validation = preprocessing_torch_data(xtrain, xtest, ytrain, ytest)

(47200, 3640) (11800, 3640) (47200, 1416) (11800, 1416)
xtrain= (47200, 1, 3640)    xtest= (11800, 1, 3640)


In [11]:
# model building
class CNN_Net(nn.Module):
    """1-D CNN for multi-label classification of padded token sequences.

    Architecture: Conv1d(1 -> 3640, k=5) -> MaxPool(2) -> ReLU ->
    Conv1d(3640 -> 128, k=2) -> MaxPool(2) -> ReLU -> Flatten -> Linear ->
    Sigmoid. The output holds one independent probability per label.

    Args:
        num_targetlabel: Number of output labels.
        seq_len: Length of the input sequences (last input dimension).
            Defaults to 3640, which yields the 116224 flattened features that
            were previously hard-coded, so existing checkpoints still load.

    Raises:
        ValueError: if ``seq_len`` is too short to survive both conv/pool
            stages (fewer than 10 positions).

    NOTE(review): the first convolution has 3640 output channels, the same
    number as the default sequence length. That looks coincidental rather
    than required, but it is kept so saved weights stay compatible; confirm.
    """

    def __init__(self, num_targetlabel, seq_len=3640):
        super().__init__()

        # Track the sequence length through the conv stack: unpadded stride-1
        # convolutions shrink it by (kernel_size - 1), each pool halves it
        # (floor division).
        after_pool_1 = (seq_len - 5 + 1) // 2
        after_pool_2 = (after_pool_1 - 2 + 1) // 2
        if after_pool_2 < 1:
            raise ValueError(
                'seq_len={} is too short for the convolution stack'.format(seq_len))

        self.conv = nn.Sequential()
        self.conv.add_module('conv_1', nn.Conv1d(in_channels=1, out_channels=3640, kernel_size=5))
        self.conv.add_module('pool_1', nn.MaxPool1d(kernel_size=2))
        self.conv.add_module('relu_1', nn.ReLU())
        self.conv.add_module('conv_2', nn.Conv1d(in_channels=3640, out_channels=128, kernel_size=2))
        self.conv.add_module('pool_2', nn.MaxPool1d(kernel_size=2))
        self.conv.add_module('relu_2', nn.ReLU())

        self.dense = nn.Sequential()
        self.dense.add_module('flatten', nn.Flatten())
        # 128 channels x remaining positions (116224 for seq_len=3640).
        self.dense.add_module('linear', nn.Linear(128 * after_pool_2, num_targetlabel))
        self.dense.add_module('sigmoid', nn.Sigmoid())

    def forward(self, x):
        """Map a (batch, 1, seq_len) float tensor to (batch, num_targetlabel) probabilities."""
        x = self.conv(x)
        y = self.dense(x)
        return y

In [12]:
# Build the network with 1416 output labels and move it to the GPU when one
# is available, otherwise keep it on the CPU.
model = CNN_Net(1416)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

CNN_Net(
  (conv): Sequential(
    (conv_1): Conv1d(1, 3640, kernel_size=(5,), stride=(1,))
    (pool_1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (relu_1): ReLU()
    (conv_2): Conv1d(3640, 128, kernel_size=(2,), stride=(1,))
    (pool_2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (relu_2): ReLU()
  )
  (dense): Sequential(
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (linear): Linear(in_features=116224, out_features=1416, bias=True)
    (sigmoid): Sigmoid()
  )
)

In [19]:
# NOTE(review): train_model() does not read this global; it trains for its
# own `epochs` value (3). No code shown in this notebook uses this variable.
epochs = 100

In [13]:
def accuracy(y_pred, y_true):
    """Element-wise accuracy of thresholded predictions.

    Predictions strictly greater than 0.5 count as positive. The score is the
    fraction of individual (sample, label) cells where the thresholded
    prediction equals the target, not the fraction of fully correct samples.

    Args:
        y_pred: Predicted probabilities; any array-like (ndarray, list or
            tuple of rows, ...).
        y_true: Targets with the same shape as ``y_pred``.

    Returns:
        numpy.float64: mean of the element-wise matches.
    """
    # np.asarray accepts lists, tuples and arrays alike, where the former
    # ``type(x) == list`` check only converted exact list instances.
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    return ((y_pred > 0.5) == y_true).mean()


def evaluate_valid(model, progress_bar, device):
    """Compute the element-wise accuracy of ``model`` over a validation loader.

    Args:
        model: The network to evaluate.
        progress_bar: Iterable of batches whose first two items are the
            input tensor and the target tensor.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The value of ``accuracy()`` over all collected predictions/targets.
    """
    # Remember the caller's mode so evaluation does not silently force the
    # model into training mode afterwards.
    was_training = model.training
    model.eval()

    y_true_lst, y_pred_lst = [], []
    try:
        with torch.no_grad():
            for batch in progress_bar:
                batch = tuple(b.to(device) for b in batch)
                y_pred = model(batch[0])
                y_pred_lst.extend(y_pred.detach().cpu().numpy())
                y_true_lst.extend(batch[1].detach().cpu().numpy())
    finally:
        model.train(was_training)

    return accuracy(y_pred_lst, y_true_lst)
    
def train(model, progress_bar, optimizer, loss_func, device):
    """Run one training epoch and return the element-wise training accuracy.

    Args:
        model: The network to optimize (switched to training mode).
        progress_bar: tqdm-wrapped iterable of batches whose first two items
            are the input tensor and the target tensor; its postfix is
            updated with the current batch loss.
        optimizer: Optimizer holding the model's parameters.
        loss_func: Callable ``loss_func(y_pred, y_true)`` returning a scalar
            tensor.
        device: Device the batches are moved to.

    Returns:
        The value of ``accuracy()`` over every prediction made this epoch.
    """
    model.train()

    y_true_lst, y_pred_lst = [], []

    for batch in progress_bar:
        batch = tuple(b.to(device) for b in batch)

        # Clear gradients *before* the backward pass so stale gradients (for
        # example from an interrupted previous run) can never leak into this
        # update step.
        optimizer.zero_grad()

        y_pred = model(batch[0])
        train_loss = loss_func(y_pred, batch[1])
        train_loss.backward()
        optimizer.step()

        y_pred_lst.extend(y_pred.detach().cpu().numpy())
        y_true_lst.extend(batch[1].detach().cpu().numpy())

        # Show the loss as returned by loss_func. It used to be divided by
        # len(batch), which is the length of the (inputs, targets) tuple,
        # not the number of samples, so the displayed value was halved.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(train_loss.item())})

    return accuracy(y_pred_lst, y_true_lst)

# Marker output: shows that the helper definitions in this cell ran without error.
print('ok')


ok


In [14]:
def train_model(model, dataloader_train, dataloader_validation, epochs=3, lr=0.01, device=None):
    """Train ``model`` with Adam and binary cross-entropy, validating every epoch.

    Args:
        model: Network whose outputs are probabilities in [0, 1] (BCELoss
            expects post-sigmoid values).
        dataloader_train: Loader yielding (inputs, targets) training batches.
        dataloader_validation: Loader yielding validation batches.
        epochs: Number of passes over the training data. Defaults to 3, the
            value previously hard-coded here.
        lr: Adam learning rate. Defaults to the previous constant 0.01.
        device: Device batches are moved to. When omitted, the device of the
            model's parameters is used, so the function no longer depends on
            a notebook-level ``device`` variable.

    Returns:
        None. Training and validation accuracy are printed after each epoch.
    """
    if device is None:
        device = next(model.parameters()).device

    loss_func = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in tqdm(range(1, epochs + 1)):
        progress_bar_train = tqdm(dataloader_train, desc='train Epoch {:1d}'.format(epoch), leave=False, disable=False)
        train_acc = train(model=model, progress_bar=progress_bar_train, optimizer=optimizer, loss_func=loss_func, device=device)

        progress_bar_valid = tqdm(dataloader_validation, desc='valid Epoch {:1d}'.format(epoch), leave=False, disable=False)
        valid_acc = evaluate_valid(model=model, progress_bar=progress_bar_valid, device=device)

        # The training accuracy used to be computed and then dropped.
        print('train accuracy:', train_acc)
        print('valid accuracy:', valid_acc)

In [30]:
# Train the model on the training loader, validating after every epoch.
train_model(model, dataloader_train, dataloader_validation)

  0%|          | 0/3 [00:00<?, ?it/s]

train Epoch 1:   0%|          | 0/15734 [00:00<?, ?it/s]

## F1-score calculation for testing (computed on the first 1,000 training samples)

In [52]:
# Save the model's learned parameters (its state_dict) to 'model_weights.pth'.
torch.save(model.state_dict(), 'model_weights.pth')

In [15]:
# Load the saved parameters into the existing model instance.
# map_location=device remaps the stored tensors onto the device selected
# earlier, so a checkpoint written on a GPU machine can still be restored on
# a CPU-only one.
model.load_state_dict(torch.load('model_weights.pth', map_location=device))

<All keys matched successfully>

In [19]:
# Predict on the first 1,000 training samples, 10 rows at a time, collecting
# one NumPy array of probabilities per chunk in `y_predarr_for_test`.
chunk_size = 10
n_chunks = 100

model.eval()
y_predarr_for_test = []
# no_grad: inference only, so no autograd graph needs to be kept per chunk.
with torch.no_grad():
    for start in tqdm(range(0, n_chunks * chunk_size, chunk_size)):
        chunk = torch.tensor(xtrain[start:start + chunk_size], dtype=torch.float32)
        # Inputs must live on the same device as the model; results are
        # brought back to the CPU before converting to NumPy.
        y_pred_for_test = model(chunk.to(device))
        y_predarr_for_test.append(y_pred_for_test.cpu().numpy())

  0%|          | 0/100 [00:00<?, ?it/s]

In [22]:
# Stack all per-chunk prediction arrays row-wise in a single call instead of
# growing the result one chunk at a time, which re-copied it on every step.
y_pred_for_testing = np.concatenate(y_predarr_for_test, axis=0)
print(y_pred_for_testing.shape)

  0%|          | 0/99 [00:00<?, ?it/s]

(1000, 1416)


In [95]:
def calculation_f1(y_pred, y_true):
    """Print and return the micro-averaged F1 score of thresholded predictions.

    Probabilities of at least 0.5 count as positive. ``y_true`` is cut to the
    first ``len(y_pred)`` rows so it lines up with the predictions (the cut
    used to be hard-coded to 1000 rows).

    Args:
        y_pred: Array-like of predicted probabilities, one row per sample.
            It is no longer modified in place.
        y_true: Binary indicator targets with at least ``len(y_pred)`` rows.

    Returns:
        float: micro-averaged F1 score (``zero_division=1``).
    """
    # Threshold into a new array; the previous in-place assignment overwrote
    # the caller's probabilities with 0/1 values.
    y_pred_bin = np.where(np.asarray(y_pred) >= 0.5, 1.0, 0.0)
    y_true_list = y_true[:len(y_pred_bin)]
    score = f1_score(y_true=y_true_list, y_pred=y_pred_bin, average='micro', zero_division=1)
    print(score)
    return score

In [96]:
# Score the stacked predictions against the training labels.
# NOTE(review): predictions were made on `xtrain` and are compared with
# `ytrain`, so this is a training-set score, not a held-out test score.
calculation_f1(y_pred_for_testing, ytrain)

0.04024356883048461


In [74]:
# Small hand-made 3x4 arrays for experimenting with the metric code.
# Note that y_true2 contains non-binary entries (0.3 and 0.9).
y_true2 = np.array([[1, 0.3, 0.9, 0], [1,1,0,0], [1,1,1,1]])
y_pred2 = np.array([[1,0,0,0], [1,1,1,0], [1,1,1,1]])
print(y_true2.shape)
print(y_pred2.shape)

(3, 4)
(3, 4)


In [77]:
# Display y_true2.
# NOTE(review): the recorded output shows the 0.3 / 0.9 entries as 0 / 1,
# which the defining cell alone does not produce. Presumably the array was
# thresholded in place by a cell executed in between (the execution counts
# are out of order); confirm with a fresh Restart & Run All.
y_true2

array([[1., 0., 1., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 1.]])

In [54]:
# test model
# NOTE(review): `test_issue`, `preprocessing_featrue` and `label_dict` are not
# defined anywhere in the visible notebook, so this cell cannot run on a fresh
# kernel as shown. Its recorded output (13 probabilities and an issue-type
# label dictionary) does not match the 1416-label model built above, so it is
# presumably left over from another experiment; confirm and remove or update.
x_series = pd.Series(test_issue)
test_series = preprocessing_featrue(x_series)
x = torch.tensor(test_series, dtype=torch.float32)
y = model(x)
print(y)
print(label_dict)

(1, 100, 100)
tensor([[0.4290, 0.3116, 0.0096, 0.0464, 0.0141, 0.1665, 0.0182, 0.0098, 0.0033,
         0.0073, 0.0017, 0.0027, 0.0010]], grad_fn=<SigmoidBackward0>)
{'Bug': 0, 'Improvement': 1, 'Task': 2, 'New Feature': 3, 'Documentation': 4, 'Sub-task': 5, 'Test': 6, 'Question': 7, 'Wish': 8, 'Umbrella': 9, 'Dependency upgrade': 10, 'Story': 11, 'Brainstorming': 12}
