In [451]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import class_weight
import torch.nn.functional as F
import torch.nn.functional as nnf
import tqdm
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix, top_k_accuracy_score 

In [402]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

RANDOM_SEED = 42

In [403]:
df = pd.read_csv('../data/df_final.csv')
df.shape

(7905, 58)

In [404]:
df.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Status,Edema_N,Edema_S,Edema_Y,Age_category_Middle,Age_category_Old,Age_category_Young,N_Days_category_Few,N_Days_category_Many,N_Days_category_Middle,Bilirubin_category_High,Bilirubin_category_Low,Bilirubin_category_Middle,Cholesterol_category_High,Cholesterol_category_Low,Cholesterol_category_Middle,Albumin_category_High,Albumin_category_Low,Albumin_category_Middle,Copper_category_High,Copper_category_Low,Copper_category_Middle,Alk_Phos_category_High,Alk_Phos_category_Low,Alk_Phos_category_Middle,SGOT_category_High,SGOT_category_Low,SGOT_category_Middle,Tryglicerides_category_High,Tryglicerides_category_Low,Tryglicerides_category_Middle,Platelets_category_High,Platelets_category_Low,Platelets_category_Middle,Prothrombin_category_High,Prothrombin_category_Low,Prothrombin_category_Middle,Stage_1.0,Stage_2.0,Stage_3.0,Stage_4.0
0,0,-0.94243,1,0.858448,1,0,0,0,-0.077237,-0.176908,-0.57294,1.160785,-0.113334,1.336304,-0.996442,1.472341,-1.189049,D,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0
1,1,0.497025,0,0.23476,0,0,0,0,-0.444429,0.068784,-0.024043,-0.27542,-0.197909,0.414968,-0.520497,1.095026,0.474024,C,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,1,0
2,2,1.277529,0,-1.262634,0,0,0,0,0.185043,-0.263923,0.004846,0.620561,-0.413812,0.097266,-1.243933,-0.757248,1.369525,D,0,0,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,1
3,3,0.498852,0,0.023603,0,0,0,0,-0.523113,-0.484022,-0.139601,-0.341301,-0.086017,-0.88761,-0.368194,0.043117,0.090238,C,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0
4,4,-1.135271,0,-0.466107,0,0,0,0,-0.391973,-0.023351,0.293739,-0.27542,-0.333965,0.224347,-0.368194,0.374697,-0.037691,C,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1


In [405]:
X = df.drop(['id', 'Status'], axis=1)
y = df['Status']
y = y.apply(lambda x: 0 if x == 'C' else (1 if x == 'D' else 2))
y = y.astype('int64')

In [406]:
np.unique(df['Status'])

array(['C', 'CL', 'D'], dtype=object)

In [407]:
np.unique(y)

array([0, 1, 2], dtype=int64)

In [408]:
# split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, shuffle=True, random_state = RANDOM_SEED)
#y_train = y_train.type(torch.LongTensor)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2371, 56), (5534, 56), (2371,), (5534,))

In [320]:
# X_train_tensor = torch.from_numpy(X_train.to_numpy().reshape(-1, X_train.shape[1]).astype(np.float32)).to(device)
# y_train_tensor = torch.from_numpy(y_train.to_numpy().astype(np.float32)).to(device)

# X_test_tensor = torch.from_numpy(X_test.to_numpy().reshape(-1, X_test.shape[1]).astype(np.float32)).to(device)
# y_test_tensor = torch.from_numpy(y_test.to_numpy().astype(np.float32)).to(device)

In [409]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [410]:
class MyDataset(Dataset):
    def __init__(self, X, y):
        #self.X = torch.from_numpy(X.astype(np.float32))
        #self.y = torch.from_numpy(y.astype(np.float32))
        self.X = X
        self.y = y
        
    def __getitem__(self, index):
        return self.X[index], self.y[index]
    
    def __len__(self):
        return self.X.shape[0]

In [411]:
# # convert pandas DataFrame (X) and numpy array (y) into PyTorch tensors
# X = torch.tensor(X.values, dtype=torch.float32)
# y = torch.tensor(y, dtype=torch.float32)

In [412]:
#train_data = MyDataset(X_train.to_numpy(), y_train.to_numpy())

In [413]:
X_train_tensor = torch.from_numpy(X_train.to_numpy().astype(np.float32)).to(device)
y_train_tensor = torch.from_numpy(y_train.to_numpy().astype(np.float32)).to(device)

X_test_tensor = torch.from_numpy(X_test.to_numpy().astype(np.float32)).to(device)
y_test_tensor = torch.from_numpy(y_test.to_numpy().astype(np.float32)).to(device)

X_train_tensor.shape, y_train_tensor.shape, X_test_tensor.shape, y_test_tensor.shape, 

(torch.Size([2371, 56]),
 torch.Size([2371]),
 torch.Size([5534, 56]),
 torch.Size([5534]))

In [414]:
train_data = MyDataset(X_train_tensor, y_train_tensor)

In [426]:
batch_size = 1024
num_epochs = 1000
loss_test = []
roc_auc_test = []

loss_values = []
input_count = X_train_tensor.shape[1]

In [427]:
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True    
)

In [428]:
class MyNet(nn.Module):
    def __init__(self, input_dim, hidden_dim_1, hidden_dim_2, output_dim):
        super().__init__()
        
        self.hidden1 = nn.Linear(input_dim, hidden_dim_1)
        self.f1 = nn.ReLU()

        self.hidden2 = nn.Linear(hidden_dim_1, hidden_dim_2)
        self.f2 = nn.Sigmoid()
        
        self.output = nn.Linear(hidden_dim_2, output_dim)
        self.f3 = nn.Sigmoid()
        
    def forward(self, x):
        x = self.f1(self.hidden1(x))
        x = self.f2(self.hidden2(x))
        x = self.f3(self.output(x))
        return x

In [429]:
#input_dim=X_train_tensor.shape[0]     # how many Variables are in the dataset
hidden_dim = 25 # hidden layers
#output_dim=len(np.unique(y))    # number of classes
output_dim=1
output_dim

1

In [430]:
myNet = MyNet(input_count, hidden_dim, hidden_dim, output_dim)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
myNet.to(device)
print(myNet)

MyNet(
  (hidden1): Linear(in_features=56, out_features=25, bias=True)
  (f1): ReLU()
  (hidden2): Linear(in_features=25, out_features=25, bias=True)
  (f2): Sigmoid()
  (output): Linear(in_features=25, out_features=1, bias=True)
  (f3): Sigmoid()
)


In [431]:
classes = np.unique(y_train)
print(f'classes: {classes}')
class_weights = class_weight.compute_class_weight(class_weight='balanced', 
                                                      classes=classes, 
                                                      y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
class_weights

classes: [0 1 2]


tensor([0.5290, 0.9954, 9.5221])

In [432]:
torch.FloatTensor(class_weights)

tensor([0.5290, 0.9954, 9.5221])

In [387]:
loss_fn = nn.CrossEntropyLoss()

In [388]:
loss_fn = nn.CrossEntropyLoss(weight=torch.FloatTensor(class_weights))

In [433]:
loss_fn = nn.BCEWithLogitsLoss()

In [381]:
loss_fn = nn.BCELoss(weight=torch.FloatTensor(class_weights))

In [434]:
optimizer = torch.optim.SGD(myNet.parameters(), lr=0.1)

In [435]:
np.unique(y_train)

array([0, 1, 2], dtype=int64)

In [436]:
for epoch in range(num_epochs):
    for X, y in train_dataloader:
        #X = X_train_tensor.to(device) 
       # y = y_train_tensor.to(device)
        X, y = X.to(device), y.to(device)
        
        pred = myNet(X)
        weights = torch.zeros_like(y.unsqueeze(-1))
        
        weights[y==0] = class_weights[0]
        weights[y==1] = class_weights[1]
        weights[y==2] = class_weights[2]
        
        
        #loss = F.binary_cross_entropy(pred, y.unsqueeze(-1), weight=weights)
        loss = loss_fn(pred, y.unsqueeze(-1))
        loss_v = loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    #print(f'epoch {epoch} loss {loss_v}')    
    with torch.no_grad():
        loss = loss_fn(
            myNet(X_test_tensor),
            y_test_tensor.unsqueeze(-1)
        ).item()
        loss_test.append(loss)
        print(f'epoch {epoch} loss {loss}')


epoch 0 loss 0.7884014844894409
epoch 1 loss 0.7810671925544739
epoch 2 loss 0.7742217779159546
epoch 3 loss 0.7675707936286926
epoch 4 loss 0.7613166570663452
epoch 5 loss 0.7555537223815918
epoch 6 loss 0.7506237030029297
epoch 7 loss 0.7459102272987366
epoch 8 loss 0.7417603731155396
epoch 9 loss 0.7378076910972595
epoch 10 loss 0.734555184841156
epoch 11 loss 0.7315523624420166
epoch 12 loss 0.7288634181022644
epoch 13 loss 0.7264012098312378
epoch 14 loss 0.7243900895118713
epoch 15 loss 0.7225410342216492
epoch 16 loss 0.7207868695259094
epoch 17 loss 0.7192177176475525
epoch 18 loss 0.7178153991699219
epoch 19 loss 0.7164217233657837
epoch 20 loss 0.7152254581451416
epoch 21 loss 0.7140883207321167
epoch 22 loss 0.7131683230400085
epoch 23 loss 0.7122618556022644
epoch 24 loss 0.7113699316978455
epoch 25 loss 0.7106341123580933
epoch 26 loss 0.7099539041519165
epoch 27 loss 0.7093274593353271
epoch 28 loss 0.7087181210517883
epoch 29 loss 0.7081557512283325
epoch 30 loss 0.70769

In [437]:
y_pred = myNet(X_test_tensor)
y_pred.shape

torch.Size([5534, 1])

In [441]:
probs = nnf.softmax(y_pred, dim=1)
probs

tensor([[1.],
        [1.],
        [1.],
        ...,
        [1.],
        [1.],
        [1.]], grad_fn=<SoftmaxBackward0>)

In [444]:
y_test_tensor

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [455]:
top2 = top_k_accuracy_score(y_test_tensor.detach().numpy(), y_pred.detach().numpy(), k=2, labels=classes)

ValueError: Number of given labels (3) not equal to the number of classes in 'y_score' (1).

In [450]:
#roc_auc_score(y_test.to_numpy().astype(np.float32), y_pred.detach().numpy())
# row_sums = torch.sum(y_pred, 1) # normalization 
# row_sums = row_sums.repeat(1, num_classes) # expand to same size as out
# y_pred = torch.div( y_pred , row_sums ) # these should be histograms

roc_auc_score(y_test_tensor.detach().numpy(), probs.detach().numpy(), multi_class = 'ovo')

ValueError: Number of classes in y_true not equal to the number of columns in 'y_score'

In [377]:
top_p, top_class = prob.topk(1, dim = 1)
top_p

NameError: name 'prob' is not defined

In [378]:
top_class

NameError: name 'top_class' is not defined