In [2]:
# Pre-train용
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.model_selection import train_test_split

class CustomBertForSequenceClassification(BertForSequenceClassification):
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        labels=None,
        output_hidden_states=True
    ):
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            labels=labels,
            output_hidden_states=output_hidden_states
        )
        logits = outputs.logits
        hidden_states = outputs.hidden_states[-8]  # n번째 레이어의 hidden states를 반환합니다.
        loss = outputs.loss
        return logits, loss, hidden_states

# 데이터 로드 및 전처리
data_A = pd.read_csv("output1.csv")  # data set A 파일명에 맞게 수정
data_B = pd.read_csv("infected.csv")  # data set B 파일명에 맞게 수정
# 모델 저장 경로
model_path = "Pre-trained.pt"

# X_train, Y_train 생성
X_train = []
Y_train = []

for index, row in data_A.iterrows():  # 중복 제거를 하지 않고 원본 데이터 사용
    patient_id = row["ID"]
    patient_info = [str(row[column]) for column in data_A.columns if column != "ID" and column != "DESCRIPTION"]
    symptoms = ", ".join(data_A[data_A["ID"] == patient_id]["DESCRIPTION"].tolist())
    combined_info = ", ".join(patient_info) + ", " + symptoms
    X_train.append(combined_info)
    if patient_id in data_B.values:
        Y_train.append(1)
    else:
        Y_train.append(0)

#print("X_train\n", X_train[:10])
#print("Y_train\n", Y_train[:10])
        
# BERT 토크나이저 및 모델 로드
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = CustomBertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)

# 입력 데이터를 BERT의 입력 형식으로 변환
max_len = 128  # 입력 시퀀스의 최대 길이

input_ids = []
attention_masks = []

for info in X_train:
    encoded_dict = tokenizer.encode_plus(
                        info,                         # 환자 정보 및 증상
                        add_special_tokens = True,    # [CLS], [SEP] 토큰 추가
                        max_length = max_len,         # 최대 길이 지정
                        pad_to_max_length = True,     # 패딩을 추가하여 최대 길이로 맞춤
                        return_attention_mask = True, # 어텐션 마스크 생성
                        return_tensors = 'pt',        # PyTorch 텐서로 반환
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(Y_train)

# 데이터셋 및 데이터로더 생성
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = 0.8
train_dataset, val_dataset = train_test_split(dataset, test_size=1-train_size, random_state=42)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

# GPU 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

# 모델을 GPU로 이동
model.to(device)

# 옵티마이저 및 학습률 설정
# 기본 학습률 : 2e-6
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# 에폭 설정
epochs = 10

# 학습 루프
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs[1]  # loss가 outputs의 두 번째 값입니다.
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}')
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}')

    # 모델 저장 및 평가
    model_save_path = f"Pre_train_epoch{epoch + 1}_BERT_Large.pt"
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved for epoch {epoch + 1} at {model_save_path}")
    
    model.eval()
    val_accuracy = 0
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs[0]  # logits가 outputs의 첫 번째 값입니다.
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        val_accuracy += (logits.argmax(axis=1) == label_ids).mean().item()

    print(f'Validation Accuracy for epoch {epoch + 1}: {val_accuracy / len(val_dataloader)}')


X_train
 ['7/29/1966, nan, 999-91-3709, S99988287, X5601074X, Mrs., Celia938, Roberts511, nan, Mayert710, M, white, hispanic, F, Agawam  Massachusetts  US, 362 Pacocha Gateway Apt 1, Northborough, Massachusetts, Worcester County, 1532.0, 42.27341123, -71.63243239, 1166971.45, 13416.2, 4/20/1989, nan, 5cfda74f-b462-4c73-aa96-d90da4002f8a, 40055000.0, Chronic sinusitis (disorder), Body mass index 30+ - obesity (finding), Miscarriage in first trimester, Prediabetes, Hyperlipidemia, Nasal congestion (finding), Cough (finding), Sore throat symptom (finding), Sputum finding (finding), Muscle pain (finding), Joint pain (finding), Fever (finding)', '12/19/1965, 3/1/2020, 999-70-4989, S99948277, X2560575X, Mrs., Kala987, Prohaska837, nan, Gleason633, M, white, nonhispanic, F, Boston  Massachusetts  US, 310 Effertz Promenade, Gloucester, Massachusetts, Essex County, 1930.0, 42.63072986, -70.6443488, 1229943.52, 20003.74, 2/12/1984, nan, 3fc7077f-903c-4601-8078-a016e9b5a630, 59621000.0, Hypertens

Some weights of CustomBertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


True
Epoch 1/10, Batch Loss: 0.5948078632354736
Epoch 1/10, Batch Loss: 0.4141175150871277
Epoch 1/10, Batch Loss: 0.9622668027877808
Epoch 1/10, Batch Loss: 0.48511311411857605
Epoch 1/10, Batch Loss: 0.46169981360435486
Epoch 1/10, Batch Loss: 0.6177780628204346
Epoch 1/10, Batch Loss: 0.6891560554504395
Epoch 1/10, Batch Loss: 0.4923348128795624
Epoch 1/10, Batch Loss: 0.5581233501434326
Epoch 1/10, Batch Loss: 0.7338570952415466
Epoch 1/10, Batch Loss: 0.5559920072555542
Epoch 1/10, Batch Loss: 0.5882829427719116
Epoch 1/10, Batch Loss: 0.612876296043396
Epoch 1/10, Batch Loss: 0.5901021957397461
Epoch 1/10, Batch Loss: 0.5141141414642334
Epoch 1/10, Batch Loss: 0.6080482006072998
Epoch 1/10, Batch Loss: 0.552339494228363
Epoch 1/10, Batch Loss: 0.29683631658554077
Epoch 1/10, Batch Loss: 0.4292985200881958
Epoch 1/10, Batch Loss: 0.44860002398490906
Epoch 1/10, Batch Loss: 0.5967109799385071
Epoch 1/10, Batch Loss: 0.7726467847824097
Epoch 1/10, Batch Loss: 0.379714697599411
Epoch

Epoch 1/10, Batch Loss: 0.25408414006233215
Epoch 1/10, Batch Loss: 0.47630366683006287
Epoch 1/10, Batch Loss: 0.4940844476222992
Epoch 1/10, Batch Loss: 0.6591581702232361
Epoch 1/10, Batch Loss: 0.241426482796669
Epoch 1/10, Batch Loss: 0.39139968156814575
Epoch 1/10, Batch Loss: 0.26176148653030396
Epoch 1/10, Batch Loss: 0.6208778619766235
Epoch 1/10, Batch Loss: 0.4253423810005188
Epoch 1/10, Batch Loss: 0.4583967328071594
Epoch 1/10, Batch Loss: 0.20162606239318848
Epoch 1/10, Batch Loss: 0.4518060088157654
Epoch 1/10, Batch Loss: 0.2670995891094208
Epoch 1/10, Batch Loss: 0.39663153886795044
Epoch 1/10, Batch Loss: 0.1207134798169136
Epoch 1/10, Batch Loss: 0.2647384703159332
Epoch 1/10, Batch Loss: 0.2026481330394745
Epoch 1/10, Batch Loss: 0.730182945728302
Epoch 1/10, Batch Loss: 0.4835037887096405
Epoch 1/10, Batch Loss: 0.4554523825645447
Epoch 1/10, Batch Loss: 0.31033214926719666
Epoch 1/10, Batch Loss: 0.17102262377738953
Epoch 1/10, Batch Loss: 0.43716344237327576
Epoc

Epoch 1/10, Batch Loss: 0.3216312825679779
Epoch 1/10, Batch Loss: 0.12798450887203217
Epoch 1/10, Batch Loss: 0.3631243109703064
Epoch 1/10, Batch Loss: 0.12048667669296265
Epoch 1/10, Batch Loss: 0.36682096123695374
Epoch 1/10, Batch Loss: 0.2732905149459839
Epoch 1/10, Batch Loss: 0.12411116808652878
Epoch 1/10, Batch Loss: 0.3483610153198242
Epoch 1/10, Batch Loss: 0.11404358595609665
Epoch 1/10, Batch Loss: 0.37447044253349304
Epoch 1/10, Batch Loss: 0.3435271978378296
Epoch 1/10, Batch Loss: 0.3668426275253296
Epoch 1/10, Batch Loss: 0.22380492091178894
Epoch 1/10, Batch Loss: 0.22250209748744965
Epoch 1/10, Batch Loss: 0.5060283541679382
Epoch 1/10, Batch Loss: 0.07139237225055695
Epoch 1/10, Batch Loss: 0.23438046872615814
Epoch 1/10, Batch Loss: 0.1977069079875946
Epoch 1/10, Batch Loss: 0.5009791851043701
Epoch 1/10, Batch Loss: 0.23570337891578674
Epoch 1/10, Batch Loss: 0.19854247570037842
Epoch 1/10, Batch Loss: 0.24289563298225403
Epoch 1/10, Batch Loss: 0.214622870087623

Epoch 2/10, Batch Loss: 0.36476776003837585
Epoch 2/10, Batch Loss: 0.11052891612052917
Epoch 2/10, Batch Loss: 0.31898367404937744
Epoch 2/10, Batch Loss: 0.1192537322640419
Epoch 2/10, Batch Loss: 0.2137306183576584
Epoch 2/10, Batch Loss: 0.4418972134590149
Epoch 2/10, Batch Loss: 0.22884273529052734
Epoch 2/10, Batch Loss: 0.5592710971832275
Epoch 2/10, Batch Loss: 0.23405513167381287
Epoch 2/10, Batch Loss: 0.36403146386146545
Epoch 2/10, Batch Loss: 0.26686733961105347
Epoch 2/10, Batch Loss: 0.10722211748361588
Epoch 2/10, Batch Loss: 0.11102891713380814
Epoch 2/10, Batch Loss: 0.33663076162338257
Epoch 2/10, Batch Loss: 0.24196112155914307
Epoch 2/10, Batch Loss: 0.245708167552948
Epoch 2/10, Batch Loss: 0.5717613697052002
Epoch 2/10, Batch Loss: 0.3553922772407532
Epoch 2/10, Batch Loss: 0.20737874507904053
Epoch 2/10, Batch Loss: 0.7981641888618469
Epoch 2/10, Batch Loss: 0.43138930201530457
Epoch 2/10, Batch Loss: 0.20272313058376312
Epoch 2/10, Batch Loss: 0.212201938033103

Epoch 2/10, Batch Loss: 0.11468043923377991
Epoch 2/10, Batch Loss: 0.11218635737895966
Epoch 2/10, Batch Loss: 0.207257479429245
Epoch 2/10, Batch Loss: 0.5816338062286377
Epoch 2/10, Batch Loss: 0.548229455947876
Epoch 2/10, Batch Loss: 0.24304263293743134
Epoch 2/10, Batch Loss: 0.34932634234428406
Epoch 2/10, Batch Loss: 0.2301732897758484
Epoch 2/10, Batch Loss: 0.4059648811817169
Epoch 2/10, Batch Loss: 0.2898111343383789
Epoch 2/10, Batch Loss: 0.24344737827777863
Epoch 2/10, Batch Loss: 0.18425241112709045
Epoch 2/10, Batch Loss: 0.440733402967453
Epoch 2/10, Batch Loss: 0.24318745732307434
Epoch 2/10, Batch Loss: 0.48604199290275574
Epoch 2/10, Batch Loss: 0.21875350177288055
Epoch 2/10, Batch Loss: 0.6664556264877319
Epoch 2/10, Batch Loss: 0.19494062662124634
Epoch 2/10, Batch Loss: 0.14270660281181335
Epoch 2/10, Batch Loss: 0.24428825080394745
Epoch 2/10, Batch Loss: 0.22593754529953003
Epoch 2/10, Batch Loss: 0.7167632579803467
Epoch 2/10, Batch Loss: 0.5565463900566101
E

Epoch 3/10, Batch Loss: 0.434532105922699
Epoch 3/10, Batch Loss: 0.5644709467887878
Epoch 3/10, Batch Loss: 0.3751179873943329
Epoch 3/10, Batch Loss: 0.32393878698349
Epoch 3/10, Batch Loss: 0.39530590176582336
Epoch 3/10, Batch Loss: 0.1480053961277008
Epoch 3/10, Batch Loss: 0.45451971888542175
Epoch 3/10, Batch Loss: 0.2704889178276062
Epoch 3/10, Batch Loss: 0.3163966238498688
Epoch 3/10, Batch Loss: 0.5258164405822754
Epoch 3/10, Batch Loss: 0.32988664507865906
Epoch 3/10, Batch Loss: 0.3478582799434662
Epoch 3/10, Batch Loss: 0.15758681297302246
Epoch 3/10, Batch Loss: 0.46237069368362427
Epoch 3/10, Batch Loss: 0.11538927257061005
Epoch 3/10, Batch Loss: 0.12498921155929565
Epoch 3/10, Batch Loss: 0.4225130081176758
Epoch 3/10, Batch Loss: 0.5386319756507874
Epoch 3/10, Batch Loss: 0.12910886108875275
Epoch 3/10, Batch Loss: 0.33519524335861206
Epoch 3/10, Batch Loss: 0.4388473629951477
Epoch 3/10, Batch Loss: 0.38394618034362793
Epoch 3/10, Batch Loss: 0.31126776337623596
Epo

Epoch 3/10, Batch Loss: 0.8637329936027527
Epoch 3/10, Batch Loss: 0.5929728150367737
Epoch 3/10, Batch Loss: 0.5313339233398438
Epoch 3/10, Batch Loss: 0.6762797236442566
Epoch 3/10, Batch Loss: 0.7434151768684387
Epoch 3/10, Batch Loss: 0.504677951335907
Epoch 3/10, Batch Loss: 0.6736072301864624
Epoch 3/10, Batch Loss: 0.6721469163894653
Epoch 3/10, Batch Loss: 0.7232152223587036
Epoch 3/10, Batch Loss: 0.44177496433258057
Epoch 3/10, Batch Loss: 0.7215025424957275
Epoch 3/10, Batch Loss: 0.5318666100502014
Epoch 3/10, Batch Loss: 0.6336197853088379
Epoch 3/10, Batch Loss: 0.6861889362335205
Epoch 3/10, Batch Loss: 0.6570265293121338
Epoch 3/10, Batch Loss: 0.6002522706985474
Epoch 3/10, Batch Loss: 0.6455762386322021
Epoch 3/10, Batch Loss: 0.5318251252174377
Epoch 3/10, Batch Loss: 0.5945788025856018
Epoch 3/10, Batch Loss: 0.5895785689353943
Epoch 3/10, Batch Loss: 0.6540151238441467
Epoch 3/10, Batch Loss: 0.38634923100471497
Epoch 3/10, Batch Loss: 0.7011598348617554
Epoch 3/10

Epoch 4/10, Batch Loss: 0.5250555276870728
Epoch 4/10, Batch Loss: 0.8559890389442444
Epoch 4/10, Batch Loss: 0.5248561501502991
Epoch 4/10, Batch Loss: 0.8604393601417542
Epoch 4/10, Batch Loss: 0.9000163674354553
Epoch 4/10, Batch Loss: 0.6066642999649048
Epoch 4/10, Batch Loss: 0.5529175400733948
Epoch 4/10, Batch Loss: 0.6180764436721802
Epoch 4/10, Batch Loss: 0.47370845079421997
Epoch 4/10, Batch Loss: 0.5727359652519226
Epoch 4/10, Batch Loss: 0.5468273758888245
Epoch 4/10, Batch Loss: 0.5739675164222717
Epoch 4/10, Batch Loss: 0.684780478477478
Epoch 4/10, Batch Loss: 0.6933670043945312
Epoch 4/10, Batch Loss: 0.5067875385284424
Epoch 4/10, Batch Loss: 0.8093979358673096
Epoch 4/10, Batch Loss: 0.6730056405067444
Epoch 4/10, Batch Loss: 0.6830881237983704
Epoch 4/10, Batch Loss: 0.7040009498596191
Epoch 4/10, Batch Loss: 0.6427336931228638
Epoch 4/10, Batch Loss: 0.5078014135360718
Epoch 4/10, Batch Loss: 0.7310107350349426
Epoch 4/10, Batch Loss: 0.6280449628829956
Epoch 4/10,

Epoch 4/10, Batch Loss: 0.3202593922615051
Epoch 4/10, Batch Loss: 0.641838014125824
Epoch 4/10, Batch Loss: 0.5764975547790527
Epoch 4/10, Batch Loss: 0.6660592555999756
Epoch 4/10, Batch Loss: 0.6024842262268066
Epoch 4/10, Batch Loss: 0.7287573218345642
Epoch 4/10, Batch Loss: 0.6651892066001892
Epoch 4/10, Batch Loss: 0.6049145460128784
Epoch 4/10, Batch Loss: 0.5407819151878357
Epoch 4/10, Batch Loss: 0.5532221794128418
Epoch 4/10, Batch Loss: 0.6237477660179138
Epoch 4/10, Batch Loss: 0.5427470803260803
Epoch 4/10, Batch Loss: 0.5326001048088074
Epoch 4/10, Batch Loss: 0.6019906997680664
Epoch 4/10, Batch Loss: 0.5432001948356628
Epoch 4/10, Batch Loss: 0.49850064516067505
Epoch 4/10, Batch Loss: 0.40291303396224976
Epoch 4/10, Batch Loss: 0.517487645149231
Epoch 4/10, Batch Loss: 0.6384865045547485
Epoch 4/10, Batch Loss: 0.4873877167701721
Epoch 4/10, Batch Loss: 0.6392702460289001
Epoch 4/10, Batch Loss: 0.6966561675071716
Epoch 4/10, Batch Loss: 0.5236292481422424
Epoch 4/10,

Epoch 4/10, Batch Loss: 0.4831252098083496
Epoch 4/10, Batch Loss: 0.5784982442855835
Epoch 4/10, Batch Loss: 0.4964717626571655
Epoch 4/10, Batch Loss: 0.7707061171531677
Epoch 4/10, Batch Loss: 0.7003787159919739
Epoch 4/10, Batch Loss: 0.656517744064331
Epoch 4/10, Batch Loss: 0.83979332447052
Epoch 4/10, Batch Loss: 0.6652956008911133
Epoch 4/10, Batch Loss: 0.3633812665939331
Epoch 4/10, Average Training Loss: 0.598670219626505
Model saved for epoch 4 at Pre_train_epoch4.pt
Validation Accuracy for epoch 4: 0.7276368491321763
Epoch 5/10, Batch Loss: 0.3972143530845642
Epoch 5/10, Batch Loss: 0.7090664505958557
Epoch 5/10, Batch Loss: 0.38202178478240967
Epoch 5/10, Batch Loss: 0.6078557372093201
Epoch 5/10, Batch Loss: 0.7215650677680969
Epoch 5/10, Batch Loss: 0.6081039905548096
Epoch 5/10, Batch Loss: 0.5581713914871216
Epoch 5/10, Batch Loss: 0.6698586940765381
Epoch 5/10, Batch Loss: 0.5992743372917175
Epoch 5/10, Batch Loss: 0.6496667265892029
Epoch 5/10, Batch Loss: 0.7686629

Epoch 5/10, Batch Loss: 0.837300717830658
Epoch 5/10, Batch Loss: 0.5587681531906128
Epoch 5/10, Batch Loss: 0.592712938785553
Epoch 5/10, Batch Loss: 0.5983709096908569
Epoch 5/10, Batch Loss: 0.6030000448226929
Epoch 5/10, Batch Loss: 0.3380046784877777
Epoch 5/10, Batch Loss: 0.42045944929122925
Epoch 5/10, Batch Loss: 0.5578382015228271
Epoch 5/10, Batch Loss: 0.7255436778068542
Epoch 5/10, Batch Loss: 0.7206558585166931
Epoch 5/10, Batch Loss: 0.6045880913734436
Epoch 5/10, Batch Loss: 0.7149237990379333
Epoch 5/10, Batch Loss: 0.2685834467411041
Epoch 5/10, Batch Loss: 0.6010447144508362
Epoch 5/10, Batch Loss: 0.7089847326278687
Epoch 5/10, Batch Loss: 0.7819063663482666
Epoch 5/10, Batch Loss: 0.5732846260070801
Epoch 5/10, Batch Loss: 0.5870869159698486
Epoch 5/10, Batch Loss: 0.56510990858078
Epoch 5/10, Batch Loss: 0.5014272928237915
Epoch 5/10, Batch Loss: 0.8660175204277039
Epoch 5/10, Batch Loss: 0.6429541110992432
Epoch 5/10, Batch Loss: 0.5540784597396851
Epoch 5/10, Ba

Epoch 5/10, Batch Loss: 0.567374587059021
Epoch 5/10, Batch Loss: 0.506483256816864
Epoch 5/10, Batch Loss: 0.48919951915740967
Epoch 5/10, Batch Loss: 0.4945330023765564
Epoch 5/10, Batch Loss: 0.659608781337738
Epoch 5/10, Batch Loss: 0.5881980657577515
Epoch 5/10, Batch Loss: 0.701559841632843
Epoch 5/10, Batch Loss: 0.49371182918548584
Epoch 5/10, Batch Loss: 0.47109732031822205
Epoch 5/10, Batch Loss: 0.5792324542999268
Epoch 5/10, Batch Loss: 0.6176326274871826
Epoch 5/10, Batch Loss: 0.4430297017097473
Epoch 5/10, Batch Loss: 0.7083331346511841
Epoch 5/10, Batch Loss: 0.7360939383506775
Epoch 5/10, Batch Loss: 0.6521527767181396
Epoch 5/10, Batch Loss: 0.6630277633666992
Epoch 5/10, Batch Loss: 0.31550705432891846
Epoch 5/10, Batch Loss: 0.5840118527412415
Epoch 5/10, Batch Loss: 0.5081456303596497
Epoch 5/10, Batch Loss: 0.32096296548843384
Epoch 5/10, Batch Loss: 0.4702383875846863
Epoch 5/10, Batch Loss: 0.6000988483428955
Epoch 5/10, Batch Loss: 0.4792003631591797
Epoch 5/10

KeyboardInterrupt: 

In [1]:
# Fine-tune용
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.model_selection import train_test_split

class CustomBertForSequenceClassification(BertForSequenceClassification):
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        labels=None,
        output_hidden_states=True
    ):
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            labels=labels,
            output_hidden_states=output_hidden_states
        )
        logits = outputs.logits
        hidden_states = outputs.hidden_states[-8]  # n번째 레이어의 hidden states를 반환합니다.
        loss = outputs.loss
        return logits, loss, hidden_states

# 데이터 로드 및 전처리
data_A = pd.read_csv("output3.csv")  # data set A 파일명에 맞게 수정
data_B = pd.read_csv("infected.csv")  # data set B 파일명에 맞게 수정
# 모델 불러오는 경로
model_path = "Pre_train_epoch2_BERT_Large.pt"
# 모델 저장경로
model_path2 = "Fine-tuned.pt"

# X_train, Y_train 생성
X_train = []
Y_train = []

for index, row in data_A.iterrows():  # 중복 제거를 하지 않고 원본 데이터 사용
    patient_id = row["ID"]
    patient_info = [str(row[column]) for column in data_A.columns if column != "ID" and column != "DESCRIPTION"]
    symptoms = ", ".join(data_A[data_A["ID"] == patient_id]["DESCRIPTION"].tolist())
    combined_info = ", ".join(patient_info) + ", " + symptoms
    X_train.append(combined_info)
    if patient_id in data_B.values:
        Y_train.append(1)
    else:
        Y_train.append(0)

#print("X_train\n", X_train[:10])
#print("Y_train\n", Y_train[:10])
        
# BERT 토크나이저 및 모델 로드
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
# 모델이 이미 저장되어 있는지 확인하고, 저장된 모델이 있으면 불러오고 없으면 새로운 모델 생성
if os.path.exists(model_path):
    # 저장된 모델이 있을 경우 불러오기
    model = CustomBertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)
    model.load_state_dict(torch.load(model_path))
    print("Pre-train model loaded.")
else:
    # 저장된 모델이 없을 경우 새로운 모델 생성
    model = CustomBertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)
    print("New model generated.")

# 입력 데이터를 BERT의 입력 형식으로 변환
max_len = 128  # 입력 시퀀스의 최대 길이

input_ids = []
attention_masks = []

for info in X_train:
    encoded_dict = tokenizer.encode_plus(
                        info,                         # 환자 정보 및 증상
                        add_special_tokens = True,    # [CLS], [SEP] 토큰 추가
                        max_length = max_len,         # 최대 길이 지정
                        pad_to_max_length = True,     # 패딩을 추가하여 최대 길이로 맞춤
                        return_attention_mask = True, # 어텐션 마스크 생성
                        return_tensors = 'pt',        # PyTorch 텐서로 반환
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(Y_train)

# 데이터셋 및 데이터로더 생성
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = 0.8
train_dataset, val_dataset = train_test_split(dataset, test_size=1-train_size, random_state=42)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

# GPU 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

# 모델을 GPU로 이동
model.to(device)

# 옵티마이저 및 학습률 설정
# 기본 학습률 : 2e-6
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-6)

# 에폭 설정
epochs = 20

# 학습 루프
hidden_states_list = []  # 모든 에폭에 대한 hidden state를 저장할 리스트
# 학습 루프
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs[1]  # loss가 outputs의 두 번째 값입니다.
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch + 1}/{epochs}, Batch Loss: {loss.item()}')
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss}')

    # 모델 저장 및 평가
    model_save_path = f"Fine_tuned_epoch{epoch + 1}_BERT_Large.pt"
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved for epoch {epoch + 1} at {model_save_path}")
    
    model.eval()
    val_accuracy = 0
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs[0]  # logits가 outputs의 첫 번째 값입니다.
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        val_accuracy += (logits.argmax(axis=1) == label_ids).mean().item()

    print(f'Validation Accuracy for epoch {epoch + 1}: {val_accuracy / len(val_dataloader)}')


Some weights of CustomBertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Pre-train model loaded.




True
Epoch 1/20, Batch Loss: 0.3308902084827423
Epoch 1/20, Batch Loss: 0.14640307426452637
Epoch 1/20, Batch Loss: 0.12646996974945068
Epoch 1/20, Batch Loss: 0.43731361627578735
Epoch 1/20, Batch Loss: 0.3690773546695709
Epoch 1/20, Batch Loss: 0.3478721082210541
Epoch 1/20, Batch Loss: 0.4624733626842499
Epoch 1/20, Batch Loss: 0.08641890436410904
Epoch 1/20, Batch Loss: 0.19177794456481934
Epoch 1/20, Batch Loss: 0.14352092146873474
Epoch 1/20, Batch Loss: 0.34213677048683167
Epoch 1/20, Batch Loss: 0.18757636845111847
Epoch 1/20, Batch Loss: 0.3164503872394562
Epoch 1/20, Batch Loss: 0.3291611075401306
Epoch 1/20, Batch Loss: 0.7779244780540466
Epoch 1/20, Batch Loss: 0.5579509139060974
Epoch 1/20, Batch Loss: 0.40172895789146423
Epoch 1/20, Batch Loss: 0.09963130950927734
Epoch 1/20, Batch Loss: 0.13323451578617096
Epoch 1/20, Batch Loss: 0.3186250627040863
Epoch 1/20, Batch Loss: 0.13826097548007965
Epoch 1/20, Batch Loss: 0.415262907743454
Epoch 1/20, Batch Loss: 0.433831900358

Epoch 2/20, Batch Loss: 0.4249988794326782
Epoch 2/20, Batch Loss: 0.5439640879631042
Epoch 2/20, Batch Loss: 0.21461258828639984
Epoch 2/20, Batch Loss: 0.33965763449668884
Epoch 2/20, Batch Loss: 0.0884118527173996
Epoch 2/20, Batch Loss: 0.42314207553863525
Epoch 2/20, Batch Loss: 0.3232156038284302
Epoch 2/20, Batch Loss: 0.3046879768371582
Epoch 2/20, Batch Loss: 0.4255339503288269
Epoch 2/20, Batch Loss: 0.3486599922180176
Epoch 2/20, Batch Loss: 0.2701864242553711
Epoch 2/20, Batch Loss: 0.2589777410030365
Epoch 2/20, Batch Loss: 0.23708772659301758
Epoch 2/20, Batch Loss: 0.30392563343048096
Epoch 2/20, Batch Loss: 0.24922388792037964
Epoch 2/20, Batch Loss: 0.25344884395599365
Epoch 2/20, Batch Loss: 0.22737576067447662
Epoch 2/20, Batch Loss: 0.46291157603263855
Epoch 2/20, Batch Loss: 0.35108277201652527
Epoch 2/20, Batch Loss: 0.44227826595306396
Epoch 2/20, Batch Loss: 0.20201782882213593
Epoch 2/20, Batch Loss: 0.21920746564865112
Epoch 2/20, Batch Loss: 0.513879001140594

Model saved for epoch 3 at Fine_tuned_epoch3_BERT_Large.pt
Validation Accuracy for epoch 3: 0.8915770609318997
Epoch 4/20, Batch Loss: 0.44169414043426514
Epoch 4/20, Batch Loss: 0.2499275654554367
Epoch 4/20, Batch Loss: 0.3145858645439148
Epoch 4/20, Batch Loss: 0.36423948407173157
Epoch 4/20, Batch Loss: 0.08441095054149628
Epoch 4/20, Batch Loss: 0.45803046226501465
Epoch 4/20, Batch Loss: 0.24256236851215363
Epoch 4/20, Batch Loss: 0.07667956501245499
Epoch 4/20, Batch Loss: 0.4053555428981781
Epoch 4/20, Batch Loss: 0.25008949637413025
Epoch 4/20, Batch Loss: 0.1888357698917389
Epoch 4/20, Batch Loss: 0.22686167061328888
Epoch 4/20, Batch Loss: 0.32194435596466064
Epoch 4/20, Batch Loss: 0.32705387473106384
Epoch 4/20, Batch Loss: 0.1298724263906479
Epoch 4/20, Batch Loss: 0.12496038526296616
Epoch 4/20, Batch Loss: 0.3317922353744507
Epoch 4/20, Batch Loss: 0.5296743512153625
Epoch 4/20, Batch Loss: 0.6884816884994507
Epoch 4/20, Batch Loss: 0.20833174884319305
Epoch 4/20, Batch

Epoch 5/20, Batch Loss: 0.545096755027771
Epoch 5/20, Batch Loss: 0.09181273728609085
Epoch 5/20, Batch Loss: 0.1913040429353714
Epoch 5/20, Batch Loss: 0.0846710130572319
Epoch 5/20, Batch Loss: 0.3274506628513336
Epoch 5/20, Batch Loss: 0.3061172068119049
Epoch 5/20, Batch Loss: 0.3061218559741974
Epoch 5/20, Batch Loss: 0.4521753489971161
Epoch 5/20, Batch Loss: 0.24797192215919495
Epoch 5/20, Batch Loss: 0.13990555703639984
Epoch 5/20, Batch Loss: 0.10803885012865067
Epoch 5/20, Batch Loss: 0.47448107600212097
Epoch 5/20, Batch Loss: 0.5225085020065308
Epoch 5/20, Batch Loss: 0.43173378705978394
Epoch 5/20, Batch Loss: 0.4584067165851593
Epoch 5/20, Batch Loss: 0.448576420545578
Epoch 5/20, Batch Loss: 0.31164249777793884
Epoch 5/20, Batch Loss: 0.15844422578811646
Epoch 5/20, Batch Loss: 0.5310812592506409
Epoch 5/20, Batch Loss: 0.20239347219467163
Epoch 5/20, Batch Loss: 0.12966813147068024
Epoch 5/20, Batch Loss: 0.38668692111968994
Epoch 5/20, Batch Loss: 0.3709001839160919
Ep

Epoch 6/20, Batch Loss: 0.05702095851302147
Epoch 6/20, Average Training Loss: 0.3101183300519862
Model saved for epoch 6 at Fine_tuned_epoch6_BERT_Large.pt
Validation Accuracy for epoch 6: 0.8915770609318997
Epoch 7/20, Batch Loss: 0.243715301156044
Epoch 7/20, Batch Loss: 0.07560081034898758
Epoch 7/20, Batch Loss: 0.19830553233623505
Epoch 7/20, Batch Loss: 0.15946219861507416
Epoch 7/20, Batch Loss: 0.10080860555171967
Epoch 7/20, Batch Loss: 0.133074089884758
Epoch 7/20, Batch Loss: 0.5572410225868225
Epoch 7/20, Batch Loss: 0.34711337089538574
Epoch 7/20, Batch Loss: 0.3354780972003937
Epoch 7/20, Batch Loss: 0.2137962281703949
Epoch 7/20, Batch Loss: 0.4720677435398102
Epoch 7/20, Batch Loss: 0.21800081431865692
Epoch 7/20, Batch Loss: 0.2266256958246231
Epoch 7/20, Batch Loss: 0.28760331869125366
Epoch 7/20, Batch Loss: 0.22590728104114532
Epoch 7/20, Batch Loss: 0.23111240565776825
Epoch 7/20, Batch Loss: 0.20700323581695557
Epoch 7/20, Batch Loss: 0.18938034772872925
Epoch 7/

Epoch 8/20, Batch Loss: 0.48426830768585205
Epoch 8/20, Batch Loss: 0.262050062417984
Epoch 8/20, Batch Loss: 0.3231600821018219
Epoch 8/20, Batch Loss: 0.2254493236541748
Epoch 8/20, Batch Loss: 0.22737933695316315
Epoch 8/20, Batch Loss: 0.23151002824306488
Epoch 8/20, Batch Loss: 0.3516201674938202
Epoch 8/20, Batch Loss: 0.16941368579864502
Epoch 8/20, Batch Loss: 0.5181132555007935
Epoch 8/20, Batch Loss: 0.1318100094795227
Epoch 8/20, Batch Loss: 0.2100490927696228
Epoch 8/20, Batch Loss: 0.1976119875907898
Epoch 8/20, Batch Loss: 0.44589486718177795
Epoch 8/20, Batch Loss: 0.40800684690475464
Epoch 8/20, Batch Loss: 0.5147242546081543
Epoch 8/20, Batch Loss: 0.23760978877544403
Epoch 8/20, Batch Loss: 0.2901090085506439
Epoch 8/20, Batch Loss: 0.6227566599845886
Epoch 8/20, Batch Loss: 0.4298724830150604
Epoch 8/20, Batch Loss: 0.4868530333042145
Epoch 8/20, Batch Loss: 0.35648322105407715
Epoch 8/20, Batch Loss: 0.5221354961395264
Epoch 8/20, Batch Loss: 0.09898529201745987
Epo

Epoch 9/20, Batch Loss: 0.3356807827949524
Epoch 9/20, Batch Loss: 0.37545090913772583
Epoch 9/20, Batch Loss: 0.2232290357351303
Epoch 9/20, Batch Loss: 0.09584622830152512
Epoch 9/20, Average Training Loss: 0.304811847282619
Model saved for epoch 9 at Fine_tuned_epoch9_BERT_Large.pt
Validation Accuracy for epoch 9: 0.8915770609318997
Epoch 10/20, Batch Loss: 0.11155981570482254
Epoch 10/20, Batch Loss: 0.5969366431236267
Epoch 10/20, Batch Loss: 0.30927595496177673
Epoch 10/20, Batch Loss: 0.32679200172424316
Epoch 10/20, Batch Loss: 0.21726766228675842
Epoch 10/20, Batch Loss: 0.3540950417518616
Epoch 10/20, Batch Loss: 0.2854781448841095
Epoch 10/20, Batch Loss: 0.21634066104888916
Epoch 10/20, Batch Loss: 0.30813390016555786
Epoch 10/20, Batch Loss: 0.5944252014160156
Epoch 10/20, Batch Loss: 0.2316715568304062
Epoch 10/20, Batch Loss: 0.10060626268386841
Epoch 10/20, Batch Loss: 0.4919961094856262
Epoch 10/20, Batch Loss: 0.5033677816390991
Epoch 10/20, Batch Loss: 0.385129570960

Epoch 11/20, Batch Loss: 0.5336858630180359
Epoch 11/20, Batch Loss: 0.052354030311107635
Epoch 11/20, Batch Loss: 0.16520437598228455
Epoch 11/20, Batch Loss: 0.30349934101104736
Epoch 11/20, Batch Loss: 0.3084453344345093
Epoch 11/20, Batch Loss: 0.5167108774185181
Epoch 11/20, Batch Loss: 0.4835541248321533
Epoch 11/20, Batch Loss: 0.05393841117620468
Epoch 11/20, Batch Loss: 0.11627361923456192
Epoch 11/20, Batch Loss: 0.3222385346889496
Epoch 11/20, Batch Loss: 0.44505128264427185
Epoch 11/20, Batch Loss: 0.31007638573646545
Epoch 11/20, Batch Loss: 0.17127801477909088
Epoch 11/20, Batch Loss: 0.0697362869977951
Epoch 11/20, Batch Loss: 0.43099698424339294
Epoch 11/20, Batch Loss: 0.5408755540847778
Epoch 11/20, Batch Loss: 0.3268386125564575
Epoch 11/20, Batch Loss: 0.37497594952583313
Epoch 11/20, Batch Loss: 0.17370693385601044
Epoch 11/20, Batch Loss: 0.3635988235473633
Epoch 11/20, Batch Loss: 0.21087589859962463
Epoch 11/20, Batch Loss: 0.3973011076450348
Epoch 11/20, Batch 

Epoch 12/20, Batch Loss: 0.19912753999233246
Epoch 12/20, Batch Loss: 0.332487016916275
Epoch 12/20, Batch Loss: 0.20607773959636688
Epoch 12/20, Batch Loss: 0.1585891991853714
Epoch 12/20, Batch Loss: 0.18150559067726135
Epoch 12/20, Batch Loss: 0.12206932157278061
Epoch 12/20, Batch Loss: 0.2810411751270294
Epoch 12/20, Batch Loss: 0.16532403230667114
Epoch 12/20, Batch Loss: 0.1894705593585968
Epoch 12/20, Batch Loss: 0.15184880793094635
Epoch 12/20, Batch Loss: 0.2027232050895691
Epoch 12/20, Batch Loss: 0.2280677706003189
Epoch 12/20, Batch Loss: 0.4563782513141632
Epoch 12/20, Batch Loss: 0.27855384349823
Epoch 12/20, Batch Loss: 0.6962082982063293
Epoch 12/20, Batch Loss: 0.05181952193379402
Epoch 12/20, Average Training Loss: 0.2606612867940732
Model saved for epoch 12 at Fine_tuned_epoch12_BERT_Large.pt
Validation Accuracy for epoch 12: 0.8911290322580645
Epoch 13/20, Batch Loss: 0.06108178570866585
Epoch 13/20, Batch Loss: 0.20412175357341766
Epoch 13/20, Batch Loss: 0.120021

Epoch 14/20, Batch Loss: 0.26276713609695435
Epoch 14/20, Batch Loss: 0.46352627873420715
Epoch 14/20, Batch Loss: 0.14483094215393066
Epoch 14/20, Batch Loss: 0.2605418860912323
Epoch 14/20, Batch Loss: 0.21006068587303162
Epoch 14/20, Batch Loss: 0.1844845414161682
Epoch 14/20, Batch Loss: 0.18021754920482635
Epoch 14/20, Batch Loss: 0.14604908227920532
Epoch 14/20, Batch Loss: 0.32514405250549316
Epoch 14/20, Batch Loss: 0.18880704045295715
Epoch 14/20, Batch Loss: 0.06427573412656784
Epoch 14/20, Batch Loss: 0.28998202085494995
Epoch 14/20, Batch Loss: 0.17924854159355164
Epoch 14/20, Batch Loss: 0.11033443361520767
Epoch 14/20, Batch Loss: 0.20067590475082397
Epoch 14/20, Batch Loss: 0.03883372247219086
Epoch 14/20, Batch Loss: 0.24681007862091064
Epoch 14/20, Batch Loss: 0.10617101192474365
Epoch 14/20, Batch Loss: 0.1415925770998001
Epoch 14/20, Batch Loss: 0.10548442602157593
Epoch 14/20, Batch Loss: 0.2510630190372467
Epoch 14/20, Batch Loss: 0.6142996549606323
Epoch 14/20, Ba

Epoch 15/20, Batch Loss: 0.19013717770576477
Epoch 15/20, Batch Loss: 0.3148745894432068
Epoch 15/20, Batch Loss: 0.09901607781648636
Epoch 15/20, Batch Loss: 0.24039316177368164
Epoch 15/20, Batch Loss: 0.07671504467725754
Epoch 15/20, Batch Loss: 0.1397416740655899
Epoch 15/20, Batch Loss: 0.1015990749001503
Epoch 15/20, Batch Loss: 0.2733100354671478
Epoch 15/20, Batch Loss: 0.2375720739364624
Epoch 15/20, Batch Loss: 0.13918901979923248
Epoch 15/20, Batch Loss: 0.09152858704328537
Epoch 15/20, Batch Loss: 0.2754032015800476
Epoch 15/20, Batch Loss: 0.07145310938358307
Epoch 15/20, Batch Loss: 0.1868813931941986
Epoch 15/20, Batch Loss: 0.19092418253421783
Epoch 15/20, Batch Loss: 0.12204782664775848
Epoch 15/20, Batch Loss: 0.14708314836025238
Epoch 15/20, Batch Loss: 0.2007797360420227
Epoch 15/20, Batch Loss: 0.21827970445156097
Epoch 15/20, Batch Loss: 0.02983647771179676
Epoch 15/20, Batch Loss: 0.23305273056030273
Epoch 15/20, Batch Loss: 0.1931701898574829
Epoch 15/20, Batch 

Epoch 17/20, Batch Loss: 0.10128048807382584
Epoch 17/20, Batch Loss: 0.11621648073196411
Epoch 17/20, Batch Loss: 0.19030247628688812
Epoch 17/20, Batch Loss: 0.009902235120534897
Epoch 17/20, Batch Loss: 0.014426426962018013
Epoch 17/20, Batch Loss: 0.12686313688755035
Epoch 17/20, Batch Loss: 0.08638321608304977
Epoch 17/20, Batch Loss: 0.0658082515001297
Epoch 17/20, Batch Loss: 0.007705136202275753
Epoch 17/20, Batch Loss: 0.02463855966925621
Epoch 17/20, Batch Loss: 0.19893580675125122
Epoch 17/20, Batch Loss: 0.012671394273638725
Epoch 17/20, Batch Loss: 0.11689683794975281
Epoch 17/20, Batch Loss: 0.2630070447921753
Epoch 17/20, Batch Loss: 0.1663386970758438
Epoch 17/20, Batch Loss: 0.05296388640999794
Epoch 17/20, Batch Loss: 0.08943135291337967
Epoch 17/20, Batch Loss: 0.11785611510276794
Epoch 17/20, Batch Loss: 0.3053772449493408
Epoch 17/20, Batch Loss: 0.21584977209568024
Epoch 17/20, Batch Loss: 0.1367301493883133
Epoch 17/20, Batch Loss: 0.08547291159629822
Epoch 17/20

Epoch 18/20, Batch Loss: 0.008050523698329926
Epoch 18/20, Batch Loss: 0.2926294207572937
Epoch 18/20, Batch Loss: 0.17167049646377563
Epoch 18/20, Batch Loss: 0.16632629930973053
Epoch 18/20, Batch Loss: 0.052897896617650986
Epoch 18/20, Batch Loss: 0.09491753578186035
Epoch 18/20, Batch Loss: 0.07684548944234848
Epoch 18/20, Batch Loss: 0.03623305633664131
Epoch 18/20, Batch Loss: 0.14714865386486053
Epoch 18/20, Batch Loss: 0.0288859736174345
Epoch 18/20, Batch Loss: 0.03675215318799019
Epoch 18/20, Batch Loss: 0.1643875241279602
Epoch 18/20, Batch Loss: 0.22752827405929565
Epoch 18/20, Batch Loss: 0.037227120250463486
Epoch 18/20, Batch Loss: 0.012126506306231022
Epoch 18/20, Batch Loss: 0.27505046129226685
Epoch 18/20, Batch Loss: 0.16207093000411987
Epoch 18/20, Batch Loss: 0.0847943052649498
Epoch 18/20, Batch Loss: 0.09003118425607681
Epoch 18/20, Batch Loss: 0.038719650357961655
Epoch 18/20, Batch Loss: 0.03900662809610367
Epoch 18/20, Batch Loss: 0.17195899784564972
Epoch 18/

Epoch 20/20, Batch Loss: 0.08106157928705215
Epoch 20/20, Batch Loss: 0.1301436573266983
Epoch 20/20, Batch Loss: 0.13358084857463837
Epoch 20/20, Batch Loss: 0.013361196033656597
Epoch 20/20, Batch Loss: 0.13468234241008759
Epoch 20/20, Batch Loss: 0.08331257104873657
Epoch 20/20, Batch Loss: 0.02119387686252594
Epoch 20/20, Batch Loss: 0.24267525970935822
Epoch 20/20, Batch Loss: 0.1249944344162941
Epoch 20/20, Batch Loss: 0.02781441994011402
Epoch 20/20, Batch Loss: 0.21030697226524353
Epoch 20/20, Batch Loss: 0.008212164975702763
Epoch 20/20, Batch Loss: 0.07069560140371323
Epoch 20/20, Batch Loss: 0.06323979049921036
Epoch 20/20, Batch Loss: 0.004910886287689209
Epoch 20/20, Batch Loss: 0.03518778830766678
Epoch 20/20, Batch Loss: 0.04432138055562973
Epoch 20/20, Batch Loss: 0.06531185656785965
Epoch 20/20, Batch Loss: 0.008474848233163357
Epoch 20/20, Batch Loss: 0.04587271064519882
Epoch 20/20, Batch Loss: 0.12203588336706161
Epoch 20/20, Batch Loss: 0.006231021601706743
Epoch 2

In [16]:
# 데이터 랜덤분할(500/500/250)
import pandas as pd
import numpy as np

def sample_csv_and_additional(input_file, output_file_500, output_file_100, n_500):
    # CSV 파일을 읽어옵니다.
    data = pd.read_csv(input_file)
    
    # 데이터를 랜덤하게 샘플링합니다.
    sampled_data_750 = data.sample(n=n_500, random_state=42)
    
    # 첫 250개 데이터를 output_file_500과 output_file_100에 순서대로 삽입합니다.
    first_250 = sampled_data_750[:250]
    first_250.to_csv(output_file_500, index=False)
    first_250.to_csv(output_file_100, index=False)
    
    # 나머지 500개 데이터를 절반으로 나누어 각각 output_file_500과 output_file_100에 추가합니다.
    remaining_500 = sampled_data_750[250:]
    split_idx = len(remaining_500) // 2
    second_250_500 = remaining_500[:split_idx]
    second_250_100 = remaining_500[split_idx:]
    
    # 파일에 추가합니다.
    second_250_500.to_csv(output_file_500, mode='a', header=False, index=False)
    second_250_100.to_csv(output_file_100, mode='a', header=False, index=False)

# 입력 CSV 파일 경로
input_file = "output6.csv"

# 출력 CSV 파일 경로
output_file_500 = "random_500_D.csv"
output_file_100 = "random_500_C.csv"

# 랜덤하게 추출할 데이터 개수
n_500 = 750

# 함수 호출
sample_csv_and_additional(input_file, output_file_500, output_file_100, n_500)



In [18]:
# 데이터 랜덤분할(300/500)
import pandas as pd

def sample_csv_and_additional(input_file, output_file_500, output_file_100, n_500):
    # CSV 파일을 읽어옵니다.
    data = pd.read_csv(input_file)
    
    # 데이터를 랜덤하게 샘플링합니다.
    sampled_data_500 = data.sample(n=n_500, random_state=42)
    
    # 샘플링된 500개의 데이터를 CSV 파일로 내보냅니다.
    sampled_data_500.to_csv(output_file_500, index=False)
    
    # sampled_data_500에서 첫 100개의 데이터를 선택합니다.
    sampled_data_100 = sampled_data_500.head(500)
    
    # 선택된 첫 100개의 데이터를 CSV 파일로 내보냅니다.
    sampled_data_100.to_csv(output_file_100, index=False)

# 입력 CSV 파일 경로
input_file = "output6.csv"

# 출력 CSV 파일 경로
output_file_500 = "random_500.csv"
output_file_100 = "random_300.csv"

# 랜덤하게 추출할 데이터 개수
n_500 = 1000

# 함수 호출
sample_csv_and_additional(input_file, output_file_500, output_file_100, n_500)


In [89]:
# smashed data 생성 (500/server side)
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.model_selection import train_test_split

class CustomBertForSequenceClassification(BertForSequenceClassification):
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        labels=None,
        output_hidden_states=True
    ):
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            labels=labels,
            output_hidden_states=output_hidden_states
        )
        logits = outputs.logits
        hidden_states = outputs.hidden_states[12]  # n번째 레이어의 hidden states를 반환합니다.
        loss = outputs.loss
        return logits, loss, hidden_states

# 데이터 로드 및 전처리
data_A = pd.read_csv("random_500.csv")  # data set A 파일명에 맞게 수정
data_B = pd.read_csv("infected.csv")  # data set B 파일명에 맞게 수정
# 모델 저장 경로
model_path = "Pre_train_epoch2_BERT_Large.pt"

# X_train, Y_train 생성
X_train = []
Y_train = []

for index, row in data_A.iterrows():  # 중복 제거를 하지 않고 원본 데이터 사용
    patient_id = row["ID"]
    patient_info = [str(row[column]) for column in data_A.columns if column != "ID" and column != "DESCRIPTION"]
    symptoms = ", ".join(data_A[data_A["ID"] == patient_id]["DESCRIPTION"].tolist())
    combined_info = ", ".join(patient_info) + ", " + symptoms
    X_train.append(combined_info)
    if patient_id in data_B.values:
        Y_train.append(1)
    else:
        Y_train.append(0)

#print("X_train\n", X_train[:10])
#print("Y_train\n", Y_train[:10])
        
# BERT 토크나이저 및 모델 로드
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
# 모델이 이미 저장되어 있는지 확인하고, 저장된 모델이 있으면 불러오고 없으면 새로운 모델 생성
if os.path.exists(model_path):
    # 저장된 모델이 있을 경우 불러오기
    model = CustomBertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)
    model.load_state_dict(torch.load(model_path))
    print("Pre-train model loaded.")
else:
    # 저장된 모델이 없을 경우 새로운 모델 생성
    model = CustomBertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)
    print("New model generated.")

# 입력 데이터를 BERT의 입력 형식으로 변환
max_len = 128  # 입력 시퀀스의 최대 길이

input_ids = []
attention_masks = []

for info in X_train:
    encoded_dict = tokenizer.encode_plus(
                        info,                         # 환자 정보 및 증상
                        add_special_tokens = True,    # [CLS], [SEP] 토큰 추가
                        max_length = max_len,         # 최대 길이 지정
                        pad_to_max_length = True,     # 패딩을 추가하여 최대 길이로 맞춤
                        return_attention_mask = True, # 어텐션 마스크 생성
                        return_tensors = 'pt',        # PyTorch 텐서로 반환
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(Y_train)

# 데이터셋 생성
dataset = TensorDataset(input_ids, attention_masks, labels)

# 데이터로더 생성
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

# GPU 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

# 모델을 GPU로 이동
model.to(device)

# 모델 평가
model.eval()
val_accuracy = 0
hidden_states_list = []  # 평가할 때 hidden state를 저장할 리스트
for batch in dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs[0]  # logits가 outputs의 첫 번째 값입니다.
    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()
    val_accuracy += (logits.argmax(axis=1) == label_ids).mean().item()
    # hidden state를 저장합니다.
    hidden_states = outputs[2]
    hidden_states_list.append(hidden_states)
hidden_states_concat = torch.cat(hidden_states_list, dim=0)
hidden_states_concat = hidden_states_concat[:, 0, :].cpu().detach().numpy()
hidden_states_df = pd.DataFrame(hidden_states_concat)
hidden_states_df.to_csv("Dictionary_smashed_data_layer24.csv", index=False)

print(f'Validation Accuracy: {val_accuracy / len(dataloader)}')


Some weights of CustomBertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Pre-train model loaded.




True
Validation Accuracy: 0.8819444444444444


In [94]:
# smashed data 생성 (100/client side) #라이브러리 개변
import os
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.model_selection import train_test_split

class CustomBertForSequenceClassification(BertForSequenceClassification):
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        labels=None,
        output_hidden_states=True
    ):
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            labels=labels,
            output_hidden_states=output_hidden_states
        )
        logits = outputs.logits
        hidden_states = outputs.hidden_states[12]  # n번째 레이어의 hidden states를 반환합니다.
        #for j in range(len(hidden_states)):  # hidden_states는 768차원으로 구성되어있음
        #    noise = np.random.normal(0, 10.0)  # 표준 정규 분포에서 적절한 분산값을 사용하여 랜덤한 노이즈 생성
        #    hidden_states[j] += noise  # hidden_states의 값에 노이즈 추가
        loss = outputs.loss
        return logits, loss, hidden_states

# 데이터 로드 및 전처리
data_A = pd.read_csv("random_300.csv")  # data set A 파일명에 맞게 수정
data_B = pd.read_csv("infected.csv")  # data set B 파일명에 맞게 수정
# 모델 저장 경로
model_path = "Fine_tuned_epoch20_BERT_Large.pt"

# X_train, Y_train 생성
X_train = []
Y_train = []

for index, row in data_A.iterrows():  # 중복 제거를 하지 않고 원본 데이터 사용
    patient_id = row["ID"]
    patient_info = [str(row[column]) for column in data_A.columns if column != "ID" and column != "DESCRIPTION"]
    symptoms = ", ".join(data_A[data_A["ID"] == patient_id]["DESCRIPTION"].tolist())
    combined_info = ", ".join(patient_info) + ", " + symptoms
    X_train.append(combined_info)
    if patient_id in data_B.values:
        Y_train.append(1)
    else:
        Y_train.append(0)

#print("X_train\n", X_train[:10])
#print("Y_train\n", Y_train[:10])
        
# BERT 토크나이저 및 모델 로드
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
# 모델이 이미 저장되어 있는지 확인하고, 저장된 모델이 있으면 불러오고 없으면 새로운 모델 생성
if os.path.exists(model_path):
    # 저장된 모델이 있을 경우 불러오기
    model = CustomBertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)
    model.load_state_dict(torch.load(model_path), strict=False)
    print("Pre-train model loaded.")
else:
    # 저장된 모델이 없을 경우 새로운 모델 생성
    model = CustomBertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=2)
    print("New model generated.")

# 입력 데이터를 BERT의 입력 형식으로 변환
max_len = 128  # 입력 시퀀스의 최대 길이

input_ids = []
attention_masks = []

for info in X_train:
    encoded_dict = tokenizer.encode_plus(
                        info,                         # 환자 정보 및 증상
                        add_special_tokens = True,    # [CLS], [SEP] 토큰 추가
                        max_length = max_len,         # 최대 길이 지정
                        pad_to_max_length = True,     # 패딩을 추가하여 최대 길이로 맞춤
                        return_attention_mask = True, # 어텐션 마스크 생성
                        return_tensors = 'pt',        # PyTorch 텐서로 반환
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(Y_train)

# 데이터셋 생성
dataset = TensorDataset(input_ids, attention_masks, labels)

# 데이터로더 생성
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

# GPU 사용 가능 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

# 모델을 GPU로 이동
model.to(device)

# 모델 평가
model.eval()
val_accuracy = 0
hidden_states_list = []  # 평가할 때 hidden state를 저장할 리스트
for batch in dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs[0]  # logits가 outputs의 첫 번째 값입니다.
    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()
    val_accuracy += (logits.argmax(axis=1) == label_ids).mean().item()
    # hidden state를 저장합니다.
    hidden_states = outputs[2]
    hidden_states_list.append(hidden_states)
hidden_states_concat = torch.cat(hidden_states_list, dim=0)
hidden_states_concat = hidden_states_concat[:, 0, :].cpu().detach().numpy()
hidden_states_df = pd.DataFrame(hidden_states_concat)
hidden_states_df.to_csv("Client_smashed_data_layer24.csv", index=False)

print(f'Validation Accuracy: {val_accuracy / len(dataloader)}')


Some weights of CustomBertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Pre-train model loaded.




True
Validation Accuracy: 0.99609375


In [95]:
# 유클리드 거리 유사도
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def calculate_accuracy_and_distance(client_file, dictionary_file, original_file_client, original_file_dictionary, n=5):
    # 변환된 파일을 읽어옵니다.
    client_data = pd.read_csv(client_file)
    dictionary_data = pd.read_csv(dictionary_file)
    
    # 원본 파일을 읽어옵니다.
    original_client_data = pd.read_csv(original_file_client)
    original_dictionary_data = pd.read_csv(original_file_dictionary)
    
    # 데이터 포인트 간의 유클리드 거리를 계산합니다.
    distances = euclidean_distances(client_data.values, dictionary_data.values)
    # Top@n 유사도를 찾습니다.
    topn_similarities = np.argsort(distances, axis=1)[:, :n]
    topn_values = np.sort(distances, axis=1)[:, :n]
    
    # 모든 결과를 출력하고 정확도를 계산합니다.
    successful_distances = []
    unsuccessful_distances = []
    successes = 0
    success_indices = []  # 성공한 인덱스를 저장할 리스트
    
    success_ranks_count = {rank: 0 for rank in range(1, n+1)}  # 각 성공한 서버 측 랭크의 수를 저장할 딕셔너리
    for i, (indices, scores) in enumerate(zip(topn_similarities, topn_values)):
        """print(f"\nTop {n} inferences for client {i + 1}:")"""
        for rank, (idx, score) in enumerate(zip(indices, scores), 1):
            """print(f"Server {idx + 1} with distance {score}")"""
            if original_client_data.iloc[i].equals(original_dictionary_data.iloc[idx]):
                successes += 1
                successful_distances.append(score)
                success_indices.append((i + 1, rank))  # 성공한 인덱스를 추가
                success_ranks_count[rank] += 1  # 해당 랭크의 수를 증가시킴
            else:
                unsuccessful_distances.append(score)
        if successes == 0:
            print("No successful match found.")
    
    # 정확도 계산
    accuracy = successes / len(client_data)
    
    # 성공적으로 일치하는 데이터 포인트와 클라이언트 데이터 포인트, 그리고 일치하지 않는 데이터 포인트와 클라이언트 데이터 포인트 간의 평균 거리를 계산합니다.
    successful_mean_distance = np.mean(successful_distances)
    unsuccessful_mean_distance = np.mean(unsuccessful_distances)
    
    # 평균 거리의 분산 계산
    successful_distance_variance = np.var(successful_distances)
    unsuccessful_distance_variance = np.var(unsuccessful_distances)
    
    return accuracy, successful_mean_distance, unsuccessful_mean_distance, success_indices, successful_distance_variance, unsuccessful_distance_variance, success_ranks_count

# 변환된 파일 경로
dictionary_file = "Dictionary_smashed_data_layer24.csv"

# 원본 파일 경로
original_file_client = "random_300.csv"
original_file_dictionary = "random_500.csv"

# Top n 설정
n = 5

# 정확도 계산 및 평균 거리 계산

client_file = f'Client_smashed_data_layer24.csv'
accuracy, successful_mean_distance, unsuccessful_mean_distance, success_indices, successful_distance_variance, unsuccessful_distance_variance, success_ranks_count = calculate_accuracy_and_distance(client_file, dictionary_file, original_file_client, original_file_dictionary, n)

print("\nFor file:", client_file)
print("Accuracy:", accuracy)
print("Successful Mean Distance:", successful_mean_distance)
print("Unsuccessful Mean Distance:", unsuccessful_mean_distance)

# 분산 출력
print("Successful Distance Variance:", successful_distance_variance)
print("Unsuccessful Distance Variance:", unsuccessful_distance_variance)

# 성공한 인덱스들을 출력합니다.
print("Success Indices:", success_indices)

# 각 성공한 서버 측 랭크의 수를 출력합니다.
print("Success Ranks Count:")
for rank, count in success_ranks_count.items():
    print(f"Rank {rank}: {count} successes")



For file: Client_smashed_data_layer24.csv
Accuracy: 0.774
Successful Mean Distance: 4.1270358728788334
Unsuccessful Mean Distance: 4.406488580300589
Successful Distance Variance: 0.12063768643880003
Unsuccessful Distance Variance: 0.15130207213235836
Success Indices: [(1, 1), (2, 1), (3, 2), (4, 1), (7, 1), (8, 1), (9, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 2), (18, 1), (19, 1), (20, 4), (21, 1), (22, 1), (25, 2), (27, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (37, 1), (38, 1), (40, 5), (41, 1), (42, 1), (43, 1), (44, 1), (45, 4), (46, 1), (47, 1), (48, 1), (49, 3), (50, 1), (53, 1), (54, 1), (55, 4), (56, 1), (58, 1), (59, 1), (61, 1), (63, 1), (65, 3), (66, 1), (67, 1), (68, 4), (70, 1), (72, 1), (74, 1), (75, 1), (76, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (87, 2), (89, 1), (91, 1), (92, 1), (93, 1), (94, 2), (95, 1), (97, 1), (98, 1), (99, 1), (100, 2), (101, 1), (102, 1), (103, 3), (104, 1), (108, 2), (109, 1), (110, 3), 