In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from google.colab import drive
drive.mount('/content/drive')

DATA_PATH = "/content/drive/MyDrive/data/"

SEED = 42 # seed value

# Medical-cost prediction data
train = pd.read_csv(f"{DATA_PATH}insurance_train.csv")
test = pd.read_csv(f"{DATA_PATH}insurance_test.csv")

# Encode the binary categories as 0/1 (values missing from the dicts become NaN via .map)
sex_dict = {"male": 1, "female": 0}
smoker_dict = {"yes":1, "no": 0}
train["sex"] = train["sex"].map(sex_dict)
train["smoker"] = train["smoker"].map(smoker_dict)
test["sex"] = test["sex"].map(sex_dict)
test["smoker"] = test["smoker"].map(smoker_dict)

# Select the feature columns: every train column except the last one; test is used whole
# NOTE(review): assumes the last train column is the target — confirm against the CSV
train_ft = train.iloc[:,:-1].copy()
test_ft = test.copy()

# One-hot encode the categorical column(s); the encoder is fitted on train only
cols = ['region']
enc = OneHotEncoder(handle_unknown = 'ignore')
enc.fit(train[cols])
tmp = pd.DataFrame(
    enc.transform(train_ft[cols]).toarray(),
    columns = enc.get_feature_names_out()
)
# concat aligns on the index; tmp carries a fresh default index
# NOTE(review): assumes train_ft/test_ft still have the default index from read_csv — confirm
train_ft = pd.concat([train_ft,tmp],axis=1).drop(columns=cols)
tmp = pd.DataFrame(
    enc.transform(test_ft[cols]).toarray(),
    columns = enc.get_feature_names_out()
)
test_ft = pd.concat([test_ft,tmp],axis=1).drop(columns=cols)

# Min-Max scaling: fit on the train features, then apply the same scaler to both sets
scaler = MinMaxScaler()
scaler.fit(train_ft)
train_ft = scaler.transform(train_ft)
test_ft = scaler.transform(test_ft)

# Target values
target = train["target"].to_numpy().reshape(-1,1) # reshape the targets into a 2-D column

train_ft.shape, test_ft.shape, target.shape

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


((936, 9), (402, 9), (936, 1))

# 재현성 함수(Reproducibility)

In [None]:
import random, os
import torch
def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), and configures cuDNN for deterministic
    behavior.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point; the hash seed of
    # the running process was fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one (silently ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different (non-deterministic) kernels per run.
    torch.backends.cudnn.benchmark = False

# 데이터셋 클래스
- 회귀 문제 : 입력데이터+정답데이터 모두 float32형태 텐서 > Tensor클래스

In [None]:
class InsuranceDataset(torch.utils.data.Dataset):
  """Map-style dataset that serves float32 feature (and optional target) tensors.

  Args:
    x: Indexable collection of feature rows (e.g. a 2-D NumPy array).
    y: Optional indexable collection of targets aligned with ``x``.
      Leave as ``None`` for unlabeled data; items then carry no ``'y'`` key.
  """
  def __init__(self, x, y= None):
    self.x = x
    self.y = y

  def __len__(self):
    return len(self.x)

  def __getitem__(self, idx):
    """Return ``{'x': tensor}`` plus ``'y'`` when targets were provided."""
    # torch.as_tensor with an explicit dtype replaces the legacy
    # torch.Tensor(...) constructor, which interprets an integer argument as
    # a size rather than a value. Scalar entries (e.g. from 1-D targets) now
    # become 0-d float32 tensors holding the actual value.
    item = {'x': torch.as_tensor(self.x[idx], dtype=torch.float32)}
    if self.y is not None: # labeled data only
      item['y'] = torch.as_tensor(self.y[idx], dtype=torch.float32)
    return item

- 결과 확인하기

In [None]:
# Wrap the scaled training features and targets, then inspect the first sample.
dt = InsuranceDataset(train_ft, target)
dt[0]

{'x': tensor([0.4130, 1.0000, 0.5443, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000]),
 'y': tensor([19214.7051])}

In [None]:
# Tiny unshuffled loader so the first batch is easy to compare with dt[0].
dl = torch.utils.data.DataLoader(dt, batch_size = 2, shuffle = False)
batch = next(iter(dl))
batch

{'x': tensor([[0.4130, 1.0000, 0.5443, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000],
         [0.0000, 1.0000, 0.3608, 0.2000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000]]),
 'y': tensor([[19214.7051],
         [ 1719.4363]])}

In [None]:
# Shapes of the collated feature and target tensors for one batch.
batch['x'].shape, batch['y'].shape

(torch.Size([2, 9]), torch.Size([2, 1]))

# 인공신경망 모델클래스

In [None]:
class Net(torch.nn.Module):
  """Fully connected regressor: n_features -> 12 -> 8 -> 4 -> 1.

  Each hidden Linear layer is followed by BatchNorm1d and LeakyReLU; the
  final Linear layer emits a single value per sample with no activation.

  Args:
    n_features: Number of input features per sample.
  """
  def __init__(self, n_features):
    super().__init__() # nn.Module must be initialised before registering submodules
    widths = [n_features, 12, 8, 4]
    stack = []
    # Hidden blocks, created in order so the Sequential indices stay 0..8.
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
      stack.append(torch.nn.Linear(fan_in, fan_out)) # fully connected layer
      stack.append(torch.nn.BatchNorm1d(fan_out)) # batch normalisation
      stack.append(torch.nn.LeakyReLU()) # activation
    stack.append(torch.nn.Linear(widths[-1], 1)) # output layer
    self.seq = torch.nn.Sequential(*stack)

  def forward(self, x):
    """Run ``x`` through the layer stack and return the predictions."""
    return self.seq(x)

- 결과 확인하기

In [None]:
# Build the network for the current feature count and run the sample batch through it.
model = Net(train_ft.shape[1])
model(batch['x'])

tensor([[-0.0214],
        [-0.5611]], grad_fn=<AddmmBackward0>)

# 하이퍼파라미터 정의

In [None]:
# Use the GPU when one is available; the loop functions move every batch to this device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The parameters must live on the same device as the batches, otherwise the
# forward pass fails with a device mismatch on a CUDA runtime.
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = torch.nn.MSELoss() # regression problem

# 학습데이터 배치단위 loop함수화
- dataloader객체, 모델 객체, 손실함수 객체, 옵티마이저 객체, 장치이동 문자열 변수

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
  """Train ``model`` for one pass over ``dataloader``.

  Args:
    dataloader: Sized iterable of batches; each batch is a mapping with
      ``'x'`` and ``'y'`` tensors.
    model: Network to optimise; switched to training mode.
    loss_fn: Callable taking ``(prediction, target)`` and returning a scalar loss tensor.
    optimizer: Optimizer that updates the model's parameters.
    device: Device that each batch is moved to before the forward pass.

  Returns:
    The sum of the per-batch losses divided by the number of batches.
  """
  model.train() # enable training-mode behavior
  batch_losses = []
  for batch in dataloader:
    outputs = model(batch['x'].to(device)) # forward pass
    targets = batch['y'].to(device)
    loss = loss_fn(outputs, targets)

    optimizer.zero_grad() # drop gradients left over from the previous step
    loss.backward() # backpropagation
    optimizer.step() # parameter update
    batch_losses.append(loss.item())

  # Average over batches to get the epoch loss.
  return sum(batch_losses) / len(dataloader)

- 결과 확인하기

In [None]:
# One training pass over the current loader; the displayed value is the mean batch loss.
train_loop(dl, model, loss_fn, optimizer, device) # epoch_loss

327904932.241453

# 테스트데이터 loop함수화
- dataloader객체, 모델객체, 손실함수 객체, 장치이동 문자열 변수

In [None]:
@torch.no_grad() # with 사용과 같은 의미, 기울기 '0'초기화 x
def test_loop(dataloader, model, loss_fn, device):
  epoch_loss = 0
  model.eval() # 평가모드
  # act = torch.nn.Sigmoid() 회귀에서 사용 x

  pred_list = []
  for batch in dataloader:
    pred = model(batch['x'].to(device))
    if batch.get('y') is not None: # 검증데이터일 경우, y키에 텐서가 있을 경우만 loss계산
      loss = loss_fn(pred, batch['y'].to(device))
      epoch_loss += loss.item() # loss를 자료형으로 변환


    pred = pred.to('cpu').numpy()
    pred_list.append(pred)

  epoch_loss /= len(dataloader)
  pred = np.concatenate(pred_list)

  return epoch_loss, pred

- 결과 확인하기

In [None]:
# Unlabeled dataset: no targets are passed, so items only carry the 'x' key.
dt = InsuranceDataset(test_ft)
dt[0]

{'x': tensor([0.5435, 1.0000, 0.2709, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000])}

In [None]:
# Inference over the unlabeled test set; the reported loss is 0.0 because no batch has a 'y' key.
dl = torch.utils.data.DataLoader(dt, batch_size = 32, shuffle = False)
test_loop(dl, model, loss_fn, device)

(0.0,
 array([[ 1.5901839 ],
        [ 2.5913196 ],
        [ 2.0153883 ],
        [ 1.9816443 ],
        [ 2.4407916 ],
        [ 2.2267334 ],
        [ 1.4737632 ],
        [ 1.2028006 ],
        [ 0.93321675],
        [ 1.2395098 ],
        [ 0.761082  ],
        [ 2.1146073 ],
        [ 1.5221583 ],
        [ 1.2666556 ],
        [ 1.4212456 ],
        [ 1.2618707 ],
        [ 0.76301706],
        [ 1.5831192 ],
        [ 2.1816368 ],
        [ 0.9332702 ],
        [ 1.9968833 ],
        [-0.07767741],
        [ 2.8234005 ],
        [ 1.6249398 ],
        [ 2.019032  ],
        [ 1.2288489 ],
        [ 1.9498318 ],
        [ 1.4348046 ],
        [ 0.95595807],
        [ 0.74870074],
        [ 0.9589471 ],
        [ 1.0762931 ],
        [ 1.4612439 ],
        [ 2.1285121 ],
        [ 0.72214264],
        [ 1.8495363 ],
        [ 0.5571919 ],
        [ 1.4682883 ],
        [ 2.5996523 ],
        [ 0.6454389 ],
        [ 0.44759154],
        [ 1.5026822 ],
        [ 0.9395996 ],
     

In [None]:
# Keep only the predictions (the loss is discarded) and check their shape.
_, pred = test_loop(dl, model, loss_fn, device)
pred.shape

(402, 1)

# loop함수화 n회 반복문

In [None]:
# Rebuild the labeled training dataset and a batch-32 loader for the repeated-epoch demo.
dt = InsuranceDataset(train_ft, target)
dl = torch.utils.data.DataLoader(dt, batch_size = 32, shuffle = False)

In [None]:
# Call the training pass repeatedly and print the epoch loss after each pass.
for _ in range(10): # repeat 10 times
  loss = train_loop(dl, model, loss_fn, optimizer, device)
  print(loss)

327338226.1333333
327328778.6666667
327320022.4
327310851.73333335
327300812.8
327289936.0
327277845.3333333
327264895.46666664
327251141.3333333
327236547.73333335


# Holdout방식 학습 및 검증

In [None]:
from sklearn.model_selection import train_test_split
# Hold-out split with train_test_split's default proportions, seeded for a repeatable partition.
x_train, x_valid, y_train, y_valid = train_test_split(train_ft,target, random_state = SEED)
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((702, 9), (234, 9), (702, 1), (234, 1))

# 하이퍼파라미터 정의

In [None]:
# Settings for the hold-out run.
batch_size = 32
loss_fn = torch.nn.L1Loss() # MAE
epochs = 20 # number of epochs
n_features = x_train.shape[1] # input width for the network

- epoch 수만큼 반복

In [None]:
# Seed all generators so weight initialisation and batch shuffling repeat across runs.
reset_seeds(SEED)

train_dt = InsuranceDataset(x_train, y_train)
train_dl = torch.utils.data.DataLoader(train_dt, batch_size = batch_size, shuffle = True) # shuffle the training data

valid_dt = InsuranceDataset(x_valid, y_valid)
# Must wrap valid_dt: wrapping train_dt here would report the training loss as the validation loss.
valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size = batch_size, shuffle = False) # keep evaluation order fixed

# Fresh model on the same device the loops move each batch to.
model = Net(n_features).to(device)
optimizer = torch.optim.Adam(model.parameters())
for _ in range(epochs):
  train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
  valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)
  print(train_loss, valid_loss)

13139.122292258522 13124.174360795454
13126.5517578125 13123.991432883522
13132.63631924716 13123.862482244318
13125.81729403409 13123.782049005682
13129.223366477272 13123.678000710228
13133.185236150568 13123.57186612216
13143.908647017046 13123.443137428978
13144.54403409091 13123.32510653409
13120.936212713068 13123.176047585228
13129.400213068182 13123.041681463068
13122.774636008522 13122.873046875
13128.375133167614 13122.70370205966
13140.535333806818 13122.503595525568
13133.541148792614 13122.303533380682
13115.111061789772 13122.11922940341
13130.197221235796 13121.941761363636
13134.14169034091 13121.72811612216
13135.987082741478 13121.47159090909
13126.15234375 13121.17147549716
13125.035378196022 13120.894176136364
