### 변수 조합별 R2 Score 측정

In [1]:
import pandas as pd

origin_data1 = pd.read_csv('data2/CGM1_dS&dSr.csv', encoding='cp949')
origin_data2 = pd.read_csv('data2/CGM2_dS&dSr.csv', encoding='cp949')
origin_data3 = pd.read_csv('data2/CGM3_dS&dSr.csv', encoding='cp949')
origin_data4 = pd.read_csv('data2/CGM4_dS&dSr.csv', encoding='cp949')
origin_data5 = pd.read_csv('data2/CGM5_dS&dSr.csv', encoding='cp949')

def filter_data(origin_data, columns):
    data = origin_data[columns]
    data = data[data["Glu(mg/dl)"] > 0]
    return data

In [2]:
common_var = ["Glu(mg/dl)", "LD1 Temp", "LD2 Temp", "LD3 Temp", "Rx1 Temp", "Rx2 Temp", "S1 T(C)", "S2 T(C)", "S3 T(C)", "FR Mon", "LD_Bias_Av",
             "mPD1_dS", "mPD2_dS", "mPD3_dS", "T-rPD_L1dS", "T-rPD_L2dS", "T-rPD_L3dS", "R-rPD_L1dS", "R-rPD_L2dS", "R-rPD_L3dS",
             "mPDdSr31", "mPDdSr32", "mPDdSr21", "T-rPDdSr31", "T-rPDdSr32", "T-rPDdSr21", "R-rPDdSr31", "R-rPDdSr32", "R-rPDdSr21"]


dataR1 = filter_data(origin_data1, common_var)
dataR2 = filter_data(origin_data2, common_var)
dataR3 = filter_data(origin_data3, common_var)
dataR4 = filter_data(origin_data4, common_var)
dataR5 = filter_data(origin_data5, common_var)

# dataC= pd.concat([dataR1, dataR2, dataR4, dataR5])

print(len(dataR1), len(dataR2), len(dataR3), len(dataR4), len(dataR5))

295 106 187 342 170


In [3]:
import torch
import numpy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

def preprocess(train_data, test_data) -> tuple:
    train_data = train_data.astype({'Glu(mg/dl)':'int'})
    test_data = test_data.astype({'Glu(mg/dl)':'int'})
    
    X_train = train_data.drop('Glu(mg/dl)', axis=1).values
    y_train = train_data['Glu(mg/dl)'].values
    y_train = y_train.round(0)
    
    X_test = test_data.drop('Glu(mg/dl)', axis=1).values
    y_test = test_data['Glu(mg/dl)'].values
    y_test = y_test.round(0)
    
    sum_data = pd.concat([train_data, test_data])
    sum_data = sum_data.drop('Glu(mg/dl)', axis=1).values
    
    scaler = StandardScaler()
    scaler.fit(sum_data)
    
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    return torch.tensor(X_train, dtype=torch.float32), \
        torch.tensor(y_train, dtype=torch.float32), \
        torch.tensor(X_test, dtype=torch.float32), \
        torch.tensor(y_test, dtype=torch.float32)


def count_r2(model, loder, isDT) -> tuple:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    y_pred = []
    y_true = []
    
    if isDT == False:
        model.to(device)
        model.eval()
        with torch.no_grad():
            for X, y in loder:
                X = X.to(device)
                y = y.to(device)
                y_hat = model(X)
                y_pred.append(y_hat.cpu().numpy())
                y_true.append(y.cpu().numpy())
        y_pred = numpy.concatenate(y_pred)
        y_true = numpy.concatenate(y_true)
        r2 = r2_score(y_true, y_pred.squeeze())
        return r2
    else:
        for X, y in loder:
            y_hat = model.predict(X.squeeze(0).tolist()) # model(X)
            y_hat = torch.Tensor(y_hat)
            y_pred.append(y_hat.cpu().numpy())
            y_true.append(y.cpu().numpy())

        y_pred = numpy.concatenate(y_pred)
        y_true = numpy.concatenate(y_true)
        r2 = r2_score(y_true, y_pred.squeeze())
        return r2

In [4]:
from torch import onnx

def train_model(model, train_loader, num_epochs, learning_rate, name:str):
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    model.to(device)
    epoch_losses = []
    for epoch in tqdm(range(num_epochs), desc=f'Train {name}'):
        total_loss = 0
        total_batches = 0
        for X, y in train_loader:
            X = X.to(device)
            y = y.to(device)
            with torch.set_grad_enabled(True):
                y_hat = model(X)
                loss = criterion(y_hat.squeeze(), y)
                total_loss += loss.item()
                total_batches += 1
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        epoch_loss = total_loss / total_batches
        epoch_losses.append(epoch_loss)
    
    torch.save(model.state_dict(), f'property2/{name}.pt')
    onnx.export(model, X.to(device), f'property2/{name}.onnx')
    return epoch_losses

In [5]:
from model import MLP
from sklearn.ensemble import RandomForestClassifier

target_var = ["Glu(mg/dl)", "R-rPD_L3dS", "mPDdSr31", "T-rPDdSr31", "T-rPDdSr32", "R-rPDdSr31", "R-rPDdSr21"]
common_var = ["Glu(mg/dl)", "Rx1 Temp", "Rx2 Temp", "S1 T(C)", "S2 T(C)", "S3 T(C)", "FR Mon",
             "mPD1_dS", "mPD2_dS", "mPD3_dS", "T-rPD_L1dS", "T-rPD_L2dS", "T-rPD_L3dS", "R-rPD_L1dS", "R-rPD_L2dS", "R-rPD_L3dS",
             "mPDdSr31", "mPDdSr32", "mPDdSr21", "T-rPDdSr31", "T-rPDdSr32", "T-rPDdSr21", "R-rPDdSr31", "R-rPDdSr32", "R-rPDdSr21"]
# for one_var in common_var:
#     train_target = ['Glu(mg/dl)' ,one_var]

#     dataR1 = filter_data(origin_data1, train_target)
#     dataR2 = filter_data(origin_data2, train_target)
#     dataR4 = filter_data(origin_data4, train_target)
#     dataR5 = filter_data(origin_data5, train_target)

#     dataC= pd.concat([dataR2, dataR4, dataR5])
#     datas = preprocess(dataC, dataR1)
#     train_loader = DataLoader(TensorDataset(datas[0].unsqueeze(1), datas[1]), batch_size=8, shuffle=True)

#     model_mlp = MLP(1)
#     losses_mlp = train_model(model_mlp, train_loader, 100, 0.001, 'mlp')

#     test_loader = DataLoader(TensorDataset(datas[2].unsqueeze(1), datas[3]), batch_size=1, shuffle=True)
#     r2 = count_r2(model_mlp, test_loader, False)

#     print(one_var, " : ", r2)
                    
dataR1 = filter_data(origin_data1, common_var)
dataR2 = filter_data(origin_data2, common_var)
dataR4 = filter_data(origin_data4, common_var)
dataR5 = filter_data(origin_data5, common_var)
dataC= pd.concat([dataR1,dataR4, dataR5])
datas = preprocess(dataC, dataR2)

train_loader = DataLoader(TensorDataset(datas[0].unsqueeze(1), datas[1]), batch_size=8, shuffle=True)

model_mlp = MLP(24)
losses_mlp = train_model(model_mlp, train_loader, 50, 0.001, 'mlp')

test_loader = DataLoader(TensorDataset(datas[2].unsqueeze(1), datas[3]), batch_size=1, shuffle=True)
r2 = count_r2(model_mlp, test_loader, False)
print(r2)

Train mlp: 100%|███████████████████████████████████████████████████████████████████████| 50/50 [00:12<00:00,  3.96it/s]

-14.40076979087705





In [6]:
from sklearn.ensemble import RandomForestClassifier

num_epochs = 10
RF = RandomForestClassifier(max_depth=20, n_estimators=100, random_state=0)

for epoch in tqdm(range(num_epochs), desc=f'Train {str}'):
    RF.fit(datas[0], datas[1])

print(RF.score(datas[2], datas[3]))
r2 = count_r2(RF, test_loader, True)
print(r2)

Train <class 'str'>: 100%|█████████████████████████████████████████████████████████████| 10/10 [00:10<00:00,  1.07s/it]


0.0
-1.5058602991186354
