# 데이터 로드

In [1]:
import zipfile 
import pickle
import pandas

pandas를 굳이 import하는 이유는 어차피 .pkl 파일 내에서 pandas.DataFrame 형식으로 저장되어 있어서 pandas가 없다면 pickle.load시에 문제가 생길 것이기에 미리 방지하기 위함입니다.

In [2]:
archive=zipfile.ZipFile('result.zip')
zroot=zipfile.Path(archive)
list(zroot.iterdir())

[Path('result.zip', '구매(수정)/'), Path('result.zip', '절도(수정)/')]

In [3]:
t1=next((zroot/'구매(수정)').iterdir())
with t1.open('rb') as f:
    df=pickle.load(f)
df

Unnamed: 0,Neck-RShoulder-x,Neck-RShoulder-y,Neck-LShoulder-x,Neck-LShoulder-y,RShoulder-RElbow-x,RShoulder-RElbow-y,RElbow-RWrist-x,RElbow-RWrist-y,LShoulder-LElbow-x,LShoulder-LElbow-y,...,RHip-RKnee-x,RHip-RKnee-y,RKnee-RAnkle-x,RKnee-RAnkle-y,Neck-LHip-x,Neck-LHip-y,LHip-LKnee-x,LHip-LKnee-y,LKnee-LAnkle-x,LKnee-LAnkle-y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.40794,0.913009,-0.509138,0.860685,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.0,0.0
296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.0,0.0
297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.0,0.0
298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.0,0.0


In [3]:
import torch
device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [4]:
def load_dfs(myzip:zipfile.ZipFile,path:str):
    mypath=zipfile.Path(myzip)/path
    out=[]
    for file in mypath.iterdir():
        if file.suffix!='.pkl':
            continue
        try:
            with file.open('rb') as f:
                df=pickle.load(f)
        except Exception:
            import traceback
            print('errer in file ',file)
            traceback.print_exc()
        df.to_numpy()
        out.append(torch.tensor(df.to_numpy(),device=device,dtype=torch.float32))
    return out

In [5]:
gumaes=load_dfs(archive,'구매(수정)')
juldos=load_dfs(archive,'절도(수정)')


In [6]:
import random
random.shuffle(gumaes)
random.shuffle(juldos)

In [7]:
{len(x) for x in gumaes},{len(x) for x in juldos}

({299, 300}, {179, 180})

길이가 거의 동일하니, 그냥 gumaes는 299로, juldos는 179로 맞추려고 한다.

In [7]:
gumaes2=[(x if len(x)==299 else x[1:]) for x in gumaes ]
gumaes2=torch.stack(gumaes2).contiguous()
juldos2=[(x if len(x)==179 else x[1:]) for x in juldos ]
juldos2=torch.stack(juldos2).contiguous()

In [8]:
{len(x) for x in gumaes2},{len(x) for x in juldos2}

({299}, {179})

# 데이터 전처리 관련 유틸리티

In [9]:
import random

def random_forever(data):
    mydata=list(data)
    random.shuffle(mydata)
    while True:
        yield from mydata
        random.shuffle(mydata)

def batched(it,batchsize:int):
    it=iter(it)
    try:
        while True:
            yield [next(it) for _ in range(batchsize)]
    except StopIteration:
        pass
def forever(x):
    while True:
        yield from x
def sample_forever(data,batchsize):
    #for datas in batched(random_forever(data),batchsize):
    #    yield torch.nn.utils.rnn.pack_sequence(datas,enforce_sorted=False)
    for indices in batched(random_forever(list(range(len(data)))),batchsize):
        yield data.to(device)[torch.tensor(indices).to(device)]
        #yield torch.stack(datas,1)
    #for tups in batched(random_forever(data),batchsize):
    #    X=torch.nn.utils.rnn.pack_sequence([x[0] for x in tups],enforce_sorted=False)
    #    y=torch.tensor([i[1]for i in tups],dtype=torch.float32,device=device)
    #    yield X,y

In [20]:
@torch.jit.script
def get_loss(g_pred,j_pred,w_g:float=0.5):
    return (((g_pred)**2).mean()*w_g+((1-j_pred)**2).mean()*(1-w_g))
@torch.jit.script
def get_acc(g_pred,j_pred,w_g):
    return (torch.mean((g_pred<0.5),dtype=torch.float32)*w_g+torch.mean((j_pred>0.5),dtype=torch.float32).mean()*(1-w_g))

여기서 절도 데이터는 한 데이터당 179프레임, 구매 데이터는 한 데이터당 299프레임으로 길이가 다르기 때문에, 굳이 패딩을 시도하기 보다는 각각 loss를 구한 뒤 그 loss를 합쳐서 새 loss를 만드는데, 이때 양쪽 다 단순한 mean을 써서 0.5:0.5로 섞이는 것을 방지하기 위해 loss를 구할 때 굳이 w_g를 두어 어느쪽 loss에 더 집중할지를 매 step마다 바뀌게 하여 overfit을 방지하고자 하였습니다.

In [18]:
def split_by(x,p):
    bound=int(len(x)*p)
    return x[bound:],x[:bound]


gumae_train,gumae_test=split_by(gumaes2,0.8)
juldo_train,juldo_test=split_by(juldos2,0.8)

In [59]:
next(iter(torch.utils.data.DataLoader(gumae_test,batch_size=10))).shape

torch.Size([10, 299, 24])

# 모델 작성

In [10]:
import torch.nn as nn
import mylayers as mynn

In [35]:
DROPOUT=0
import torch.functional as F
import itertools

class MyModel(nn.Module):
    def __init__(self,lstms,embedding,nomal):
        super().__init__()
        self.LSTMS=nn.ModuleList(lstms)
        self.embedding=embedding
        self.nomal=nomal
        self.dropout=nn.Dropout(DROPOUT)
    def forward(self,x,hxs=None):
        if hxs is None:
            hxs=itertools.repeat(None)
            lasthx=None
        else:
            lasthx=hxs[-1]
        newhxs=[]
        for layer,hx in zip(self.LSTMS,hxs):
            x,newhx=layer(x,hx)
            x=self.dropout(torch.tanh(x))
            newhxs.append(newhx)
        x,newhx=self.embedding(x,lasthx)
        newhxs.append(newhx)
        x=self.nomal(x)

        return x,tuple(newhxs)
        


tanh=nn.Tanh()
relu=nn.ReLU()
drop=nn.Dropout(DROPOUT)
tanh=nn.Tanh()
mymodel=MyModel(
[
    nn.LSTM(24,64),
    nn.LSTM(64,128),
    nn.LSTM(128,256),
    nn.LSTM(256,256,num_layers=3),
    
],
mynn.EmbeddingLSTM(256,128,dropout=DROPOUT)
,
nn.Sequential(
    tanh,
    nn.Linear(128,64),
    relu,
    drop,
    nn.Linear(64,32),
    drop,
    relu,
    nn.Linear(32,16),
    drop,
    relu,
    nn.Linear(16,1),
    nn.Sigmoid()
)
)

mymodel=mymodel.to(device)
optim=torch.optim.Adam(mymodel.parameters(),weight_decay=1e-4)

sX=next(sample_forever(juldos2,20))
mymodel_jit=torch.jit.trace(mymodel.forward,sX)

매 프레임마다 받는 입력 크기가 24개이므로 첫 layer의 input_size는 24여야 합니다.

모델 구조 자체는 LSTM들을 
- nn.LSTM(24,64),
- nn.LSTM(64,128),
- nn.LSTM(128,256),
- nn.LSTM(256,256,num_layers=3)
순으로 쌓은 뒤에,
- mynn.EmbeddingLSTM(256,128,dropout=DROPOUT)
으로 LSTM(256,128)의 맨 마지막 출력만을 가져오고,

그 뒤는 보이는 대로 모델을 쌓았습니다.

마지막 layer에서는

적당히 Linear,dropout,activation을 깔다가

- nn.Linear(16,1)
- nn.Sigmoid()


마지막에는 원하는 출력값의 개수는 1이니 dim_hidden은 1로,

출력값에서 절도면 1, 절도가 아니면 0을 원하니 0<출력값<1이 되게 하는 sigmoid을 맨 마지막에 사용하였습니다.


나머지 복잡한 부분들(특히 forward의 리턴값이 x가 아니라 튜플인 점)은 혹시라도 실시간 처리에 사용하게 될 경우, LSTM들의 hidden state를 보존하여 이전 입력 프레임들을 매번 다시 주는 대신, 이전 입력 프레임들을 받고 나온 hidden state와 새 프레임 정보들만을 주어도 현재의 수상함?을 판단할 수 있게 하기 위해서입니다.


In [13]:
sX2=next(sample_forever(gumaes2,10))
mymodel_jit(sX2)[0].shape

torch.Size([299, 1])

In [34]:
bigruns=0

In [15]:
import torch.utils.data

In [22]:
from torch.utils.tensorboard import SummaryWriter
from typing import NamedTuple
import tqdm

class TrainResult(NamedTuple):
    loss_train:torch.Tensor
    accuracy_train:torch.Tensor
    loss_test:torch.Tensor
    accuracy_test:torch.Tensor


def train_step(model:mynn.RNNthenSequential,forward,optim:torch.optim.Optimizer,
               #sample_train,sample_test
               gtrain,gtest,jtrain,jtest,w_g,need_stat:bool
               )->TrainResult:
    model.train()
    g_pred,_=forward(gtrain)
    j_pred,_=forward(jtrain)
    #sX,sy=sample_train
    #train_pred,_=model(sX)

    #loss:torch.Tensor=((train_pred-sy)**2).mean()
    loss=get_loss(g_pred,j_pred,w_g)

    #writer.add_scalar("Loss/train", loss, step)
        
    optim.zero_grad()
    loss.backward()
    optim.step()
    if not need_stat:
        return
    with torch.no_grad():
        #acc=torch.mean((train_pred>0.5)==(sy>0.5),dtype=torch.float32)
        acc=get_acc(g_pred,j_pred,w_g)

        #tX,ty=sample_test
        #test_pred,_=model(tX)
        #loss_test=((test_pred-ty)**2).mean()
        #accuracy_test=torch.mean((test_pred>0.5)==(ty>0.5),dtype=torch.float32)
        model.eval()
        G,_=forward(gtest)
        J,_=forward(jtest)
        loss_test=get_loss(G,J,w_g)
        accuracy_test=get_acc(G,J,w_g)



        return TrainResult(
            loss_train=loss,
            accuracy_train=acc,
            loss_test=loss_test,
            accuracy_test=accuracy_test
        )


def train_once(model,forward,optim,
               #data_train,data_test,
               gumae_train,gumae_test,juldo_test,juldo_train,
               run=1,n_sample=70,epochs=200,n_test_sample=50,mix_min=0.3,mix_max=0.5):
    writer = SummaryWriter(f'logs/v0/runs-{bigruns}-{run}')
    #as_sampler=lambda data,batchsize:forever(torch.utils.data.DataLoader(data,batchsize,shuffle=True,drop_last=True))
    

    #train_it=sample_forever(data_train,n_sample)
    #test_it=sample_forever(data_test,n_train_sample)
    its=list(map(sample_forever,(gumae_train,gumae_test,juldo_train,juldo_test),
                                                                 (n_sample,n_test_sample,n_sample,n_test_sample)))
    assert len(its)==4
    #assert len([*map(next,its)])==4
    for step in tqdm.tqdm(range(epochs)):
        mix=random.random()*mix_max+mix_min
        mix=torch.tensor((mix,),device=device)#


        result=train_step(model,forward,optim,*map(next,its),w_g=mix,need_stat=step%5==0)
        if result is not None:
            writer.add_scalar("Loss/train", result.loss_train, step)
            writer.add_scalar("Accuracy/train", result.accuracy_train, step)
            writer.add_scalar("Loss/test",result.loss_test,step)
            writer.add_scalar("Accuracy/test",result.accuracy_test,step)
        if step%30==0:
            writer.flush()

    writer.close()

훈련 상황을 실시간으로 확인하기 위해 tensorboard를 사용하였습니다.


In [36]:
bigruns+=1

for i in range(10):
    train_once(mymodel,mymodel.forward,optim,
               gumae_train=gumae_train,gumae_test=gumae_test,juldo_test=juldo_test,juldo_train=juldo_train
               #data_train=data_train,data_test=data_test
               ,run=i)

  0%|          | 0/200 [00:00<?, ?it/s]

100%|██████████| 200/200 [00:36<00:00,  5.53it/s]
100%|██████████| 200/200 [00:35<00:00,  5.64it/s]
100%|██████████| 200/200 [00:35<00:00,  5.66it/s]
100%|██████████| 200/200 [00:35<00:00,  5.68it/s]
100%|██████████| 200/200 [00:34<00:00,  5.77it/s]
100%|██████████| 200/200 [00:34<00:00,  5.84it/s]
100%|██████████| 200/200 [00:34<00:00,  5.84it/s]
100%|██████████| 200/200 [00:34<00:00,  5.78it/s]
100%|██████████| 200/200 [00:34<00:00,  5.80it/s]
100%|██████████| 200/200 [00:34<00:00,  5.87it/s]


바로 위의 블록을 실행하면 훈련이 진행되고, 이 블록 자체를 여러번 실행할 것을 염두에 두었기에 전역변수인 bigruns가 이 블록의 실행 회수를 기억하게 하였습니다.

`pip install tensorboard`
`python -m tensorboard --logdir {path_to_log} `

로 방금 `logs/v0/runs-{bigruns}-{run}`에 저장한 로그들을 읽을 수 있습니다.
실행 결과를 보자면 loss도 accuracy도 수렴할 조짐 자체를 아예 보이지 않고 있습니다.

거기에 혹시 tensorboard를 실행할 수 없는 환경이라면 제가 직접 `logs_svg/v0` 에 tensorboard에서 볼 수 있는 그래프를 .svg로 저장해두었습니다.

`logs_svg/v0/Loss_train.svg` 만 보더라도 전혀 감소하는 기색을 보이지 않고 있습니다...
