In [2]:
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt

use_cuda = torch.cuda.is_available()
# device = torch.device("cpu")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cpu


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
import os

In [4]:
class LSTM_net(nn.Module):
    def __init__(self,input_dim,hidden_dim,output_dim,layer_num):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.lstm = torch.nn.LSTM(input_dim, hidden_dim,layer_num,batch_first=True)
        self.dr = torch.nn.Dropout2d(0.1)
        self.fc = torch.nn.Linear(hidden_dim,output_dim)
        
        
    def forward(self,inputs):
        x = inputs
        lstm_out,(hn,cn) = self.lstm(x)
        out = self.fc(lstm_out[:,-1,:])
        return out

In [5]:
data_path = "./nturgb+d_skeletons/"
broken_files_path = "./NTU_RGBD_samples_with_missing_skeletons.txt"
training_classes = [8, 10, 22, 23, 27, 21, 1, 2, 3, 4]
training_cameras = [1, 2, 3]
LABELS = {0: "sitting down",
          1: "clapping",
          2: "cheer up",
          3: "hand waving",
          4: "jump up",
          5: "take off a hat/cap",
          6: "drink water",
          7: "eat meal/snack",
          8: "brushing teeth",
          9: "brushing hair"}

In [17]:
class Skeleton2_Dataset(Dataset):
    def __init__(self, data_path, broken_files_path, training_classes, training_cameras, chonk_len=45, transform=None):
        self.training_classes = training_classes
        self.training_cameras = training_cameras
        self.data_path = data_path
        self.chonk_len = chonk_len
        self.transform = transform
        
        self.files, self.action_classes = self.read_data(data_path, broken_files_path, self.training_classes, self.training_cameras)
                
        self.data, self.labels = self.prepare_data_and_labels(self.files, self.chonk_len)


    def __len__(self):
        return len(self.data) 
    
    
    def __getitem__(self, idx):
        item = np.asarray(self.data.iloc[idx,:]).reshape(self.chonk_len,75)
        label = self.labels[idx]
        if self.transform != None:
            item = transform(item)
        return (item, label)
    
    def read_data(self, data_path, broken_files_path, training_classes, training_cameras):
        labels = []
        files = []
        action_classes = {}
        counter = 0
        files_counter = {}

        with open(broken_files_path, 'r') as f:
            broken_files = f.read().split("\n")

        raw_files = os.listdir(data_path)
        num_frames = 0

        for filename in raw_files:
            if filename not in broken_files:
                action_class = int(filename[filename.find('A') + 1:filename.find('A') + 4])
                subject_id = int(filename[filename.find('P') + 1:filename.find('P') + 4])
                camera_id = int(filename[filename.find('C') + 1:filename.find('C') + 4])
                if action_class in training_classes and camera_id in training_cameras:  #and subject_id in training_subjects:
                    if action_class in action_classes:
                        if files_counter[action_class] < 120:
                            files.append([filename,action_classes[action_class]])
                            files_counter[action_class] = files_counter[action_class] + 1
                    else:
                        action_classes.update({action_class : counter})
                        files_counter.update({action_class : 1})
                        counter+=1
                        files.append([filename,action_classes[action_class]])
    #                     labels.append([action_class])
        print("action classes: ", action_classes)
        print("action files: ", files_counter)

        return files, action_classes
    
    def read_skeleton_filter(self, file):
        with open(file, 'r') as f:
            skeleton_sequence = {}
            skeleton_sequence['numFrame'] = int(f.readline())
            skeleton_sequence['frameInfo'] = []
            for t in range(skeleton_sequence['numFrame']):
                frame_info = {}
                frame_info['numBody'] = int(f.readline())
                frame_info['bodyInfo'] = []

                for m in range(frame_info['numBody']):
                    body_info = {}
                    body_info_key = [
                        'bodyID', 'clipedEdges', 'handLeftConfidence',
                        'handLeftState', 'handRightConfidence', 'handRightState',
                        'isResticted', 'leanX', 'leanY', 'trackingState'
                    ]
                    body_info = {
                        k: float(v)
                        for k, v in zip(body_info_key, f.readline().split())
                    }
                    body_info['numJoint'] = int(f.readline())
                    body_info['jointInfo'] = []
                    for v in range(body_info['numJoint']):
                        joint_info_key = [
                            'x', 'y', 'z', 'depthX', 'depthY', 'colorX', 'colorY',
                            'orientationW', 'orientationX', 'orientationY',
                            'orientationZ', 'trackingState'
                        ]
                        joint_info = {
                            k: float(v)
                            for k, v in zip(joint_info_key, f.readline().split())
                        }
                        body_info['jointInfo'].append(joint_info)
                    frame_info['bodyInfo'].append(body_info)
                skeleton_sequence['frameInfo'].append(frame_info)

        return skeleton_sequence

    def read_xyz(self, file, max_body=1, num_joint=25):
        seq_info = self.read_skeleton_filter(file)
        data = np.zeros((max_body, seq_info['numFrame'], num_joint, 3))
        for n, f in enumerate(seq_info['frameInfo']):
            for m, b in enumerate(f['bodyInfo']):
                for j, v in enumerate(b['jointInfo']):
                    if m < max_body and j < num_joint:
                        data[m, n, j, :] = [v['x'], v['y'], v['z']]

                    else:
                        pass

        return data
    
    def create_coords_blocks(self, test_file, chonk_len):   
        frame_counter = 0
        new_labels = []
        new_frames = []
        blocks = []

        test_frames = self.read_xyz(self.data_path + test_file[0])[0]
        label = test_file[1]
        slice_len = chonk_len * int(len(test_frames)/chonk_len)


        for index in range(len(test_frames[:slice_len])):
            frame_counter += 1
            new_frames.append(test_frames[index].flatten())
            if frame_counter == chonk_len:
                frame_counter = 0
                blocks.append(np.array(new_frames))
                new_labels = new_labels + [label]
                new_frames = []


        return blocks, new_labels

    def prepare_data_and_labels(self, working_files_with_labels, chonk_len):
        data = []
        labels = []
        ##########################################################################
        numbers = {0: 0, 1 : 0, 2 : 0, 3 : 0, 4 :0, 5 :0, 6 :0, 7 :0, 8 :0, 9 :0} #####
        ##################################################################
        for file in working_files_with_labels:
            frames_blocks, label = self.create_coords_blocks(file, chonk_len)
            if label != [] and numbers[label[0]] <= 150:
                numbers[label[0]] = numbers[label[0]] + len(label)
                data = data + frames_blocks
                labels = labels + label
        data_np = np.asarray(data)
        labels_np = np.asarray(labels).astype(np.int64)

        data_sq = data_np.reshape(len(data_np), -1)
        test_data = pd.DataFrame(data_sq)
        test_labels = labels_np
        return test_data, test_labels

## 45 кадров

In [7]:
dataset = Skeleton2_Dataset(data_path=data_path,
                            broken_files_path=broken_files_path,
                            training_classes=training_classes,
                            training_cameras=training_cameras,
                            chonk_len=45,
                            transform=None)

action classes:  {1: 0, 2: 1, 3: 2, 4: 3, 8: 4, 10: 5, 21: 6, 22: 7, 23: 8, 27: 9}
action files:  {1: 120, 2: 120, 3: 120, 4: 120, 8: 120, 10: 120, 21: 120, 22: 120, 23: 120, 27: 120}


In [8]:
skel, lab = dataset.__getitem__(41)

In [9]:
type(lab)

numpy.int64

In [10]:
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [int(0.75*len(dataset)),int(0.25*len(dataset))])
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = 1, shuffle=True)

In [11]:
n_hidden = 128
n_joints = 25*3
n_categories = 10
n_layer = 2
rnn = LSTM_net(n_joints,n_hidden,n_categories,n_layer)
rnn.to(device)

LSTM_net(
  (lstm): LSTM(75, 128, num_layers=2, batch_first=True)
  (dr): Dropout2d(p=0.1, inplace=False)
  (fc): Linear(in_features=128, out_features=10, bias=True)
)

In [12]:
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
#     print(output.topk(5))
    return LABELS[category_i], category_i

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [13]:
from torch import optim
import time
import math

criterion = nn.CrossEntropyLoss()
learning_rate = 0.0007
optimizer = optim.SGD(rnn.parameters(),lr=learning_rate,momentum=0.9)

all_losses = []
start = time.time()
counter = 0
for epoch in range(600):  
    current_loss = 0
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
    
        output = rnn(inputs.float())
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step() 


        current_loss += loss.item()
        category = LABELS[int(labels[0])]

        if counter % 500 == 0:
            guess, guess_i = categoryFromOutput(output)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('epoch : %d iter : %d (%s) %.4f  / %s %s' % (epoch, i, timeSince(start), loss, guess, correct))

        
        counter = counter + 1
    if counter % 100 == 0:
        all_losses.append(current_loss / 25)
        current_loss = 0
    

epoch : 0 iter : 0 (0m 0s) 2.2893  / clapping ✗ (drink water)
epoch : 7 iter : 31 (0m 20s) 2.2824  / clapping ✗ (drink water)
epoch : 14 iter : 62 (0m 39s) 2.2529  / jump up ✓
epoch : 22 iter : 26 (0m 59s) 2.2073  / hand waving ✗ (clapping)
epoch : 29 iter : 57 (1m 18s) 2.0982  / hand waving ✗ (clapping)
epoch : 37 iter : 21 (1m 37s) 1.8299  / jump up ✗ (sitting down)
epoch : 44 iter : 52 (1m 57s) 2.0894  / sitting down ✗ (cheer up)
epoch : 52 iter : 16 (2m 16s) 2.3752  / brushing teeth ✓
epoch : 59 iter : 47 (2m 35s) 1.5921  / brushing teeth ✗ (take off a hat/cap)
epoch : 67 iter : 11 (2m 54s) 1.6463  / take off a hat/cap ✗ (clapping)
epoch : 74 iter : 42 (3m 13s) 1.7014  / take off a hat/cap ✗ (brushing teeth)
epoch : 82 iter : 6 (3m 32s) 1.4419  / clapping ✓
epoch : 89 iter : 37 (3m 51s) 1.1796  / take off a hat/cap ✗ (brushing teeth)
epoch : 97 iter : 1 (4m 10s) 1.1612  / jump up ✓
epoch : 104 iter : 32 (4m 29s) 1.0486  / take off a hat/cap ✗ (eat meal/snack)
epoch : 111 iter : 63 

In [14]:
total = 0
right = 0
counter = 0

rnn.eval()
with torch.no_grad():
    for i, data in enumerate(test_loader, 0):
        counter = counter + 1
        inputs, labels = data[0].to(device), data[1].to(device)  
        output = rnn(inputs.float())
        guess, guess_i = categoryFromOutput(output)
        category = LABELS[int(labels[0])]
        
        if guess == category:
            right = right + 1


print('Accuracy of the network:  ',  (100 * right / counter))

Accuracy of the network:   67.2316384180791


## 30 кадров

In [18]:
dataset = Skeleton2_Dataset(data_path=data_path,
                            broken_files_path=broken_files_path,
                            training_classes=training_classes,
                            training_cameras=training_cameras,
                            chonk_len=30,
                            transform=None)

action classes:  {1: 0, 2: 1, 3: 2, 4: 3, 8: 4, 10: 5, 21: 6, 22: 7, 23: 8, 27: 9}
action files:  {1: 120, 2: 120, 3: 120, 4: 120, 8: 120, 10: 120, 21: 120, 22: 120, 23: 120, 27: 120}


In [26]:
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [int(0.75*len(dataset)),int(0.25*len(dataset)+1)])
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = 1, shuffle=True)
n_hidden = 128
n_joints = 25*3
n_categories = 10
n_layer = 2
rnn = LSTM_net(n_joints,n_hidden,n_categories,n_layer)
rnn.to(device)

criterion = nn.CrossEntropyLoss()
learning_rate = 0.0007
optimizer = optim.SGD(rnn.parameters(),lr=learning_rate,momentum=0.9)

all_losses = []
start = time.time()
counter = 0
for epoch in range(600):  
    current_loss = 0
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
    
        output = rnn(inputs.float())
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step() 


        current_loss += loss.item()
        category = LABELS[int(labels[0])]

        if counter % 500 == 0:
            guess, guess_i = categoryFromOutput(output)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('epoch : %d iter : %d (%s) %.4f  / %s %s' % (epoch, i, timeSince(start), loss, guess, correct))

        
        counter = counter + 1
    if counter % 100 == 0:
        all_losses.append(current_loss / 25)
        current_loss = 0    

epoch : 0 iter : 0 (0m 0s) 2.3196  / hand waving ✗ (brushing teeth)
epoch : 7 iter : 3 (0m 12s) 2.2712  / sitting down ✗ (jump up)
epoch : 14 iter : 6 (0m 24s) 2.2651  / sitting down ✓
epoch : 21 iter : 9 (0m 36s) 2.1304  / jump up ✗ (take off a hat/cap)
epoch : 28 iter : 12 (0m 48s) 2.2469  / sitting down ✗ (jump up)
epoch : 35 iter : 15 (0m 59s) 1.8564  / cheer up ✗ (sitting down)
epoch : 42 iter : 18 (1m 11s) 1.4583  / jump up ✓
epoch : 49 iter : 21 (1m 23s) 2.1324  / cheer up ✗ (hand waving)
epoch : 56 iter : 24 (1m 35s) 1.7151  / brushing teeth ✗ (cheer up)
epoch : 63 iter : 27 (1m 47s) 2.1959  / take off a hat/cap ✓
epoch : 70 iter : 30 (1m 58s) 1.7573  / sitting down ✗ (clapping)
epoch : 77 iter : 33 (2m 10s) 2.0244  / clapping ✗ (hand waving)
epoch : 84 iter : 36 (2m 22s) 1.4836  / cheer up ✗ (brushing teeth)
epoch : 91 iter : 39 (2m 33s) 1.7165  / drink water ✗ (jump up)
epoch : 98 iter : 42 (2m 45s) 1.1632  / drink water ✓
epoch : 105 iter : 45 (2m 57s) 1.4168  / take off a h

In [27]:
total = 0
right = 0
counter = 0

rnn.eval()
with torch.no_grad():
    for i, data in enumerate(test_loader, 0):
        counter = counter + 1
        inputs, labels = data[0].to(device), data[1].to(device)  
        output = rnn(inputs.float())
        guess, guess_i = categoryFromOutput(output)
        category = LABELS[int(labels[0])]
        
        if guess == category:
            right = right + 1


print('Accuracy of the network:  ',  (100 * right / counter))

Accuracy of the network:   65.96306068601584


## 40 кадров

In [28]:
dataset = Skeleton2_Dataset(data_path=data_path,
                            broken_files_path=broken_files_path,
                            training_classes=training_classes,
                            training_cameras=training_cameras,
                            chonk_len=40,
                            transform=None)

action classes:  {1: 0, 2: 1, 3: 2, 4: 3, 8: 4, 10: 5, 21: 6, 22: 7, 23: 8, 27: 9}
action files:  {1: 120, 2: 120, 3: 120, 4: 120, 8: 120, 10: 120, 21: 120, 22: 120, 23: 120, 27: 120}


In [29]:
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [int(0.75*len(dataset)),int(0.25*len(dataset)+1)])
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = 1, shuffle=True)
n_hidden = 128
n_joints = 25*3
n_categories = 10
n_layer = 2
rnn = LSTM_net(n_joints,n_hidden,n_categories,n_layer)
rnn.to(device)

criterion = nn.CrossEntropyLoss()
learning_rate = 0.0007
optimizer = optim.SGD(rnn.parameters(),lr=learning_rate,momentum=0.9)

all_losses = []
start = time.time()
counter = 0
for epoch in range(600):  
    current_loss = 0
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
    
        output = rnn(inputs.float())
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step() 


        current_loss += loss.item()
        category = LABELS[int(labels[0])]

        if counter % 500 == 0:
            guess, guess_i = categoryFromOutput(output)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('epoch : %d iter : %d (%s) %.4f  / %s %s' % (epoch, i, timeSince(start), loss, guess, correct))

        
        counter = counter + 1
    if counter % 100 == 0:
        all_losses.append(current_loss / 25)
        current_loss = 0    

epoch : 0 iter : 0 (0m 0s) 2.3101  / clapping ✗ (cheer up)
epoch : 7 iter : 10 (0m 16s) 2.2983  / sitting down ✗ (cheer up)
epoch : 14 iter : 20 (0m 32s) 2.2652  / jump up ✗ (drink water)
epoch : 21 iter : 30 (0m 47s) 2.3170  / eat meal/snack ✓
epoch : 28 iter : 40 (1m 3s) 1.9795  / hand waving ✗ (cheer up)
epoch : 35 iter : 50 (1m 19s) 2.0955  / sitting down ✗ (cheer up)
epoch : 42 iter : 60 (1m 35s) 1.8171  / brushing teeth ✗ (drink water)
epoch : 50 iter : 0 (1m 50s) 2.0583  / jump up ✗ (sitting down)
epoch : 57 iter : 10 (2m 6s) 1.9440  / sitting down ✗ (hand waving)
epoch : 64 iter : 20 (2m 22s) 1.4136  / eat meal/snack ✓
epoch : 71 iter : 30 (2m 37s) 1.6531  / take off a hat/cap ✗ (drink water)
epoch : 78 iter : 40 (2m 53s) 1.4292  / take off a hat/cap ✓
epoch : 85 iter : 50 (3m 9s) 1.1675  / brushing hair ✓
epoch : 92 iter : 60 (3m 25s) 1.5121  / eat meal/snack ✓
epoch : 100 iter : 0 (3m 41s) 0.9985  / take off a hat/cap ✓
epoch : 107 iter : 10 (3m 57s) 1.2373  / sitting down ✗ 

In [30]:
total = 0
right = 0
counter = 0

rnn.eval()
with torch.no_grad():
    for i, data in enumerate(test_loader, 0):
        counter = counter + 1
        inputs, labels = data[0].to(device), data[1].to(device)  
        output = rnn(inputs.float())
        guess, guess_i = categoryFromOutput(output)
        category = LABELS[int(labels[0])]
        
        if guess == category:
            right = right + 1


print('Accuracy of the network:  ',  (100 * right / counter))

Accuracy of the network:   68.18181818181819


## 42 кадра

In [31]:
dataset = Skeleton2_Dataset(data_path=data_path,
                            broken_files_path=broken_files_path,
                            training_classes=training_classes,
                            training_cameras=training_cameras,
                            chonk_len=42,
                            transform=None)

action classes:  {1: 0, 2: 1, 3: 2, 4: 3, 8: 4, 10: 5, 21: 6, 22: 7, 23: 8, 27: 9}
action files:  {1: 120, 2: 120, 3: 120, 4: 120, 8: 120, 10: 120, 21: 120, 22: 120, 23: 120, 27: 120}


In [32]:
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [int(0.75*len(dataset)),int(0.25*len(dataset)+1)])
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = 1, shuffle=True)
n_hidden = 128
n_joints = 25*3
n_categories = 10
n_layer = 2
rnn = LSTM_net(n_joints,n_hidden,n_categories,n_layer)
rnn.to(device)

criterion = nn.CrossEntropyLoss()
learning_rate = 0.0007
optimizer = optim.SGD(rnn.parameters(),lr=learning_rate,momentum=0.9)

all_losses = []
start = time.time()
counter = 0
for epoch in range(600):  
    current_loss = 0
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
    
        output = rnn(inputs.float())
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step() 


        current_loss += loss.item()
        category = LABELS[int(labels[0])]

        if counter % 500 == 0:
            guess, guess_i = categoryFromOutput(output)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('epoch : %d iter : %d (%s) %.4f  / %s %s' % (epoch, i, timeSince(start), loss, guess, correct))

        
        counter = counter + 1
    if counter % 100 == 0:
        all_losses.append(current_loss / 25)
        current_loss = 0    

epoch : 0 iter : 0 (0m 0s) 2.2957  / take off a hat/cap ✗ (jump up)
epoch : 7 iter : 17 (0m 16s) 2.3154  / cheer up ✗ (brushing teeth)
epoch : 14 iter : 34 (0m 32s) 2.2918  / clapping ✗ (sitting down)
epoch : 21 iter : 51 (0m 49s) 2.2423  / cheer up ✗ (hand waving)
epoch : 28 iter : 68 (1m 6s) 2.2578  / jump up ✓
epoch : 36 iter : 16 (1m 22s) 2.2356  / jump up ✗ (brushing hair)
epoch : 43 iter : 33 (1m 39s) 1.8459  / hand waving ✗ (brushing hair)
epoch : 50 iter : 50 (1m 56s) 1.6244  / eat meal/snack ✗ (hand waving)
epoch : 57 iter : 67 (2m 12s) 2.1018  / brushing hair ✓
epoch : 65 iter : 15 (2m 29s) 1.7779  / cheer up ✗ (hand waving)
epoch : 72 iter : 32 (2m 46s) 1.5604  / eat meal/snack ✓
epoch : 79 iter : 49 (3m 2s) 1.6468  / drink water ✗ (brushing hair)
epoch : 86 iter : 66 (3m 19s) 1.4056  / clapping ✓
epoch : 94 iter : 14 (3m 36s) 1.9166  / drink water ✗ (cheer up)
epoch : 101 iter : 31 (3m 53s) 1.7151  / clapping ✗ (drink water)
epoch : 108 iter : 48 (4m 9s) 1.5222  / brushing 

In [33]:
total = 0
right = 0
counter = 0

rnn.eval()
with torch.no_grad():
    for i, data in enumerate(test_loader, 0):
        counter = counter + 1
        inputs, labels = data[0].to(device), data[1].to(device)  
        output = rnn(inputs.float())
        guess, guess_i = categoryFromOutput(output)
        category = LABELS[int(labels[0])]
        
        if guess == category:
            right = right + 1


print('Accuracy of the network:  ',  (100 * right / counter))

Accuracy of the network:   71.38964577656675


## 42 кадра повтор

In [34]:
dataset = Skeleton2_Dataset(data_path=data_path,
                            broken_files_path=broken_files_path,
                            training_classes=training_classes,
                            training_cameras=training_cameras,
                            chonk_len=42,
                            transform=None)

action classes:  {1: 0, 2: 1, 3: 2, 4: 3, 8: 4, 10: 5, 21: 6, 22: 7, 23: 8, 27: 9}
action files:  {1: 120, 2: 120, 3: 120, 4: 120, 8: 120, 10: 120, 21: 120, 22: 120, 23: 120, 27: 120}


In [35]:
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [int(0.75*len(dataset)),int(0.25*len(dataset)+1)])
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = 1, shuffle=True)
n_hidden = 128
n_joints = 25*3
n_categories = 10
n_layer = 2
rnn = LSTM_net(n_joints,n_hidden,n_categories,n_layer)
rnn.to(device)

criterion = nn.CrossEntropyLoss()
learning_rate = 0.0007
optimizer = optim.SGD(rnn.parameters(),lr=learning_rate,momentum=0.9)

all_losses = []
start = time.time()
counter = 0
for epoch in range(600):  
    current_loss = 0
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
    
        output = rnn(inputs.float())
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step() 


        current_loss += loss.item()
        category = LABELS[int(labels[0])]

        if counter % 500 == 0:
            guess, guess_i = categoryFromOutput(output)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('epoch : %d iter : %d (%s) %.4f  / %s %s' % (epoch, i, timeSince(start), loss, guess, correct))

        
        counter = counter + 1
    if counter % 100 == 0:
        all_losses.append(current_loss / 25)
        current_loss = 0    

epoch : 0 iter : 0 (0m 0s) 2.3245  / brushing hair ✗ (sitting down)
epoch : 7 iter : 17 (0m 16s) 2.3150  / hand waving ✓
epoch : 14 iter : 34 (0m 33s) 2.2156  / hand waving ✗ (cheer up)
epoch : 21 iter : 51 (0m 49s) 2.0869  / hand waving ✗ (cheer up)
epoch : 28 iter : 68 (1m 6s) 2.0845  / jump up ✗ (brushing hair)
epoch : 36 iter : 16 (1m 23s) 1.8565  / hand waving ✗ (sitting down)
epoch : 43 iter : 33 (1m 39s) 1.8725  / hand waving ✗ (sitting down)
epoch : 50 iter : 50 (1m 56s) 2.0797  / cheer up ✗ (brushing teeth)
epoch : 57 iter : 67 (2m 13s) 1.8407  / eat meal/snack ✓
epoch : 65 iter : 15 (2m 29s) 1.7032  / cheer up ✗ (hand waving)
epoch : 72 iter : 32 (2m 46s) 1.9041  / jump up ✗ (clapping)
epoch : 79 iter : 49 (3m 3s) 1.4263  / sitting down ✗ (brushing teeth)
epoch : 86 iter : 66 (3m 19s) 1.6672  / jump up ✗ (clapping)
epoch : 94 iter : 14 (3m 36s) 1.1965  / jump up ✗ (sitting down)
epoch : 101 iter : 31 (3m 52s) 1.7480  / take off a hat/cap ✗ (brushing teeth)
epoch : 108 iter : 

In [36]:
total = 0
right = 0
counter = 0

rnn.eval()
with torch.no_grad():
    for i, data in enumerate(test_loader, 0):
        counter = counter + 1
        inputs, labels = data[0].to(device), data[1].to(device)  
        output = rnn(inputs.float())
        guess, guess_i = categoryFromOutput(output)
        category = LABELS[int(labels[0])]
        
        if guess == category:
            right = right + 1


print('Accuracy of the network:  ',  (100 * right / counter))

Accuracy of the network:   71.11716621253406


Вывод: у меня получилось, что оптимальное количество кадров чуть-чуть меньше 45 (42). Полагаю, края анимации (начальные и конечные кадры) не относятся к самой анимации и потому взяв меньше кадров, мы отрезаем часть этой ненужной информации. При этом датасет был построен таким образом, чтобы содержать самую суть, поэтому счёт тут идёт на единицы.

## 2 слоя LSTM

In [40]:
class LSTM2_net(nn.Module):
    def __init__(self,input_dim,hidden_dim,output_dim,layer_num):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.lstm = torch.nn.LSTM(input_dim, hidden_dim,layer_num,batch_first=True)
        self.lstm2 = torch.nn.LSTM(hidden_dim, hidden_dim,layer_num,batch_first=True)
        self.dr = torch.nn.Dropout2d(0.1)
        self.fc = torch.nn.Linear(hidden_dim,output_dim)
        
        
    def forward(self,inputs):
        x = inputs
        lstm_out,(hn,cn) = self.lstm(x)
        lstm_out2,(hn2,cn2) = self.lstm2(lstm_out)
        out = self.fc(lstm_out2[:,-1,:])
        return out

In [41]:
dataset = Skeleton2_Dataset(data_path=data_path,
                            broken_files_path=broken_files_path,
                            training_classes=training_classes,
                            training_cameras=training_cameras,
                            chonk_len=42,
                            transform=None)

action classes:  {1: 0, 2: 1, 3: 2, 4: 3, 8: 4, 10: 5, 21: 6, 22: 7, 23: 8, 27: 9}
action files:  {1: 120, 2: 120, 3: 120, 4: 120, 8: 120, 10: 120, 21: 120, 22: 120, 23: 120, 27: 120}


In [42]:
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [int(0.75*len(dataset)),int(0.25*len(dataset)+1)])
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = 1, shuffle=True)
n_hidden = 128
n_joints = 25*3
n_categories = 10
n_layer = 2
rnn = LSTM2_net(n_joints,n_hidden,n_categories,n_layer)
rnn.to(device)

criterion = nn.CrossEntropyLoss()
learning_rate = 0.0007
optimizer = optim.SGD(rnn.parameters(),lr=learning_rate,momentum=0.9)

all_losses = []
start = time.time()
counter = 0
for epoch in range(600):  
    current_loss = 0
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
    
        output = rnn(inputs.float())
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step() 


        current_loss += loss.item()
        category = LABELS[int(labels[0])]

        if counter % 500 == 0:
            guess, guess_i = categoryFromOutput(output)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('epoch : %d iter : %d (%s) %.4f  / %s %s' % (epoch, i, timeSince(start), loss, guess, correct))

        
        counter = counter + 1
    if counter % 100 == 0:
        all_losses.append(current_loss / 25)
        current_loss = 0    

epoch : 0 iter : 0 (0m 0s) 2.3161  / sitting down ✓
epoch : 7 iter : 17 (0m 32s) 2.3110  / cheer up ✗ (eat meal/snack)
epoch : 14 iter : 34 (1m 4s) 2.3246  / cheer up ✗ (jump up)
epoch : 21 iter : 51 (1m 36s) 2.3000  / cheer up ✓
epoch : 28 iter : 68 (2m 9s) 2.2768  / cheer up ✗ (eat meal/snack)
epoch : 36 iter : 16 (2m 42s) 2.3453  / cheer up ✗ (jump up)
epoch : 43 iter : 33 (3m 15s) 2.2967  / cheer up ✗ (clapping)
epoch : 50 iter : 50 (3m 48s) 2.3430  / cheer up ✗ (drink water)
epoch : 57 iter : 67 (4m 20s) 2.2811  / cheer up ✗ (eat meal/snack)
epoch : 65 iter : 15 (4m 53s) 2.2969  / cheer up ✗ (take off a hat/cap)
epoch : 72 iter : 32 (5m 26s) 2.3022  / cheer up ✗ (sitting down)
epoch : 79 iter : 49 (5m 59s) 2.3155  / cheer up ✗ (take off a hat/cap)
epoch : 86 iter : 66 (6m 32s) 2.2927  / cheer up ✗ (brushing teeth)
epoch : 94 iter : 14 (7m 4s) 2.3222  / cheer up ✗ (hand waving)
epoch : 101 iter : 31 (7m 37s) 2.2764  / cheer up ✗ (drink water)
epoch : 108 iter : 48 (8m 10s) 2.2948  

In [43]:
total = 0
right = 0
counter = 0

rnn.eval()
with torch.no_grad():
    for i, data in enumerate(test_loader, 0):
        counter = counter + 1
        inputs, labels = data[0].to(device), data[1].to(device)  
        output = rnn(inputs.float())
        guess, guess_i = categoryFromOutput(output)
        category = LABELS[int(labels[0])]
        
        if guess == category:
            right = right + 1


print('Accuracy of the network:  ',  (100 * right / counter))

Accuracy of the network:   66.4850136239782


Второй слой LSTM ухудшил результат. На самом деле у меня достаточно мало данных взято из датасета - всего по 120 на класс. Мне кажется, что сложную сеть легко переобучить, и если бы примеров было больше, то второй слой улучшил бы результат.