In [None]:
"""Creating one unified .csv file containing frames from labeled data made via DeepLabCut"""

In [None]:
import os
import json
import re
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from torch_geometric_temporal.nn.recurrent import DCRNN, A3TGCN, GConvLSTM,TGCN
from torch_geometric_temporal.signal import StaticGraphTemporalSignal
import networkx as nx
import torch_geometric



In [None]:
# Target device for the model; fixed to the CPU.
device = torch.device("cpu")

In [None]:
# Input: the ``labeled-data`` folder of the DeepLabCut project.
# Output: the merged CSV file.
# The input path is built with os.path.join so that it also resolves on POSIX
# systems, where a backslash is not a path separator.
labeled_data_path = os.path.join("dlc_project_2022_08_02", "labeled-data")
dst_path = 'new_ds_1.csv'

In [None]:
class CreateUnifiedCsv(object):

    """Class to transform many csv files created by DeepLabCut labeling into one, unified CSV which
    describes frames and source videos.
    If a frame contains a missing label, the frame will NOT be included in the merged dataset.

    Rows of the written file, in order:
        vid_id_<n>  - name of the n-th video folder, repeated in every column
        bodyparts   - row 0 of the first video's csv (written once)
        coords      - row 1 of the first video's csv (written once)
        <frame>     - one row per fully labeled frame of the current video, labeled
                      with the value found in the third column of the source csv
    """

    def __init__(self) -> None:

        pass

    def make_unified_dataset(self, source_path : str, dst_path : str) -> None:
        """Method to create .csv file containing the dataset.

            Args types:
            source_path (str) - Location of labeled-data folder in the DeepLabCut project.
            dst_path (str) - Path where the file should be saved (ending with filename.csv).

            Raises:
            ValueError - if the csv files of two videos have a different number of columns.

            Entries of ``source_path`` that are not directories, and directories
            without a .csv file, are skipped.
        """
        self.source_path = source_path
        self.dst_path = dst_path
        # Rows are collected in lists instead of a dict keyed by the frame name:
        # two videos may contain a frame with the same file name, and a dict would
        # silently replace the earlier frame with the later one.
        row_labels = []
        row_values = []
        n_columns = None
        video_count = 0

        # Sorted so the output does not depend on the directory listing order.
        for folder in sorted(os.listdir(self.source_path)):

            folderpath = os.path.join(self.source_path, folder)
            if not os.path.isdir(folderpath):
                continue

            csv_file = next(
                (name for name in sorted(os.listdir(folderpath)) if name.lower().endswith('.csv')),
                None,
            )
            if csv_file is None:
                print(f"Skipping folder without a .csv file: {folder}")
                continue

            labeled_data = pd.read_csv(os.path.join(folderpath, csv_file))
            # Columns 0-2 are treated as frame identification (column 2 becomes the
            # row label), every following column as a label value.
            width = labeled_data.shape[1] - 3

            if n_columns is None:
                n_columns = width
            elif width != n_columns:
                raise ValueError(
                    f"{folder}: expected {n_columns} label columns but found {width}"
                )

            # Marker row announcing that the following frames belong to `folder`.
            row_labels.append(f'vid_id_{video_count}')
            row_values.append([folder] * n_columns)

            if video_count == 0:
                # The two header rows are taken from the first video only.
                row_labels.append('bodyparts')
                row_values.append(labeled_data.iloc[0, 3:].tolist())
                row_labels.append('coords')
                row_values.append(labeled_data.iloc[1, 3:].tolist())

            video_count += 1

            for _, row in labeled_data.iloc[2:].iterrows():

                coordinates = row.iloc[3:]
                if coordinates.isna().any():
                    print("Skipping bad index")
                    continue

                row_labels.append(row.iloc[2])
                row_values.append(coordinates.tolist())

        new_df = pd.DataFrame(row_values, index=row_labels)
        new_df.to_csv(self.dst_path)
            

In [None]:
# Merge the labeled DeepLabCut folders under `labeled_data_path` into the
# single csv file `dst_path`.
new_ds_csv_merger = CreateUnifiedCsv()
new_ds_csv_merger.make_unified_dataset(
    source_path=labeled_data_path,
    dst_path=dst_path,
)

In [16]:
class CreateJsonDataset(object):

    """Dataset consisting of key feature points coordinates. Data is extracted from
    a unified .csv file (one marker row per video followed by that video's frames).
    The name of every video has to start with the age of a dog followed by an
    underscore (Example: adult_1). Every set of keypoints for a given frame is
    associated with the corresponding age group label.

    The written .json file has four keys:
        edges    - list of [i, j] pairs over all node indices (including i == j)
        node_ids - mapping bodypart name -> node index
        features - mapping video name -> list of frames, each a list of [x, y] pairs
        labels   - mapping video name -> text before the first underscore of the name
    """

    def __init__(self)-> None:

        pass

    def _extract_features_and_timesteps(self) -> None:
        # Method to extract features and labels from the unified .csv file.
        # Row 0 names the first video, rows 1-2 are header rows and are skipped.
        # From row 3 on, a row whose cells are all numeric is a frame of the current
        # video; a row whose cells cannot be parsed as numbers is the marker row of
        # the next video.
        #
        # Raises ValueError when a frame row contains a NaN coordinate.
        label_list = {}
        data_list = {}
        labeled_data = pd.read_csv(self.path_to_dataset)
        vid_name = None

        for index, row in labeled_data.iterrows():

            if index == 0:

                vid_name = row.iloc[1]
                data_list[vid_name] = []
                label_list[vid_name] = re.split("_",vid_name)[0]

            if index < 3: ## skip first rows
                continue

            # Drop the row-label column; an unpaired trailing cell is ignored.
            cells = row.values[1:]
            cells = cells[:len(cells) - len(cells) % 2]

            try:
                coords = [round(float(cell), 4) for cell in cells]
            except (TypeError, ValueError):
                ## a name of the next video was found in the row, a new label is assigned
                vid_name = cells[1]
                data_list[vid_name] = []
                label_list[vid_name] = re.split("_",vid_name)[0]
                continue

            # Checked outside the try block so that a missing coordinate is reported
            # instead of being mistaken for a video marker row.
            if any(np.isnan(coord) for coord in coords):
                raise ValueError(f"Nan coordinate found in row: {index+1}")

            tuple_list = list(zip(coords[0::2], coords[1::2])) ## (x, y) pairs

            if tuple_list:

                data_list[vid_name].append(tuple_list)

        self._features = data_list
        self._labels = label_list

    def _assign_bodypart_to_node(self) -> None:
        # Method to assign bodypart name to a node
        dataframe = pd.read_csv(self.path_to_dataset) ## read the dataset
        bodypart_row = dataframe.iloc[1, 1:].tolist() # extract row containing bodypart names

        # Every bodypart occupies two neighbouring columns, so every second entry is
        # taken; the node number is the position of the bodypart in that sequence.
        node_dict = {}
        for n, bodypart in enumerate(bodypart_row[1::2]):
            node_dict[bodypart] = n

        self._node_ids = node_dict

    def _create_fc_graph_edges(self) -> None:
        # Method to create fully connected undirected graph (every ordered pair of
        # nodes, self-pairs included).
        # Can only be called after _assign_bodypart_to_node() is called
        num_nodes = len(self._node_ids)
        self._edges = [(i, j) for i in range(num_nodes) for j in range(num_nodes)]

    def create_json_dataset(self, path_to_csv : str, save_path : str) -> None:
        """Method to create .json file containing the dataset.

            Args types:
            path_to_csv (str) - path to csv file containing dataset
            save_path (str) - path where the file should be saved (ending with filename.json)

            Raises:
            ValueError - if a frame row of the csv contains a NaN coordinate.
        """
        self.path_to_dataset = path_to_csv
        self._extract_features_and_timesteps()
        self._assign_bodypart_to_node()
        self._create_fc_graph_edges()

        create_graph_ds = {
        "edges" : self._edges,
        "node_ids" : self._node_ids,
        "features" : self._features,
        "labels" : self._labels
        }

        with open(save_path, "w") as outfile:
            json.dump(create_graph_ds, outfile)

In [None]:
#TODO Add a constructor argument that lets the user choose between edges created independently
# (in the form extracted from the nx graph in Dr. Wielgosz's code) and the edges provided in the json dataset (fully connected undirected graph).
#TODO Add a constructor argument that defines whether the user wants to flatten the graph (if the features have more than 1 dimension, create
# a separate node for every dimension, e.g. a tuple is mapped into two separate, connected nodes). The user can still pass their own adjacency
# matrix, which will be reshaped into the one corresponding to the flattened graph.

In [19]:
class JsonDatasetLoader(object):

    """Creates a loader for Dog Age keypoints video dataset.

     Args types:
            path_to_dataset (str) - path to .json dataset
            one_hot (bool) - whether to use one hot label encoding
            sparse (bool) - whether to use sparse (ordinal) label encoding
            timestep (int) - number of frames in a single sample
            overlap (int) - amount of overlapping consecutive frames between the clips;
            must be smaller than timestep
            faltten_features (bool) - whether to flatten every single node feature into separate node
            (resulting in number of nodes equal to num_nodes*len(node_features)).
            custom_edges : (np.ndarray) - array of shape [num_edges,2] containing pairs of integers
            describing connections between the corresponding nodes; every integer must be a valid
            node index of the dataset

     Raises:
            AttributeError - if both one_hot and sparse are requested
            ValueError - if timestep < 1 or overlap >= timestep
            AssertionError - if custom_edges has a wrong shape or references unknown nodes
    """
    def __init__(self, 
                 path_to_dataset : str,
                 one_hot : bool = False, 
                 sparse : bool = True,
                 timestep : int = 4, 
                 overlap : int = 0,
                 faltten_features : bool = False,
                 custom_edges : np.ndarray = None) -> None:

        if  (one_hot and sparse):
            raise AttributeError("Cannot get one hot and sparse labels at once!")
        if timestep < 1:
            raise ValueError("timestep must be at least 1!")
        if overlap >= timestep:
            # The window start advances by timestep - overlap, which must be positive.
            raise ValueError("overlap must be smaller than timestep!")

        self._path_to_dataset = path_to_dataset
        self._timestep = timestep
        self._overlap = overlap
        self._sparse = sparse
        self._one_hot = one_hot
        self._flatten_features = faltten_features
        if custom_edges is not None:
            custom_edges = np.asarray(custom_edges)
        self._custom_edges = custom_edges
        self._open_json_dataset() 
        # `is not None` instead of truthiness: the truth value of an array with more
        # than one element is ambiguous and raises.
        if custom_edges is not None:
            num_of_nodes = len(self._dataset["node_ids"])
            assert custom_edges.ndim == 2, "Custom edges must have 2 dimensions!"
            assert custom_edges.shape[1] == 2, "2-nd dimension of custom edges must be 2!"
            # The number of edges is independent of the number of nodes, so the node
            # indices themselves are validated against the dataset.
            assert custom_edges.size == 0 or (
                custom_edges.min() >= 0 and custom_edges.max() < num_of_nodes
            ), ("Dataset was created for "
                "different number of nodes than specified in custom edges!")

    def _open_json_dataset(self):
        # Load the .json dataset into memory; the file is closed right after reading.
        with open(self._path_to_dataset) as dataset_file:
            self._dataset = json.load(dataset_file)

    def _get_edges(self):
        # Store the edge list as an array of shape [2, num_edges] in self._edges.

        if self._custom_edges is not None:
            if self._flatten_features:
                # Previously this combination left self._edges undefined.
                raise NotImplementedError(
                    "Custom edges cannot be combined with flattened features yet!"
                )
            self._edges = self._custom_edges.T
            return

        if not self._flatten_features:
            self._edges = np.array(self._dataset["edges"]).T
            return

        # Flattened graph: one node per (node, feature) pair, fully connected.
        num_of_nodes = len(self._dataset["node_ids"])
        first_video = list(self._dataset["features"].values())[0]
        num_of_features = len(first_video[0][0]) # length of one node's feature tuple
        G = nx.complete_graph(num_of_nodes*num_of_features)
        self._edges = torch_geometric.utils.from_networkx(G).edge_index.numpy()

    def _get_edge_weights(self):
        # Unweighted graph: every edge gets weight 1.
        self._edge_weights = np.ones(self._edges.shape[1])

    def _get_labels_and_features(self):
        # Cut every video into clips of `timestep` frames and pair every clip with
        # the label of its video. Fills self._features and self._labels.

        features_dict = self._dataset["features"]
        labels_dict = self._dataset["labels"]
        stride = self._timestep - self._overlap

        self._features = []
        self._labels = []

        for key, frames in features_dict.items():

            frames_array = np.array(frames)
            if frames_array.ndim != 3:
                # A video without any frame cannot produce a clip.
                continue

            # [frames, nodes, features] -> [nodes, features, frames]
            features_reshaped = frames_array.transpose(1, 2, 0)
            num_of_frames = features_reshaped.shape[2]

            # "+ 1" keeps the last window that still fits completely; without it a
            # video with exactly k * timestep frames lost its final clip.
            clips = [features_reshaped[:,:,i:i+self._timestep]
            for i in range(0, num_of_frames - self._timestep + 1, stride)
            ]
            self._features.extend(clips)
            self._labels.extend([labels_dict[key]] * len(clips))

        if self._flatten_features:
            # Done once after all videos are collected. Inside the loop the clips of
            # earlier videos were reshaped again, which fails on their 2-D shape.
            self._features = [np.reshape(item,(item.shape[0]*item.shape[1], item.shape[2])) 
            for item in self._features
            ]

        if self._labels and (self._one_hot or self._sparse):

            label_column = np.array(self._labels).reshape((-1,1))

            if self._one_hot:
                # .toarray() on the default sparse output yields the dense matrix on
                # every scikit-learn version (the `sparse` keyword was renamed).
                self._labels = list(OneHotEncoder().fit_transform(label_column).toarray())
            else:
                self._labels = list(OrdinalEncoder().fit_transform(label_column))

    def get_dataset(self) -> "StaticGraphTemporalSignal":
        """Creating the Dog age video keypoints data iterator. The iterator yields static,
        unweighted graphs with bodyparts assigned to given node and label for every
        set of features. A set of features describes given clip (collection of following frames).
        Features are of shape [nodes,features,timesteps].

        Return types:
            * **dataset** *(StaticGraphTemporalSignal)* - The Dog Age Video dataset.
        """
               
        self._get_edges()
        self._get_edge_weights()
        self._get_labels_and_features()

        dataset = StaticGraphTemporalSignal(
        self._edges, self._edge_weights, self._features, self._labels
        )
        
        return dataset

In [20]:
# Number of frames per clip; also passed to the model as its node feature count.
TIMESTEP = 5
# The path is built with os.path.join: the literal "json_datasets\sample_2.json"
# contains the invalid escape sequence "\s" and only resolves on Windows.
new_loader = JsonDatasetLoader(
    os.path.join("json_datasets", "sample_2.json"),
    one_hot=False,
    sparse=True,
    timestep=TIMESTEP,
    overlap=0,
)
valid_dataset = new_loader.get_dataset()

In [None]:
# Hand-built graph made of NUM_LAYERS copies of a NODES_PER_LAYER-node skeleton.
# Presumably 18 keypoints in each of 3 consecutive frames - TODO confirm.
# Every connection is stored in both directions; the insertion order matches the
# original hand-written sequence of add_edge calls.
NODES_PER_LAYER = 18
NUM_LAYERS = 3

# Connections inside one layer, as (first, second) node numbers within the layer.
SKELETON_LINKS = [
    (10, 9), (9, 8), (8, 1),
    (13, 12), (12, 11), (11, 1),
    (1, 5), (5, 6), (6, 7),
    (1, 2), (2, 3), (3, 4),
    (1, 0), (0, 15), (0, 14),
    (16, 14), (17, 15),
]

# Pairs of layers whose corresponding nodes are connected to each other.
LAYER_LINKS = [(0, 1), (0, 2), (2, 1)]

g = nx.DiGraph()
g.add_nodes_from(list(range(0, NODES_PER_LAYER * NUM_LAYERS)))

for layer in range(0, NUM_LAYERS):
    offset = layer * NODES_PER_LAYER
    for first, second in SKELETON_LINKS:
        g.add_edge(offset + first, offset + second)
        g.add_edge(offset + second, offset + first)

for node in range(0, NODES_PER_LAYER):
    for first_layer, second_layer in LAYER_LINKS:
        first = node + first_layer * NODES_PER_LAYER
        second = node + second_layer * NODES_PER_LAYER
        g.add_edge(first, second)
        g.add_edge(second, first)

# Read the edge list through the documented `edge_index` attribute instead of
# indexing into np.array(Data), which relies on the iteration order of Data.
graph_data = torch_geometric.utils.from_networkx(g)
a = graph_data.edge_index.numpy()

In [None]:
class RecurrentGCN(torch.nn.Module):
    """Binary classifier: 1x1 convolution -> DCRNN -> linear layer -> sigmoid.

    Args:
        node_features: number of input features per node given to the DCRNN layer.
        num_nodes: number of graph nodes. Together with ``hidden_channels`` it sets
            the input size of the final linear layer; the defaults reproduce the
            previously hard-coded size of 224 (7 * 32).
        hidden_channels: number of output channels of the DCRNN layer.
    """
    def __init__(self, node_features, num_nodes=7, hidden_channels=32):
        super(RecurrentGCN, self).__init__()
        # Merges the 2 input channels of every node into a single channel.
        self.conv1d_1 = torch.nn.Conv1d(2,1,1,stride=1,padding=0)
        self.recurrent = DCRNN(node_features, hidden_channels, 1)
        self.fc1 = torch.nn.Linear(num_nodes * hidden_channels, 1)
        self.flatten = torch.nn.Flatten(start_dim=0)
        self.squeeze = torch.squeeze
    def forward(self, x, edge_index, edge_weight):
        """Return the sigmoid output for one graph snapshot.

        Assumes x has shape [num_nodes, 2, node_features] - TODO confirm against
        the dataset loader.
        """
        h = self.conv1d_1(x)
        # Only the channel axis produced by the convolution is removed; squeezing
        # every size-1 axis would also drop a node or feature axis of length 1.
        h = self.squeeze(h, dim=1)
        h = self.recurrent(h, edge_index, edge_weight)
        h = F.relu(h)
        # Concatenate all node embeddings into one vector for the linear layer.
        h = self.flatten(h)
        h = self.fc1(h)
        h = torch.sigmoid(h)
        return h

In [None]:

class ATGCN(torch.nn.Module):
    """Binary classifier: A3TGCN -> linear layer -> sigmoid.

    Args:
        node_features: number of input features per node given to the A3TGCN layer.
        periods: number of time periods given to the A3TGCN layer.
        num_nodes: number of graph nodes. Together with ``hidden_channels`` it sets
            the input size of the linear layer; the defaults reproduce the
            previously hard-coded size of 224 (7 * 32).
        hidden_channels: number of output channels of the A3TGCN layer.
    """
    def __init__(self, node_features, periods, num_nodes=7, hidden_channels=32):
        super(ATGCN, self).__init__()
        self.recurrent = A3TGCN(node_features, hidden_channels, periods)
        self.linear = torch.nn.Linear(num_nodes * hidden_channels, 1)

    def forward(self, x, edge_index, edge_weight):
        """Return the sigmoid output for one graph snapshot."""
        h = self.recurrent(x, edge_index, edge_weight)
        h = F.relu(h)
        # Concatenate all node embeddings into one vector. torch.flatten avoids
        # building a new nn.Flatten module on every forward pass.
        h = torch.flatten(h, start_dim=0)
        h = self.linear(h)
        h = torch.sigmoid(h)
        return h
        


In [None]:
from sklearn import metrics
def get_accuracy(y_true, y_prob):
    """Binary accuracy calculation.

    Probabilities above 0.5 count as class 1, probabilities at or below 0.5 as
    class 0. The thresholded values are scored against ``y_true`` with
    ``metrics.accuracy_score``.
    """
    probabilities = np.array(y_prob)
    # A value that satisfies neither comparison (NaN) is passed on unchanged.
    hard_labels = np.where(
        probabilities > 0.5,
        1,
        np.where(probabilities <= 0.5, 0, probabilities),
    )
    return metrics.accuracy_score(y_true, hard_labels)

In [None]:
from tqdm import tqdm

NUM_EPOCHS = 1000
LEARNING_RATE = 0.0001

# Alternative model: ATGCN(node_features = 2, periods = TIMESTEP).to(device)
model = RecurrentGCN(TIMESTEP).to(device)

loss_fn =  nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

model.train()

for epoch in tqdm(range(NUM_EPOCHS)):
    epoch_loss = 0.0
    preds = []
    ground_truth = []
    # One optimizer step per snapshot of the dataset.
    for snapshot in valid_dataset:
        optimizer.zero_grad()
        y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        ##loss
        loss = loss_fn(y_hat,snapshot.y)
        ##backprop
        loss.backward()
        optimizer.step()
        # .item() adds a plain float. Adding the tensor itself would keep the
        # autograd history of every snapshot alive until the end of the epoch.
        epoch_loss += loss.item()
        ## get preds & ground truth
        preds.append(y_hat.detach().numpy())
        ground_truth.append(snapshot.y.numpy())
    ## calculate acc
    acc = get_accuracy(ground_truth,preds)

    print(f'Epoch: {epoch}', f'Epoch accuracy: {acc}', f'Epoch loss: {epoch_loss}')

In [None]:
def extract_features_and_timesteps(dataset_files_path):
    """Code for debugging the method in the loader class. Not important now.

    Reads a unified .csv file. Row 0 names the first video and rows 1-2 are header
    rows. From row 3 on, a row whose cells are all numeric is a frame of the current
    video, and a row whose cells cannot be parsed as numbers names the next video.

    Args:
        dataset_files_path: path to the unified .csv file.

    Returns:
        (features, labels): ``features`` maps video name -> list of frames, each a
        list of (x, y) tuples rounded to 4 decimals; ``labels`` maps video name ->
        text before the first underscore of the name.

    Raises:
        AssertionError: if a row holds an odd number of cells after the row label.
        ValueError: if a frame row contains a NaN coordinate.
    """

    label_list = {}
    data_list = {}
    vid_name = None

    labeled_data = pd.read_csv(dataset_files_path)
    for index, row in labeled_data.iterrows():
        if index == 0:
            vid_name = row.iloc[1]
            data_list[vid_name] = []
            label_list[vid_name] = re.split("_",vid_name)[0]
        if index < 3: ## skip first rows
            continue

        cells = row.values[1:] ## drop the row-label column
        assert (len(cells))%2 == 0, f"Odd number of coordinates in row {index + 2}, maybe some coordinates are missing"

        try:
            coords = [round(float(cell), 4) for cell in cells]
        except (TypeError, ValueError):
            ## the row holds the name of the next video, not coordinates
            vid_name = cells[1]
            data_list[vid_name] = []
            label_list[vid_name] = re.split("_",vid_name)[0]
            continue

        # Checked outside the try block so that a missing coordinate is reported
        # instead of being mistaken for a video name row.
        if any(np.isnan(coord) for coord in coords):
            raise ValueError(f"Nan coordinate found in row: {index+1}")

        tuple_list = list(zip(coords[0::2], coords[1::2])) ## (x, y) pairs
        if tuple_list:
            data_list[vid_name].append(tuple_list)

    features = data_list
    labels = label_list
    return features, labels