# 1. Import and Install Dependencies

In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
import time
import shutil
from tqdm import tqdm
import random
import itertools
import torch
from torch_geometric_temporal.dataset import MTMDatasetLoader
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric_temporal.nn.attention import AAGCN
from torch_geometric_temporal.signal import temporal_signal_split
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
%matplotlib inline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
import string

# 2. Keypoints using MP Holistic

In [2]:
# Short aliases for the two MediaPipe modules used by the detection and drawing helpers below.
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [3]:
def mediapipe_detection(image, model):
    """Run `model.process` on a BGR frame and hand back the frame plus the model output.

    The frame is converted to RGB for the model, flagged read-only while the
    model processes it, made writeable again, and converted back to BGR.

    Returns:
        (image, results): the BGR frame after the round-trip conversion and
        whatever `model.process` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Read-only while the model runs; restored right after.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

In [4]:
def draw_landmarks(image, results):
    """Draw pose and both hand skeletons onto `image` with MediaPipe's default style.

    Face landmarks are intentionally not drawn.
    """
    layers = (
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS),        # pose
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),   # left hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),  # right hand
    )
    for landmark_list, connections in layers:
        mp_drawing.draw_landmarks(image, landmark_list, connections)

In [5]:
def draw_styled_landmarks(image, results):
    """Draw pose and both hand skeletons onto `image` using per-part colours.

    Face landmarks are intentionally not drawn. Each entry below lists the
    landmark list, its connection set, the colour used for the landmark
    points and the colour used for the connecting lines.
    """
    styled_layers = (
        # pose
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS, (80,22,10), (80,44,121)),
        # left hand
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (121,22,76), (121,44,250)),
        # right hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, (245,117,66), (245,66,230)),
    )
    for landmark_list, connections, point_color, line_color in styled_layers:
        point_spec = mp_drawing.DrawingSpec(color=point_color, thickness=2, circle_radius=4)
        line_spec = mp_drawing.DrawingSpec(color=line_color, thickness=2, circle_radius=2)
        mp_drawing.draw_landmarks(image, landmark_list, connections, point_spec, line_spec)

# setup meta parameters

In [6]:
# Number of frames recorded per video sequence; also the frame dimension of the
# landmark tensors built in the capture cell.
sequence_length = 24

In [7]:
# Number of video sequences recorded per run of the capture cell.
no_sequences = 1

In [8]:
# Length of the countdown shown before each recording, measured in camera frames
# (the capture cell loops `range(pause_time)`), not seconds.
# NOTE(review): a later cell reassigns this to 40 before the capture cell runs.
pause_time = 30

In [9]:
# Height/width crop margins for camera frames.
# NOTE(review): no active code in this notebook reads these values.
hcrop = 0
wcrop = 0

In [10]:
# Collect the file names found under ./test_data. Each directory visited by
# os.walk overwrites `a`, so `a` ends up holding the files of the last one.
# NOTE(review): later cells index `a` with model output indices, so this order
# must match the class order the model was trained with; os.walk does not
# guarantee a sorted order.
a = []
for _dirpath, _dirnames, filenames in os.walk('./test_data'):
    a = filenames
# Class labels: file names with their last three characters (".pt") removed.
labels = [filename[:-3] for filename in a]

In [11]:
# Graph nodes per frame: 21 left-hand + 19 pose + 21 right-hand landmarks,
# matching the tensors concatenated in the capture cell.
nodes = 61
# Number of files discovered in ./test_data.
wordsz=len(a)
# Same value as sequence_length; not referenced elsewhere in the visible notebook.
frames=24
# Output size of the classifier head in CoolGCN.
classes=wordsz
# NOTE(review): not referenced elsewhere in the visible notebook; the inference
# cells hard-code 5 in torch.topk.
topn=5

In [12]:
# Display the number of files (classes) discovered in ./test_data.
wordsz

109

# Utility Functions


In [13]:
def tensor_push(tensor, x):
    """Drop the first entry of `tensor` along dim 0 and append `x` at the end.

    Returns a new tensor; neither argument is modified.
    """
    without_oldest = tensor[1:]
    return torch.cat([without_oldest, x], dim=0)

In [14]:
# Softmax over dim 0; getGCNpred applies it to the slice y_hat[0][0] of the model output.
softmax = torch.nn.Softmax(dim=0)

# ML part

## Importing Electra Model

In [15]:
# Load the pretrained ELECTRA small generator: its tokenizer and its masked-LM
# head, with the model switched to evaluation mode via .eval().
# NOTE(review): this import sits outside the notebook's import cell.
from transformers import ElectraTokenizer, ElectraForMaskedLM
electra_tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
electra_model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator').eval()

## Importing AAGCN Model

In [16]:
#load edge index tensor
edge_index=torch.load("cool_shit/new_edge_index.pt")
edge_index=edge_index.type(torch.LongTensor)

In [17]:
class CoolGCN(torch.nn.Module):
    """Stack of ten AAGCN layers followed by dropout and a linear classifier.

    Args:
        node_features: value passed as the fourth positional argument of every
            AAGCN layer (the number of graph nodes); it is also the input
            width of the final linear layer.

    Relies on the notebook-level `edge_index` tensor and `classes` count.
    Attribute names (r1..r10, linear, dropout) are kept as-is so that saved
    state dicts continue to load.
    """

    def __init__(self, node_features):
        super(CoolGCN, self).__init__()
        self.r1 = AAGCN(3,32, torch.LongTensor(edge_index), node_features)
        self.r2 = AAGCN(32, 32,  torch.LongTensor(edge_index), node_features)
        self.r3 = AAGCN(32, 64,  torch.LongTensor(edge_index), node_features,stride =2)
        self.r4 = AAGCN(64, 64,  torch.LongTensor(edge_index), node_features)
        self.r5 = AAGCN(64, 128,  torch.LongTensor(edge_index), node_features,stride = 2)
        self.r6 = AAGCN(128, 128,  torch.LongTensor(edge_index), node_features)
        self.r7 = AAGCN(128, 256,  torch.LongTensor(edge_index), node_features,stride = 2)
        self.r8 = AAGCN(256, 256,  torch.LongTensor(edge_index), node_features)
        self.r9 = AAGCN(256, 768,  torch.LongTensor(edge_index), node_features,stride = 3)
        self.r10 = AAGCN(768, 768,  torch.LongTensor(edge_index), node_features)
        # The classifier consumes the node axis, so size it from the constructor
        # argument rather than the global `nodes`; identical when the class is
        # built with node_features=nodes, as this notebook does.
        self.linear = torch.nn.Linear(node_features, classes)
        self.dropout = torch.nn.Dropout(0.4)

    def forward(self, x):
        """Apply the ten AAGCN layers in order, average over dim 1, then classify.

        Returns the linear layer's output (unnormalised scores over `classes`).
        """
        h = x
        for layer in (self.r1, self.r2, self.r3, self.r4, self.r5,
                      self.r6, self.r7, self.r8, self.r9, self.r10):
            h = layer(h)
        # presumably dim 1 is the channel axis of the AAGCN output — TODO confirm
        h = h.mean(1)
        h = self.dropout(h)
        h = self.linear(h)
        return h

In [18]:
#check cuda availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#load model to device
model = CoolGCN(node_features = nodes).to(device)
#load model from file
model.load_state_dict(torch.load("./models/fulledgemodel.pth"))

<All keys matched successfully>

In [19]:
# Map every class word to the id of the ELECTRA vocabulary entry whose decoded
# text (whitespace removed) equals the lower-cased word.
#
# The vocabulary is decoded once into a reverse lookup instead of re-decoding
# up to 9,999,999 ids for every word. setdefault keeps the lowest id for a
# given text, i.e. the same id the original first-match scan would pick.
decoded_to_id = {}
for token_id in range(len(electra_tokenizer)):
    decoded = ''.join(electra_tokenizer.decode(token_id).split())
    decoded_to_id.setdefault(decoded, token_id)

electrakeys = dict()
for w in a:
    # 't-shirt' is looked up as 'shirt' but stored under its own name.
    if w == 't-shirt.pt':
        word = 'shirt.pt'
    else:
        word = w
    lookup = word[:-3].lower()
    # Words without a matching vocabulary entry are left out of the mapping.
    if lookup in decoded_to_id:
        electrakeys[w[:-3]] = decoded_to_id[lookup]

In [20]:
def decode(tokenizer, pred_idx, top_clean):
    """Decode predicted token ids into a newline-separated list of clean tokens.

    Args:
        tokenizer: object whose `decode(id)` returns the token text.
        pred_idx: iterable of token ids, in ranking order.
        top_clean: maximum number of tokens to keep after filtering.

    Returns:
        The first `top_clean` surviving tokens joined by newlines. Tokens that
        are empty, a single punctuation character, or '[PAD]' are skipped, and
        any '##' in a token is removed.
    """
    # Exact-match set: the previous substring test against one long string also
    # discarded legitimate tokens such as 'A', 'PA' or '['-prefixed fragments.
    ignore_tokens = set(string.punctuation) | {'[PAD]'}
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token and token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])

In [21]:
def encode(tokenizer, text_sentence, add_special_tokens=True):
    """Tokenize a sentence containing '<mask>' and locate the mask position.

    '<mask>' is replaced with the tokenizer's own mask token. When the mask is
    the final whitespace-separated word, ' .' is appended so the model is not
    pushed toward predicting punctuation.

    Returns:
        (input_ids, mask_idx): a 1 x N tensor of token ids and the column index
        of the first mask token in it.
    """
    mask_token = tokenizer.mask_token
    prepared = text_sentence.replace('<mask>', mask_token)
    if prepared.split()[-1] == mask_token:
        prepared = prepared + ' .'

    token_ids = tokenizer.encode(prepared, add_special_tokens=add_special_tokens)
    input_ids = torch.tensor([token_ids])
    _, mask_columns = torch.where(input_ids == tokenizer.mask_token_id)
    return input_ids, mask_columns.tolist()[0]

In [22]:
def _push_axes(history, frame):
    """Slide each of the three axis windows in `history` forward by one frame."""
    for axis in range(3):
        history[axis] = tensor_push(history[axis], frame[axis])


def _fill_xyz(frame, slot, landmark):
    """Write one landmark's x, y, z into column `slot` of a (3, 1, N) frame."""
    frame[0][0][slot] = landmark.x
    frame[1][0][slot] = landmark.y
    frame[2][0][slot] = landmark.z


def getDataPoints(result,posepoints,leftpoints,rightpoints,poselist,rightlist,leftlist,flag):
    """Append the current frame's landmarks to the rolling pose/hand windows.

    For each body part a (3, 1, N) frame of x/y/z coordinates is built (all
    zeros when the part was not detected) and pushed onto the matching window
    with tensor_push, dropping that window's oldest frame.

    Pose uses 19 slots: the first pose landmarks fill slots 0..18, then slots
    17 and 18 are overwritten with pose landmarks 23 and 24. Each hand uses
    21 slots.

    Args:
        result: detection result exposing pose_landmarks, left_hand_landmarks
            and right_hand_landmarks (truthy when detected).
        posepoints, leftpoints, rightpoints: indexable landmark sequences whose
            items expose .x, .y and .z.
        poselist, rightlist, leftlist: per-axis windows, indexable by 0..2;
            updated in place.
        flag: counter, increased by one for every hand missing in this frame.

    Returns:
        (poselist, rightlist, leftlist, flag) — note that right comes before left.
    """
    pose_frame = torch.zeros(3, 1, 19)
    if result.pose_landmarks:
        for slot in range(min(len(posepoints), 19)):
            _fill_xyz(pose_frame, slot, posepoints[slot])
        # The last two slots hold landmarks 23 and 24 instead of 17 and 18.
        _fill_xyz(pose_frame, 17, posepoints[23])
        _fill_xyz(pose_frame, 18, posepoints[24])
    _push_axes(poselist, pose_frame)

    hands = (
        (result.left_hand_landmarks, leftpoints, leftlist),
        (result.right_hand_landmarks, rightpoints, rightlist),
    )
    for detected, points, history in hands:
        hand_frame = torch.zeros(3, 1, 21)
        if detected:
            for slot in range(len(points)):
                _fill_xyz(hand_frame, slot, points[slot])
        else:
            flag += 1
        _push_axes(history, hand_frame)

    return (poselist,rightlist,leftlist,flag)

In [23]:
def getprediction(electra_tokenizer, sentence):
    """Run the masked-LM on `sentence` and return its scores with the mask position.

    The sentence is tokenized with `encode` (special tokens added) and passed
    through the notebook-level `electra_model` without gradient tracking.

    Returns:
        (predict, mask_idx): the first element of the model output and the
        index of the mask token within the encoded input.
    """
    token_ids, mask_position = encode(electra_tokenizer, sentence, add_special_tokens=True)
    with torch.no_grad():
        model_output = electra_model(token_ids)
    return model_output[0], mask_position

In [24]:
def getGCNpred(biglist):
    """Return class probabilities from the notebook-level `model` for `biglist`.

    The input is moved to the device holding the model's parameters, the model
    is run without gradient tracking, and softmax (dim 0) is applied to the
    slice `y_hat[0][0]` of its output.

    Args:
        biglist: input tensor for the model; the capture cell builds it with
            shape (no_sequences, 3, sequence_length, 61).

    Returns:
        The softmax of `y_hat[0][0]`.
    """
    # Follow the model's device so callers may pass CPU or GPU tensors alike.
    model_device = next(model.parameters()).device
    # Inference only: skip building an autograd graph.
    with torch.no_grad():
        y_hat=model(biglist.to(model_device))
    y_hat = softmax(y_hat[0][0])
    return y_hat

In [25]:
# NOTE(review): this flag is not read by any cell in the visible notebook.
DONT_USE_NLP = True

In [26]:
# Overrides the earlier pause_time = 30: the countdown before recording now lasts 40 camera frames.
pause_time = 40

In [28]:
# Word being recorded; shown in the capture overlay text.
action = 'them'

In [102]:
# Record `no_sequences` clips of `sequence_length` frames from the default
# camera, then convert the MediaPipe results of every clip into `biglist`.
stop=False
# All-zero placeholders; not read below, retained in case other cells reference them.
nopose=torch.zeros(3,sequence_length,19)
noleft=torch.zeros(3,sequence_length,21)
noright=torch.zeros(3,sequence_length,21)

# Working buffers indexed [axis (x, y, z)][frame][landmark].
poselist=torch.zeros(3,sequence_length,19)
leftlist=torch.zeros(3,sequence_length,21)
rightlist=torch.zeros(3,sequence_length,21)
# One (3, sequence_length, 61) block per recorded clip.
biglist=torch.zeros(no_sequences,3,sequence_length,61)
cap = cv2.VideoCapture(0)
bigresults=[]
tempresults=[]
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        sequence=0
        while(sequence<no_sequences):
            # Countdown: show the live feed for `pause_time` frames; 'n' skips ahead.
            for frame_pause in range(pause_time):
                ret, frame = cap.read()
                if not ret:
                    # No frame available (camera missing or disconnected).
                    print('Could not read a frame from the camera; stopping capture.')
                    stop=True
                    break
                image, results = mediapipe_detection(frame, holistic)

                draw_styled_landmarks(image, results)
                image = cv2.flip(image,1)
                cv2.putText(image, 'STARTING COLLECTION', (250,300),
                           cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                cv2.putText(image, str((pause_time-frame_pause)//10+1), (350,350),
                           cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
                           cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                image = cv2.resize(image, (1280,960),interpolation=cv2.INTER_CUBIC)
                cv2.imshow('OpenCV Feed', image)
                if cv2.waitKey(10) & 0xFF == ord('n'):
                    break
            if stop:
                break

            # Record `sequence_length` frames; 'q' aborts and discards this clip.
            tempresults = []
            for frame_num in range(sequence_length):
                ret, frame = cap.read()
                if not ret:
                    print('Could not read a frame from the camera; stopping capture.')
                    stop=True
                    break
                image, results = mediapipe_detection(frame, holistic)
                tempresults.append(results)
                draw_styled_landmarks(image, results)
                image = cv2.flip(image,1)
                cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
                           cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                cv2.putText(image, 'RECORDING', (640,650),
                           cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 5, cv2.LINE_AA)
                cv2.putText(image, str((sequence_length-frame_num)//10+1), (870,650),
                           cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 5, cv2.LINE_AA)

                image = cv2.resize(image, (1280,960),interpolation=cv2.INTER_CUBIC)
                cv2.imshow('OpenCV Feed', image)
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    stop=True
                    break
            if stop:
                break
            bigresults.append(tempresults)
            sequence+=1
finally:
    # Always free the camera and close the preview window, even after an error.
    cap.release()
    cv2.destroyAllWindows()

# Convert the stored detections of every completed clip into coordinate tensors.
for sequence,results in enumerate(bigresults):
    for frame,result in enumerate(results):
        if result.pose_landmarks:
            posepoints=result.pose_landmarks.landmark
            for i in range(min(len(posepoints),19)):
                poselist[0][frame][i]=posepoints[i].x
                poselist[1][frame][i]=posepoints[i].y
                poselist[2][frame][i]=posepoints[i].z
            # Slots 17 and 18 are overwritten with pose landmarks 23 and 24.
            poselist[0][frame][17]=posepoints[23].x; poselist[0][frame][18]=posepoints[24].x
            poselist[1][frame][17]=posepoints[23].y; poselist[1][frame][18]=posepoints[24].y
            poselist[2][frame][17]=posepoints[23].z; poselist[2][frame][18]=posepoints[24].z
        else:
            # Clear all 19 slots of this frame. The buffers are reused across
            # clips, so stale values must be reset. (Assigning torch.zeros(18)
            # here raised a shape error because each row holds 19 values.)
            poselist[:,frame]=0
        if result.left_hand_landmarks:
            leftpoints=result.left_hand_landmarks.landmark
            for i in range(len(leftpoints)):
                leftlist[0][frame][i]=leftpoints[i].x
                leftlist[1][frame][i]=leftpoints[i].y
                leftlist[2][frame][i]=leftpoints[i].z
        else:
            leftlist[:,frame]=0
        if result.right_hand_landmarks:
            rightpoints=result.right_hand_landmarks.landmark
            for i in range(len(rightpoints)):
                rightlist[0][frame][i]=rightpoints[i].x
                rightlist[1][frame][i]=rightpoints[i].y
                rightlist[2][frame][i]=rightpoints[i].z
        else:
            rightlist[:,frame]=0

    # Node order along the last axis: left hand (21), pose (19), right hand (21).
    biglist[sequence]=torch.cat((leftlist,poselist,rightlist),2)

# torch.save(biglist,"test_data/"+action+".pt")

In [104]:
# Classify the recorded clip with the GCN, then let ELECTRA pick, among the
# GCN's five most likely words, the one that best fits the masked sentence.
sent = 'Place super glue underneath the rim of the jar lid and seal it down tight so the items won\'t <mask>'

# Follow the device chosen when the model was loaded instead of forcing CUDA,
# so the cell also runs on CPU-only machines.
biglist = biglist.to(device)
y_hat= getGCNpred(biglist)
top5=torch.topk(y_hat,5)
top_word_idx = top5.indices.tolist()
print('GCN top-5 Predictions:')
for widx in top_word_idx:
    print(a[widx][:-3])

print('\nNLP Scores for each word:')
# Masked-LM scores for the sentence and the position of the mask token.
predict, mask_idx = getprediction(electra_tokenizer, sent)

top_word_values = []
top_words = []
for widx in top_word_idx:
    w = a[widx][:-3]
    top_words.append(w)
    # Score ELECTRA assigns to this word at the masked position.
    top_word_values.append(predict[0, mask_idx, :][electrakeys[w]])
    print(w, top_word_values[-1])

predwordidx = int(torch.argmax(torch.stack(top_word_values),dim=0))
word2add = top_words[predwordidx]

print()
print('final result: '+ word2add)

GCN top-5 Predictions:
down
among
increase
spill
try

NLP Scores for each word:
down tensor(7.6291)
among tensor(-1.9091)
increase tensor(6.5163)
spill tensor(13.6828)
try tensor(6.7440)

final result: spill


In [362]:
# Inspect the raw top-5 result: softmax values and the matching class indices.
top5

torch.return_types.topk(
values=tensor([9.5400e-01, 4.5979e-02, 6.6823e-06, 4.4749e-06, 3.5661e-06],
       device='cuda:0', grad_fn=<TopkBackward>),
indices=tensor([ 86,   2,  98,  85, 101], device='cuda:0'))

# Comparing GCN vs GCN+NLP

In [None]:
# NOTE(review): repeats the import and model loading already done in the
# "Importing Electra Model" cell; re-running it rebinds the same names.
from transformers import ElectraTokenizer, ElectraForMaskedLM
electra_tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
electra_model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator').eval()

def encode(tokenizer, text_sentence, add_special_tokens=True):
    """Tokenize a sentence containing '<mask>' and locate the mask position.

    '<mask>' is replaced with the tokenizer's own mask token. When the mask is
    the final whitespace-separated word, ' .' is appended so the model is not
    pushed toward predicting punctuation.

    Returns:
        (input_ids, mask_idx): a 1 x N tensor of token ids and the column index
        of the first mask token in it.
    """
    mask_token = tokenizer.mask_token
    prepared = text_sentence.replace('<mask>', mask_token)
    if prepared.split()[-1] == mask_token:
        prepared = prepared + ' .'

    token_ids = tokenizer.encode(prepared, add_special_tokens=add_special_tokens)
    input_ids = torch.tensor([token_ids])
    _, mask_columns = torch.where(input_ids == tokenizer.mask_token_id)
    return input_ids, mask_columns.tolist()[0]

In [272]:
# Collect the file names found under ./test_sents; each directory visited by
# os.walk overwrites `b`, so it ends up holding the files of the last one.
b = []
for _dirpath, _dirnames, filenames in os.walk('./test_sents'):
    b = filenames

In [292]:
# Map every class word to the id of the ELECTRA vocabulary entry whose decoded
# text (whitespace removed) equals the lower-cased word.
#
# The vocabulary is decoded once into a reverse lookup instead of re-decoding
# up to 9,999,999 ids for every word. setdefault keeps the lowest id for a
# given text, i.e. the same id the original first-match scan would pick.
decoded_to_id = {}
for token_id in range(len(electra_tokenizer)):
    decoded = ''.join(electra_tokenizer.decode(token_id).split())
    decoded_to_id.setdefault(decoded, token_id)

electrakeys = dict()
for w in a:
    # 't-shirt' is looked up as 'shirt' but stored under its own name.
    if w == 't-shirt.pt':
        word = 'shirt.pt'
    else:
        word = w
    lookup = word[:-3].lower()
    # Words without a matching vocabulary entry are left out of the mapping.
    if lookup in decoded_to_id:
        electrakeys[w[:-3]] = decoded_to_id[lookup]

In [350]:
# Read the evaluation sentences, one per line; the context manager closes the
# file handle as soon as the text has been read.
with open("sents.txt", "r") as sents:
    sentences = sents.read().split('\n')

In [351]:
# Compare two strategies over every "<sentence>; <true word>" line:
#   nlpcount  - hits when ELECTRA re-ranks the GCN's top-5 candidates
#   noobcount - hits when the GCN's top-1 candidate is used directly
nlpcount = 0
noobcount =0
total = 0
for sentence in sentences:
    # Skip blank lines (e.g. a trailing newline at the end of the file).
    if not sentence.strip():
        continue
    # The label follows the last '; ', so the sentence itself may contain '; '.
    sent,true =  sentence.rsplit('; ', 1)

    # Load the stored clip for the true word onto the model's device.
    testW = torch.load('./test_sents/'+true+'.pt', map_location=device)
    y_hat= getGCNpred(testW)
    top5=torch.topk(y_hat,5)

    # Masked-LM scores for the sentence and the position of the mask token.
    predict, mask_idx = getprediction(electra_tokenizer, sent)

    top_word_idx = top5.indices.tolist()
    top_word_values = []
    top_words = []
    for widx in top_word_idx:
        w = a[widx][:-3]
        top_words.append(w)
        top_word_values.append(predict[0, mask_idx, :][electrakeys[w]])

    predwordidx = int(torch.argmax(torch.stack(top_word_values),dim=0))
    word2add = top_words[predwordidx]

    if word2add == true:
        nlpcount+=1

    top1w = a[top_word_idx[0]][:-3]

    if top1w == true:
        noobcount+=1

    total+=1

In [352]:
# Report both hit rates as percentages rounded to two decimals.
nlp_accuracy = round(nlpcount / total * 100, 2)
gcn_accuracy = round(noobcount / total * 100, 2)
print(f"GCN + NLP Combination: {nlp_accuracy}")
print(f"Only GCN Predictions: {gcn_accuracy}")

GCN + NLP Combination: 97.87
Only GCN Predictions: 78.72
