# CNN-LSTM Seperate Models

Importing batch data

In [37]:
from pymongo import MongoClient
import pandas as pd
import tensorflow as tf
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from tensorflow.keras.models import Model , Sequential
from tensorflow.keras.layers import Conv2D , MaxPooling2D , Flatten , Dropout , Input , TimeDistributed , Dense , LSTM

In [38]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [39]:
connection = MongoClient('localhost' , 27017)
db = connection['mydb']
collection = db['Batch_Data']

In [40]:
#Extracting the data

cursor = collection.find({}) 
batch_df = pd.DataFrame(list(cursor))

In [41]:
batch_df.head()

Unnamed: 0,_id,landmarks,label
0,6729ae6b068e5f1de5e9010e,"[{'face': [[0.528167724609375, 0.2578373367231...",book
1,6729ae6b068e5f1de5e9010f,"[{'face': [[0.242395827902416, 0.2020864317403...",book
2,6729ae6b068e5f1de5e90110,"[{'face': [[0.4664744738874764, 0.219961448490...",book
3,6729ae6b068e5f1de5e90111,"[{'face': [[0.479362738886966, 0.2433891842543...",book
4,6729ae6b068e5f1de5e90112,"[{'face': [[0.492517174267378, 0.0870498852922...",book


In [42]:
len(batch_df['landmarks'][0])

30

In [43]:

#converting the string values into numeric values

label_encoder = LabelEncoder()
batch_df['label_encoded'] = label_encoder.fit_transform(batch_df['label'])


In [44]:
batch_df

Unnamed: 0,_id,landmarks,label,label_encoded
0,6729ae6b068e5f1de5e9010e,"[{'face': [[0.528167724609375, 0.2578373367231...",book,1
1,6729ae6b068e5f1de5e9010f,"[{'face': [[0.242395827902416, 0.2020864317403...",book,1
2,6729ae6b068e5f1de5e90110,"[{'face': [[0.4664744738874764, 0.219961448490...",book,1
3,6729ae6b068e5f1de5e90111,"[{'face': [[0.479362738886966, 0.2433891842543...",book,1
4,6729ae6b068e5f1de5e90112,"[{'face': [[0.492517174267378, 0.0870498852922...",book,1
...,...,...,...,...
95,6729ae6b068e5f1de5e9016d,"[{'face': [[0.09501079164702321, 0.16571470140...",candy,2
96,6729ae6b068e5f1de5e9016e,"[{'face': [[0.20198555292371412, 0.22020006891...",candy,2
97,6729ae6b068e5f1de5e9016f,"[{'face': [[0.08536609051099148, 0.21951115430...",candy,2
98,6729ae6b068e5f1de5e90170,"[{'face': [[0.5128556353301399, 0.295628729619...",candy,2


In [45]:
# # Makin sure that each landmark for video is a dictionary , i.e removig outer list

# batch_df['landmarks'] = batch_df['landmarks'].apply(lambda x : x[0] if isinstance(x,list) else x)

In [46]:
# batch_df.head()

In [47]:
len(batch_df['landmarks'][0])

30

Reshaping the Landmarks to align all the landmarks

In [48]:
def reshape_landmarks(record):

 iterations = len(record)
 
 concatenated_one_video = []

 for count in range(iterations):
 
    face_lands = np.array(record[count]['face'])
    left_hand_lands = np.array(record[count]['left_hand'])
    right_hand_lands = np.array(record[count]['right_hand'])

    #Concatinating all the landmarks to the shape (510 , 3)

    all_landmarks = np.vstack([face_lands , left_hand_lands , right_hand_lands]) # concatenated landmarks for 1 frame of the video

    concatenated_one_video.append(all_landmarks)
 
 return np.array(concatenated_one_video)

In [49]:
r1 = reshape_landmarks(batch_df['landmarks'][0])

In [50]:
r2 = reshape_landmarks(batch_df['landmarks'][1])

In [51]:
r1.shape

(30, 510, 3)

In [52]:
r2.shape

(30, 510, 3)

In [53]:
batch_df['concatenated_landmarks'] = batch_df['landmarks'].progress_apply(reshape_landmarks)

100%|██████████| 100/100 [00:00<00:00, 176.56it/s]


In [54]:
batch_df.head()

Unnamed: 0,_id,landmarks,label,label_encoded,concatenated_landmarks
0,6729ae6b068e5f1de5e9010e,"[{'face': [[0.528167724609375, 0.2578373367231...",book,1,"[[[0.528167724609375, 0.25783733672313774, -0...."
1,6729ae6b068e5f1de5e9010f,"[{'face': [[0.242395827902416, 0.2020864317403...",book,1,"[[[0.242395827902416, 0.20208643174030372, -0...."
2,6729ae6b068e5f1de5e90110,"[{'face': [[0.4664744738874764, 0.219961448490...",book,1,"[[[0.4664744738874764, 0.21996144849046956, -0..."
3,6729ae6b068e5f1de5e90111,"[{'face': [[0.479362738886966, 0.2433891842543...",book,1,"[[[0.479362738886966, 0.2433891842543816, -0.0..."
4,6729ae6b068e5f1de5e90112,"[{'face': [[0.492517174267378, 0.0870498852922...",book,1,"[[[0.492517174267378, 0.08704988529227371, -0...."


In [55]:
((batch_df['concatenated_landmarks'][5].reshape(30 , 510 , 3 ,1))[1]).shape

(510, 3, 1)

In [56]:
batch_df['concatenated_landmarks'][5].shape

(30, 510, 3)

In [57]:
batch_df['concatenated_landmarks'].progress_apply(lambda row : (row).shape )

100%|██████████| 100/100 [00:00<00:00, 100174.44it/s]


0     (30, 510, 3)
1     (30, 510, 3)
2     (30, 510, 3)
3     (30, 510, 3)
4     (30, 510, 3)
          ...     
95    (30, 510, 3)
96    (30, 510, 3)
97    (30, 510, 3)
98    (30, 510, 3)
99    (30, 510, 3)
Name: concatenated_landmarks, Length: 100, dtype: object

In [58]:
#Resphaing the data to apply conv2D filter on the input data
batch_df['concatenated_landmarks'] = batch_df['concatenated_landmarks'].progress_apply(lambda row : row.reshape(30 , 510 , 3 , 1) )

100%|██████████| 100/100 [00:00<00:00, 99888.16it/s]


In [59]:
batch_df['concatenated_landmarks'][88].shape

(30, 510, 3, 1)

In [60]:
one_hot_encoded_labels = tf.keras.utils.to_categorical(batch_df['label_encoded'])

In [61]:
type(one_hot_encoded_labels)

numpy.ndarray

In [62]:
one_hot_encoded_labels

array([[0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0.

Splitting the Data into Train and Test set

In [63]:
X = batch_df['concatenated_landmarks']
Y = one_hot_encoded_labels

In [64]:
# using 75 - 25 train - test split

X_train , X_test , Y_train , Y_test = train_test_split(X , Y , test_size= 0.25 , shuffle= True , random_state= 42 )

In [65]:
X_train = np.array(X_train.tolist())

In [66]:
type(X_train)

numpy.ndarray

In [67]:
X_train.shape

(75, 30, 510, 3, 1)

In [68]:
X[0].shape

(30, 510, 3, 1)

In [69]:
Y.shape

(100, 9)

In [70]:
Y.shape[1]

9

## Creating Convolutional Neural Network to Extract Feature Maps for each frames

Reshaping videos landmark data
- face - 468 landmarks
- left-hand - 21 landmarks
- right-hand - 21 landmarks
- to Pass t conv2D layer -> arranging the landmarks for each frame in the shape = (468 + 21 + 21 , 3) = (510 , 3)

In [71]:
def CNN_model(input_shape = (30 , 510 , 3 , 1)):

 """
 Creating a CNN model for feature extraction from individual frames
 Input shape : (timestep , landmarks , coordinates , channels)
 """
 #Defining a Sequential Model
 model = Sequential() 

 #Defining the Model architecture

 #CNN Layer 1
 model.add(TimeDistributed(Conv2D(filters= 16 , kernel_size= (3,3) , padding='same' , activation='relu' , input_shape = input_shape )))
 # model.add(TimeDistributed(MaxPooling2D((1,2))))
 model.add(TimeDistributed(Dropout(0.25)))

 #CNN Layer 2
 model.add(TimeDistributed(Conv2D(filters= 32 , kernel_size= (3,3) , padding='same' , activation='relu')))
 # model.add(TimeDistributed(MaxPooling2D((1,2))))
 model.add(TimeDistributed(Dropout(0.25)))

  #CNN Layer 3
 model.add(TimeDistributed(Conv2D(filters= 64 , kernel_size= (3,3) , padding='same' , activation='relu')))
 # model.add(TimeDistributed(MaxPooling2D((1,2))))
 model.add(TimeDistributed(Dropout(0.25)))

  #CNN Layer 4
 model.add(TimeDistributed(Conv2D(filters= 64 , kernel_size= (3,3) , padding='same' , activation='relu')))
 # model.add(TimeDistributed(MaxPooling2D((1,2))))
 # model.add(TimeDistributed(Dropout(0.25)))

 model.add(TimeDistributed(Flatten()))

 #Fully Connected ANN layers
 
 model.add(TimeDistributed(Dense(256 , activation = 'relu')))
 model.add(TimeDistributed(Dense(128 , activation='relu')))

 return model

In [72]:
#Extracting feature maps of each frame and storing it

def extract_and_store_features(videos , feature_extractor):

 """
 Extract Feature maps from each frame in all videos
 videos Shape : ( num_videos , 30 , 510 , 3 , 1)
 feature_extractor expects shape : (30 , 510 , 3 , 1)
 """

 feature_list = []

 for video in videos:
   frame_features = feature_extractor.predict(video)
   feature_list.append(np.array(frame_features))
 
 return np.array(feature_list)

In [73]:
def create_LSTM_clssifier(num_classes,feature_size = 128 , sequence_length  = 30 ):
 """
 LSTM Model for classification using pre-extracted features from CNN 
 Input Shape : (no_of_Frames , features)
 """

 model = Sequential()

 model.add(Input(shape=(sequence_length , feature_size)))

 model.add(LSTM(256 , return_sequences= True))
 model.add(Dropout(0.3))

 model.add(LSTM(128))
 model.add(Dropout(0.3))

 model.add(Dense(64 , activation='relu'))
 model.add(Dense(32) ,activation = 'relu')
 model.add(Dense(num_classes , activation='softmax'))
 
 return model


In [74]:
def train_CNN_model(X_train , Y_train ,label_size, epochs = 50 , batch_size = 10):

 #Creating CNN model

 feature_extractor = CNN_model()

 #Extract the video Features for all videos

 video_features = extract_and_store_features(X_train , feature_extractor=feature_extractor)

 #Create and train LSTM classifier

 LSTM_classifier = create_LSTM_clssifier(label_size)
 LSTM_classifier.compile(optimizer = 'adam' , loss ='sparse_categorical_crossentropy' , metrics = ['accuracy'])

 LSTM_model_history = LSTM_classifier.fit(video_features , Y_train , epochs = epochs , batch_size =  batch_size , validation_split = 0.2) 

 return feature_extractor , LSTM_classifier , LSTM_model_history

In [76]:
label_size = Y_train.shape[1]

opt_feature_extractor , opt_LSTM_classifier , LSTM_model_history = train_CNN_model(X_train , Y_train , label_size)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Kernel shape must have the same length as input, but received kernel of shape (3, 3, 1, 16) and input of shape (30, 3, 1).