## Imports / Globals

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle
import pandas as pd
from os import listdir
from os.path import isfile, join
from tqdm import tqdm
import numpy as np
from itertools import product

# Seed NumPy's global RNG so the np.random.choice actor draws later in the
# notebook are repeatable after a fresh Restart & Run All
np.random.seed(42)

In [2]:
# Directory holding the raw skeleton files. It is only read when X.pkl cannot
# be loaded and X has to be rebuilt from the raw data.
# NOTE: machine-specific absolute path -- change it to match your environment.
# path = '/Users/thomas/Downloads/nturgb+d_skeletons'
path = 'D:\\Datasets\\Motion Privacy\\NTU RGB+D 120\\Skeleton Data'

## Data organization

### Input format

[J0 X, J0 Y, J0 Z, J1 X, J1 Y, J1 Z, ..., J24 Z] (25 joints × 3 coordinates = 75 values per frame)

### Output format
0 = Female, 1 = Male

In [3]:
# Read the per-actor gender table from disk
gender_table = pd.read_csv('Genders.csv')

# Encode the labels numerically: 'M' becomes 1, 'F' becomes 0
gender_table = gender_table.replace('M', 1)
gender_table = gender_table.replace('F', 0)

# Turn the table into a dict keyed by the 'P' column; each value is the list
# of that row's remaining column values
Genders = gender_table.set_index('P').T.to_dict('list')

In [4]:
# Attempt to load X from the pickle cache before regenerating it from the raw files.
# X maps an actor id (characters 8-12 of the file name) to an array with one row
# per parsed frame and 75 columns (25 joints x 3 coordinates).
X = {}
try:
    print('Attempting to load X from pickle')
    # NOTE: pickle.load can execute arbitrary code -- only load a cache you created yourself
    with open('X.pkl', 'rb') as f:
        X = pickle.load(f)
    print('X loaded from pickle')
except Exception:
    # Missing, unreadable or corrupt cache -> rebuild. `Exception` (rather than a
    # bare except) lets Ctrl-C / kernel interrupts propagate instead of silently
    # kicking off the long regeneration.
    print('Could not load X and Y, generating them now')

    # Read the files
    files = [f for f in listdir(path) if isfile(join(path, f))]

    # Get stats for each file based on name (fixed-width fields of 4 characters)
    files_ = []
    for file in files:
        data = {'file': file,
                's': file[0:4],
                'c': file[4:8],
                'p': file[8:12],
                'r': file[12:16],
                'a': file[16:20]
                }
        files_.append(data)

    # Generate X
    for file_ in tqdm(files_, desc='Files Parsed', position=0):
        file = join(path, file_['file'])
        try:
            # The context manager closes the handle; previously one handle leaked per file
            with open(file, 'r') as data:
                lines = data.readlines()
            # First line of the file is the number of frames
            frames_count = int(lines[0].replace('\n', ''))
            file_['frames'] = frames_count
        except UnicodeDecodeError: # .DS_Store file
            print('UnicodeDecodeError: ', file)
            continue

        # Get P and add to X if not already there
        p = file_['p']
        if p not in X:
            X[p] = []

        # Skip file if the first frame does not contain exactly one actor
        # (also skips files that contain nothing after the frame count)
        if len(lines) < 2 or lines[1].replace('\n', '') != '1': continue

        # Index of the next unread line. Walking a cursor avoids the O(n) cost
        # that lines.pop(0) paid for every single line.
        cursor = 1
        for _ in tqdm(range(frames_count), desc='Frames Parsed', position=1, leave=False):
            try:
                # Actor count for this frame; parsed only to validate the framing
                body_count = int(lines[cursor].replace('\n', ''))

                # lines[cursor + 1] is the actor info line, which is not used

                # Get joint count
                joint_count = int(lines[cursor + 2].replace('\n', ''))
                cursor += 3

                # Get joint info: keep the first three fields (X, Y, Z) of each joint
                d = []
                for j in range(joint_count):
                    joint = lines[cursor].replace('\n', '').split(' ')
                    cursor += 1
                    d.extend(joint[0:3])
            except (ValueError, IndexError):
                # Unexpected layout (e.g. a frame with a different actor count) or
                # truncated file: stop parsing this file, keep the frames read so far
                break

            # Skip if not 25 joints
            if len(d) != 75: continue

            # Append the frame (still as strings; converted to float16 below)
            X[p].append(np.array(d))

    # Convert to numpy arrays. The reshape keeps actors without any usable frame
    # at shape (0, 75) instead of (0,), so they can still be concatenated later.
    for p in X:
        X[p] = np.array(X[p], dtype=np.float16).reshape(-1, 75)

    print('X Generated, saving to pickle...')

    # Save the data
    with open('X.pkl', 'wb') as f:
        pickle.dump(X, f)

    print('X Saved to pickle')

# Print Lengths
print('\n\nLengths:')
for p in X:
    print(p, X[p].shape)


Attempting to load X from pickle
X loaded from pickle


Lengths:
P001 (145215, 75)
P002 (63629, 75)
P003 (108274, 75)
P004 (50110, 75)
P005 (19615, 75)
P006 (67589, 75)
P007 (424687, 75)
P008 (567014, 75)
P009 (53245, 75)
P010 (52804, 75)
P011 (63254, 75)
P012 (27513, 75)
P013 (67912, 75)
P014 (26187, 75)
P015 (291737, 75)
P016 (267383, 75)
P017 (296334, 75)
P018 (146483, 75)
P019 (327106, 75)
P020 (53474, 75)
P021 (86379, 75)
P022 (30459, 75)
P023 (27174, 75)
P024 (26140, 75)
P025 (274698, 75)
P026 (27311, 75)
P027 (112695, 75)
P028 (93580, 75)
P029 (25443, 75)
P030 (21698, 75)
P031 (23599, 75)
P032 (27625, 75)
P033 (24038, 75)
P034 (23594, 75)
P035 (22215, 75)
P036 (22570, 75)
P038 (27434, 75)
P037 (110918, 75)
P039 (46711, 75)
P040 (19980, 75)
P041 (165442, 75)
P042 (69339, 75)
P043 (131743, 75)
P044 (112484, 75)
P045 (18725, 75)
P046 (34723, 75)
P047 (16625, 75)
P048 (63257, 75)
P049 (46325, 75)
P050 (37917, 75)
P051 (54275, 75)
P052 (21970, 75)
P053 (19426, 75)
P054 (18655, 75)
P0

In [10]:
# Default Male/Female split: how many male and female actors split_data draws
# into the training set when it is called with these values
MALES = 30
FEMALES = 15

# Split into train and test
def _stack_by_gender(male_actors, female_actors):
    """Concatenate the frames of the given actors; label males 1 and females 0 (int8)."""
    x_male = np.concatenate([X[actor] for actor in male_actors])
    x_female = np.concatenate([X[actor] for actor in female_actors])
    features = np.concatenate([x_male, x_female])
    labels = np.concatenate([np.ones(len(x_male), dtype=np.int8),
                             np.zeros(len(x_female), dtype=np.int8)])
    return features, labels


def split_data(males, females):
    """Build an actor-disjoint train/test split of the skeleton frames in X.

    Actors whose Genders entry equals [1] are treated as male, every other
    actor as female. `males` male and `females` female actors are drawn at
    random (without replacement) for training; all remaining actors form the
    test set.

    Side effects:
        * draws from NumPy's global random state (four np.random.choice calls),
        * prints the shapes of the four returned arrays,
        * overwrites 'actor split.txt' with the attacking (training) actors and
          a further random division of the test actors into
          "defending/retargeting" and "remaining" groups. That division is
          only written to the file; it does not change the returned arrays.

    Args:
        males: number of male actors placed in the training set.
        females: number of female actors placed in the training set.

    Returns:
        (train_x, train_y, test_x, test_y). Labels are int8, 1 = male and
        0 = female, for both train and test (train labels used to be float64).

    Raises:
        ValueError: if more training actors are requested than exist for a
            gender, or if one gender has no actors left for the test set.
    """
    # Partition the actor ids by gender
    male_ids = []
    female_ids = []
    for p in Genders:
        if Genders[p] == [1]:
            male_ids.append(p)
        else:
            female_ids.append(p)

    # Choose the training ("attacking") actors
    train_males = np.random.choice(male_ids, males, replace=False)
    train_females = np.random.choice(female_ids, females, replace=False)
    train_x, train_y = _stack_by_gender(train_males, train_females)

    # Every actor not used for training goes to the test set
    train_male_set = set(train_males)
    train_female_set = set(train_females)
    test_males = [x for x in male_ids if x not in train_male_set]
    test_females = [x for x in female_ids if x not in train_female_set]
    test_x, test_y = _stack_by_gender(test_males, test_females)

    # Print shapes
    print('Train X Shape: ', train_x.shape)
    print('Train Y Shape: ', train_y.shape)
    print('Test X Shape: ', test_x.shape)
    print('Test Y Shape: ', test_y.shape)

    # Split the test actors into retarget and remaining groups. The sample size
    # is capped at the number of available test actors: with replace=False,
    # np.random.choice raises when asked for more items than the population holds.
    retarget_m = np.random.choice(test_males, min(males, len(test_males)), replace=False)
    retarget_f = np.random.choice(test_females, min(females, len(test_females)), replace=False)
    retarget_male_set = set(retarget_m)
    retarget_female_set = set(retarget_f)
    remaining_m = np.array([x for x in test_males if x not in retarget_male_set])
    remaining_f = np.array([x for x in test_females if x not in retarget_female_set])

    # Save actor split to txt file
    with open('actor split.txt', 'w') as split_file:
        split_file.write(f"Attacking Male Actors: {train_males}\n")
        split_file.write(f"\nAttacking Female Actors: {train_females}\n")
        split_file.write(f"\nDefending/Retargeting Male Actors: {retarget_m}\n")
        split_file.write(f"\nDefending/Retargeting Female Actors: {retarget_f}\n")
        split_file.write(f"\nRemaining Male Actors: {remaining_m}\n")
        split_file.write(f"\nRemaining Female Actors: {remaining_f}\n")

    return train_x, train_y, test_x, test_y

# Build the default split once; the call also prints the array shapes and
# writes 'actor split.txt'
train_x, train_y, test_x, test_y = split_data(MALES, FEMALES)

Train X Shape:  (3176693, 75)
Train Y Shape:  (3176693,)
Test X Shape:  (3244742, 75)
Test Y Shape:  (3244742,)


[array(['P100', 'P077', 'P004', 'P014', 'P040', 'P019', 'P074', 'P087',
       'P025', 'P059', 'P066', 'P048', 'P105', 'P003', 'P021', 'P095',
       'P049', 'P013', 'P043', 'P008', 'P057', 'P083', 'P068', 'P101',
       'P072', 'P073', 'P085', 'P033', 'P096', 'P046'], dtype='<U4'), array(['P016', 'P089', 'P056', 'P090', 'P028', 'P012', 'P015', 'P061',
       'P053', 'P081', 'P094', 'P102', 'P082', 'P091', 'P029'],
      dtype='<U4'), array(['P099', 'P080', 'P060', 'P084', 'P007', 'P030', 'P067', 'P054',
       'P009', 'P023', 'P076', 'P037', 'P098', 'P097', 'P047', 'P103',
       'P104', 'P010', 'P034', 'P086', 'P020', 'P031', 'P106', 'P045',
       'P064', 'P071', 'P063', 'P062', 'P051', 'P055'], dtype='<U4'), array(['P026', 'P011', 'P038', 'P017', 'P039', 'P022', 'P002', 'P050',
       'P075', 'P001', 'P070', 'P032', 'P024', 'P069', 'P092'],
      dtype='<U4'), array(['P005', 'P006', 'P027', 'P035', 'P041', 'P042', 'P044', 'P052',
       'P058', 'P065', 'P079', 'P088'], dtype='<U4')

## Parameter Optimization

In [5]:
# Candidate values for each random-forest hyperparameter
n_estimators = [50, 100, 150, 500]
max_depth = [10, 15, 20]
min_samples_split = [2, 3, 5]

# Every (n_estimators, max_depth, min_samples_split) combination, with the
# last hyperparameter varying fastest
param_grid = [
    (n_est, depth, min_split)
    for n_est in n_estimators
    for depth in max_depth
    for min_split in min_samples_split
]

print(f'Grid Search will be done on {len(param_grid)} parameters...')

Grid Search will be done on 36 parameters...


In [6]:
# Results per split, keyed '<males>-<females>' -> (best_params, test accuracy).
# Re-running this cell resets the dict.
res = {}
def train_split(MALES, FEMALES):
    """Grid-search a random forest for one actor split and record the result.

    Calls split_data(MALES, FEMALES), fits one RandomForestClassifier per entry
    of param_grid, keeps the parameters with the highest test accuracy, refits
    a final model with them, pickles it to 'clf {MALES}.{FEMALES}.pkl' and
    stores (best_params, score) in res['{MALES}-{FEMALES}'].

    NOTE: the hyperparameters are selected using accuracy on the same test set
    whose accuracy is reported at the end, so the reported figure is
    optimistically biased; a separate validation split would be needed for an
    unbiased estimate.

    Args:
        MALES: number of male actors used for training.
        FEMALES: number of female actors used for training.

    Raises:
        ValueError: if param_grid is empty.
    """
    if not param_grid:
        raise ValueError('param_grid is empty; nothing to search over')

    # Get train and test data
    train_x, train_y, test_x, test_y = split_data(MALES, FEMALES)

    # Initialize variables to store the best hyperparameters and their performance.
    # Starting from -inf guarantees best_params is set after the first candidate,
    # even if its accuracy is exactly 0.
    best_score = float('-inf')
    best_params = None

    # Train and evaluate the random forest model for each combination of hyperparameters
    for params in tqdm(param_grid):
        print(f'Training with {params}...')
        n_est, m_dep, m_sam = params
        clf = RandomForestClassifier(n_estimators=n_est, max_depth=m_dep, min_samples_split=m_sam, n_jobs=-1, verbose=0)
        clf.fit(train_x, train_y)
        print('Fitting done, evaluating...')
        pred_y = clf.predict(test_x)
        score = accuracy_score(test_y, pred_y)
        print(f'Accuracy: {score}')

        # Update the best hyperparameters if the current model is better
        if score > best_score:
            print('New best score! Saving params...')
            best_score = score
            best_params = params

    # Train the final random forest model using the best hyperparameters.
    # n_jobs=-1 matches the search above; it only parallelises the fit.
    best_n_est, best_m_dep, best_m_sam = best_params
    clf = RandomForestClassifier(n_estimators=best_n_est, max_depth=best_m_dep, min_samples_split=best_m_sam, n_jobs=-1)
    clf.fit(train_x, train_y)

    # Save the final random forest model to a pickle file
    with open(f'clf {MALES}.{FEMALES}.pkl', 'wb') as f:
        pickle.dump(clf, f)

    # Evaluate the performance of the final model on the test set
    pred_y = clf.predict(test_x)
    score = accuracy_score(test_y, pred_y)
    print("Best parameters: ", best_params)
    print("Test accuracy: ", score)

    res[f'{MALES}-{FEMALES}'] = (best_params, score)

### Runtime was just under 10 hours on a Xeon CPU

In [7]:
# (male, female) training-actor counts to run the grid search for, in order
splits = [(10, 5), (20, 10), (30, 15)]
for n_males, n_females in splits:
    train_split(n_males, n_females)

Train X Shape:  (523907, 75)
Train Y Shape:  (523907,)
Test X Shape:  (5897528, 75)
Test Y Shape:  (5897528,)


  0%|          | 0/36 [00:00<?, ?it/s]

Training with (50, 10, 2)...
Fitting done, evaluating...


  3%|▎         | 1/36 [00:18<10:36, 18.17s/it]

Accuracy: 0.6621036814068538
New best score! Saving params...
Training with (50, 10, 3)...
Fitting done, evaluating...


  6%|▌         | 2/36 [00:36<10:14, 18.08s/it]

Accuracy: 0.6584492689140263
Training with (50, 10, 5)...
Fitting done, evaluating...


  8%|▊         | 3/36 [00:54<09:58, 18.14s/it]

Accuracy: 0.6577325279337377
Training with (50, 15, 2)...
Fitting done, evaluating...


 11%|█         | 4/36 [01:16<10:35, 19.87s/it]

Accuracy: 0.6562134168756808
Training with (50, 15, 3)...
Fitting done, evaluating...


 14%|█▍        | 5/36 [01:39<10:48, 20.91s/it]

Accuracy: 0.6569111668482116
Training with (50, 15, 5)...
Fitting done, evaluating...


 17%|█▋        | 6/36 [02:02<10:43, 21.46s/it]

Accuracy: 0.6586956433271703
Training with (50, 20, 2)...
Fitting done, evaluating...


 19%|█▉        | 7/36 [02:27<10:59, 22.73s/it]

Accuracy: 0.6551236382430062
Training with (50, 20, 3)...
Fitting done, evaluating...


 22%|██▏       | 8/36 [02:52<10:56, 23.46s/it]

Accuracy: 0.6557801845112053
Training with (50, 20, 5)...
Fitting done, evaluating...


 25%|██▌       | 9/36 [03:17<10:46, 23.93s/it]

Accuracy: 0.6554842978278357
Training with (100, 10, 2)...
Fitting done, evaluating...


 28%|██▊       | 10/36 [03:50<11:31, 26.58s/it]

Accuracy: 0.6609537080620898
Training with (100, 10, 3)...
Fitting done, evaluating...


 31%|███       | 11/36 [04:22<11:50, 28.42s/it]

Accuracy: 0.6598054303430183
Training with (100, 10, 5)...
Fitting done, evaluating...


 33%|███▎      | 12/36 [04:55<11:53, 29.75s/it]

Accuracy: 0.6606401868715164
Training with (100, 15, 2)...
Fitting done, evaluating...


 36%|███▌      | 13/36 [05:36<12:43, 33.21s/it]

Accuracy: 0.6591660098943151
Training with (100, 15, 3)...
Fitting done, evaluating...


 39%|███▉      | 14/36 [06:17<13:05, 35.68s/it]

Accuracy: 0.6575690696169649
Training with (100, 15, 5)...
Fitting done, evaluating...


 42%|████▏     | 15/36 [06:59<13:05, 37.40s/it]

Accuracy: 0.658955243620717
Training with (100, 20, 2)...
Fitting done, evaluating...


 44%|████▍     | 16/36 [07:47<13:32, 40.63s/it]

Accuracy: 0.6563110849155782
Training with (100, 20, 3)...
Fitting done, evaluating...


 47%|████▋     | 17/36 [08:34<13:30, 42.64s/it]

Accuracy: 0.6584292605308529
Training with (100, 20, 5)...
Fitting done, evaluating...


 50%|█████     | 18/36 [09:20<13:02, 43.50s/it]

Accuracy: 0.6576719940965096
Training with (150, 10, 2)...
Fitting done, evaluating...


 53%|█████▎    | 19/36 [10:08<12:44, 44.98s/it]

Accuracy: 0.6603722780120755
Training with (150, 10, 3)...
Fitting done, evaluating...


 56%|█████▌    | 20/36 [10:53<11:58, 44.92s/it]

Accuracy: 0.6607080118992229
Training with (150, 10, 5)...
Fitting done, evaluating...


 58%|█████▊    | 21/36 [11:38<11:14, 44.96s/it]

Accuracy: 0.6624879101888113
New best score! Saving params...
Training with (150, 15, 2)...
Fitting done, evaluating...


 61%|██████    | 22/36 [12:35<11:19, 48.54s/it]

Accuracy: 0.6587853419263122
Training with (150, 15, 3)...
Fitting done, evaluating...


 64%|██████▍   | 23/36 [13:32<11:05, 51.16s/it]

Accuracy: 0.6576419815217495
Training with (150, 15, 5)...
Fitting done, evaluating...


 67%|██████▋   | 24/36 [14:30<10:35, 52.99s/it]

Accuracy: 0.6596068725744074
Training with (150, 20, 2)...
Fitting done, evaluating...


 69%|██████▉   | 25/36 [15:34<10:22, 56.57s/it]

Accuracy: 0.6574101894895624
Training with (150, 20, 3)...
Fitting done, evaluating...


 72%|███████▏  | 26/36 [16:39<09:50, 59.07s/it]

Accuracy: 0.6575555046114236
Training with (150, 20, 5)...
Fitting done, evaluating...


 75%|███████▌  | 27/36 [17:44<09:06, 60.69s/it]

Accuracy: 0.6572094274075511
Training with (500, 10, 2)...
Fitting done, evaluating...


 78%|███████▊  | 28/36 [20:08<11:26, 85.81s/it]

Accuracy: 0.6610198374641036
Training with (500, 10, 3)...
Fitting done, evaluating...


 81%|████████  | 29/36 [22:40<12:19, 105.59s/it]

Accuracy: 0.6617752387101851
Training with (500, 10, 5)...
Fitting done, evaluating...


 83%|████████▎ | 30/36 [25:15<12:01, 120.30s/it]

Accuracy: 0.6606400173089471
Training with (500, 15, 2)...


In [None]:
# Persist the per-split results collected in `res` to disk
with open('results.pkl', 'wb') as results_file:
    pickle.dump(res, results_file)