In [1]:
import pandas as pd
import numpy as np
import pyreadr
import import_ipynb
import time

In [2]:
import os
# R_USER must be present in the environment before rpy2 is imported in the next cell.
# Respect a value already exported by the shell and only fall back to the original
# author's account name, so the notebook does not clobber another machine's setting.
# NOTE(review): presumably only needed by rpy2 on Windows — confirm.
os.environ.setdefault("R_USER", "Jiyoung Sim")  # user name

In [3]:
from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects import numpy2ri

In [4]:
### Step 0: Provide directories for training images. Training images and training fiducial points live in different subfolders.
train_dir = './data/train_set/'  # This will be modified for different data sets.

# Derive the image folder, the fiducial-point folder and the label file from train_dir.
train_image_dir, train_pt_dir, train_label_path = (
    train_dir + leaf for leaf in ('images/', 'points/', 'label.csv')
)

In [5]:
### Step 1: set up controls for evaluation experiments.
# Feature-construction switches
run_feature_train = True  # process features for training set
run_feature_test = True   # process features for test set
# Model switches
run_train = True          # fit the baseline model
run_test = True           # run evaluation on an independent test set

In [6]:
### Step 2: import data and train-test split 
from sklearn.model_selection import train_test_split
# Read the label table from the path configured in Step 0.
info = pd.read_csv(train_label_path)

# Hold out 20% of the row positions; the fixed seed keeps the split reproducible.
train_idx_py, test_idx_py = train_test_split(
    range(len(info)),
    test_size=0.2,
    random_state=0,
)

# R indexes from 1, so shift the 0-based Python positions by one for the R side.
train_idx_r = [pos + 1 for pos in train_idx_py]
test_idx_r = [pos + 1 for pos in test_idx_py]

In [7]:
### Step 3: construct features and responses
# Evaluate ./lib/feature.R inside the embedded R session and keep element 0 of what
# source() returns. The result is later called as
# feature(fiducial_pt_list, <1-based index list>, info_rdf).
# NOTE(review): element 0 is presumably the feature-construction function defined
# last in feature.R — confirm against that script.
feature = robjects.r(
    '''
    source('./lib/feature.R')
    '''
)[0]

In [8]:
# Count only the .mat files: readMat is called for indices 1..n_files, so a stray
# non-.mat entry in the folder would make it request a file that does not exist.
n_files = sum(1 for name in os.listdir(train_pt_dir) if name.lower().endswith('.mat'))

def readMat(index):
    """Load the fiducial points stored for one file and convert them to an R matrix.

    Reads ``train_pt_dir + '<index zero-padded to 4 digits>.mat'``, rounds the
    coordinates with ``np.round`` and wraps them in an R matrix of the same shape.

    Parameters
    ----------
    index : int
        File number used to build the file name (``1`` -> ``0001.mat``).

    Returns
    -------
    The R matrix created by ``robjects.r.matrix``. As a side effect the same
    matrix is assigned to the name ``mat`` in the R session.

    Raises
    ------
    KeyError
        If the file contains neither ``faceCoordinatesUnwarped`` nor
        ``faceCoordinates2``.
    """
    import scipy.io
    numpy2ri.activate()  # enable numpy -> R conversion for the calls below

    # Parse the file once and take whichever of the two known keys is present.
    contents = scipy.io.loadmat(train_pt_dir + '{:04n}.mat'.format(index))
    try:
        coords = contents['faceCoordinatesUnwarped']
    except KeyError:
        coords = contents['faceCoordinates2']

    mat = np.round(coords)
    nr, nc = mat.shape
    mat_r = robjects.r.matrix(mat, nrow=nr, ncol=nc)
    robjects.r.assign("mat", mat_r)  # also expose the matrix to R under the name "mat"
    return mat_r

# Load the fiducial points of every file, numbered 1..n_files, in order.
fiducial_pt_list = list(map(readMat, range(1, n_files + 1)))
# TODO: save fiducial_pt_list.csv (nothing is written to disk here yet)

In [9]:
from rpy2.robjects import pandas2ri
# Switch on rpy2's automatic pandas conversion.
pandas2ri.activate()
# Convert the label table so it can be passed to the R `feature` function.
# NOTE(review): py2ri appears to be an rpy2 2.x API; confirm the pinned rpy2 version
# before upgrading.
info_rdf = pandas2ri.py2ri(info)

In [10]:
# %load_ext rpy2.ipython
as_factor = robjects.r('''as.factor''')

# --- training features ---------------------------------------------------------
if run_feature_train:
    start = time.time()
    dat_train_r = feature(fiducial_pt_list, train_idx_r, info_rdf)
    end = time.time()
    tm_feature_train = end - start  # seconds spent inside feature() only
    # The pandas copy is taken before the last column of the R frame becomes a factor.
    dat_train_py = pandas2ri.ri2py_dataframe(dat_train_r)
    dat_train_r[-1] = as_factor(dat_train_r[-1])
    dat_train_py.to_csv('dat_train_py.csv', index=False)
elif os.path.exists('dat_train_py.csv'):
    # Flag switched off: reuse the features written by an earlier run so that later
    # cells reading dat_train_py do not fail with a NameError.
    dat_train_py = pd.read_csv('dat_train_py.csv')

# --- test features -------------------------------------------------------------
if run_feature_test:
    start = time.time()
    dat_test_r = feature(fiducial_pt_list, test_idx_r, info_rdf)
    end = time.time()
    tm_feature_test = end - start  # seconds spent inside feature() only
    # Same ordering as above: convert to pandas first, then factorise the R column.
    dat_test_py = pandas2ri.ri2py_dataframe(dat_test_r)
    dat_test_r[-1] = as_factor(dat_test_r[-1])
    dat_test_py.to_csv('dat_test_py.csv', index=False)
elif os.path.exists('dat_test_py.csv'):
    # Flag switched off: fall back to the cached test features from an earlier run.
    dat_test_py = pd.read_csv('dat_test_py.csv')

  res = PandasDataFrame.from_items(items)


In [13]:
### Step 4: Train a classification model with training features and responses
# train
# cv done inside train_baseline.gbm_fn (per the original author's note)
# Input: the feature columns and the response column of the training frame.
# Output: a trained model, also saved to disk
baseline_dir = 'baseline_train.sav'  # path of the saved model file (a file, despite the name)
if run_train:
    import train_baseline
    # sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23;
    # prefer the standalone package and fall back only on old installations.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib

    # Every column but the last is passed as features, the last column as the response.
    baseline = train_baseline.gbm_fn(dat_train_py.iloc[:, :-1], dat_train_py.iloc[:, -1])
    joblib.dump(baseline, baseline_dir)  # save the model to disk

# test
# Input: the test frame (features plus response column) and the saved-model path
# Output: the value returned by test_baseline.test_clf
if run_test:
    import test_baseline
    baseline_acc = test_baseline.test_clf(dat_test_py, baseline_dir)



importing Jupyter notebook from test_baseline.ipynb


In [15]:
# Display the value returned by test_baseline.test_clf for the held-out split.
# Only defined when run_test is True.
baseline_acc

0.45

In [13]:
# all_features = pd.concat([dat_train_py, dat_test_py]).reset_index(drop=True)

In [14]:
# feature_pca = robjects.r(
#     '''
#     source('./lib/feature_pca.R')
#     '''
# )[0]

In [17]:
# pca_thre_r = feature_pca(all_features, info_rdf)
# pca_thre_py = pandas2ri.ri2py_dataframe(pca_thre_r)
# pca_thre_r[-1] = as_factor(pca_thre_r[-1])

In [None]:
### Summarize Running Time

In [7]:
import feature_cnn
# Build the data generators for the CNN from the same train/test row positions used
# for the baseline. Both read from train_image_dir; only the index list and the
# boolean differ. Needs train_idx_py, test_idx_py, train_image_dir and info from the
# earlier cells.
# NOTE(review): the second positional argument (True/False) presumably switches
# training-specific behaviour inside feature_cnn.dat_generator — confirm there.
train_generator = feature_cnn.dat_generator(train_idx_py, True, train_image_dir, info)
test_generator = feature_cnn.dat_generator(test_idx_py, False, train_image_dir, info)

importing Jupyter notebook from feature_cnn.ipynb
Found 2000 validated image filenames belonging to 22 classes.
Found 500 validated image filenames belonging to 22 classes.
