In [6]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import time

In [2]:
def feature_initial(input_list, index, info = np.nan):
    """Build per-axis pairwise-distance features for the selected items.

    Each selected coordinate matrix is processed one column at a time: for
    every unordered pair of points (i < j) the absolute difference
    |c_i - c_j| of that column is taken, and the per-column results are
    concatenated (all pairs of column 0 first, then column 1, ...).  A matrix
    with n points and 2 columns therefore yields 2 * n(n-1)/2 = n(n-1)
    features (e.g. 78 points -> 6006 features).

    Parameters
    ----------
    input_list : sequence of array-like
        One (n_points x n_columns) coordinate matrix per item.
    index : sequence of int
        Positions in ``input_list`` to process (train or test index).  The
        same positions are used with ``.iloc`` on ``info['label']``.
    info : optional
        Labelled table; ``info['label']`` must be a pandas Series.  When
        ``info`` is missing (default ``np.nan``) or has no ``'label'`` entry,
        the result contains features only.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature0 .. feature{k-1}`` and, when labels are available,
        a trailing ``labels`` column.

    Raises
    ------
    IndexError
        If ``info`` provides labels but ``index`` is out of bounds for them.
        (Previously this was swallowed and the labels silently dropped.)
    """

    def axiswise_dist(mat):
        # input: n x d coordinate matrix, output: vector of length d * n(n-1)/2
        coords = np.asarray(mat, dtype=float)
        if coords.ndim == 1:
            # a bare vector is treated as a single coordinate column
            coords = coords[:, np.newaxis]
        # pdist on a single column gives |c_i - c_j| for i < j in row-major
        # upper-triangle order, without materialising the full n x n matrix.
        per_column = [pdist(coords[:, [col]]) for col in range(coords.shape[1])]
        return np.concatenate(per_column)

    pairwise_dist_feature = np.array([axiswise_dist(input_list[i]) for i in index])

    colnames = ['feature'+str(i) for i in range(pairwise_dist_feature.shape[1])]
    df = pd.DataFrame(pairwise_dist_feature,columns=colnames)

    # Labels are optional: only the lookup of the label column is allowed to
    # fail quietly.  Anything that goes wrong afterwards is a real error.
    try:
        label_source = info['label']
    except (TypeError, KeyError, IndexError):
        return df

    label_df = pd.DataFrame(list(label_source.iloc[index]),columns=['labels'])
    return pd.concat([df,label_df],axis=1)

In [3]:
def feature_improved(input_list, index, info=np.nan):
    """Build Euclidean pairwise-distance features for the selected items.

    For every selected coordinate matrix the Euclidean distance between each
    unordered pair of points (i < j) is computed with ``pdist``; for 2-D
    points that is sqrt((p1[0]-p2[0])**2 + (p1[1]-p2[1])**2).  A matrix with
    n points yields n(n-1)/2 features (e.g. 78 points -> 3003 features).

    Parameters
    ----------
    input_list : sequence of array-like
        One (n_points x n_columns) coordinate matrix per item.
    index : sequence of int
        Positions in ``input_list`` to process (train or test index).  The
        same positions are used with ``.iloc`` on ``info['label']``.
    info : optional
        Labelled table; ``info['label']`` must be a pandas Series.  When
        ``info`` is missing (default ``np.nan``) or has no ``'label'`` entry,
        the result contains features only.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature0 .. feature{k-1}`` and, when labels are available,
        a trailing ``labels`` column.

    Raises
    ------
    IndexError
        If ``info`` provides labels but ``index`` is out of bounds for them.
        (Previously this was swallowed and the labels silently dropped.)
    """
    # One condensed distance vector per selected item -> feature matrix.
    pairwise_dist_feature = np.array([pdist(input_list[i]) for i in index])

    colnames = ['feature'+str(i) for i in range(pairwise_dist_feature.shape[1])]
    df = pd.DataFrame(pairwise_dist_feature,columns=colnames)

    # Labels are optional: only the lookup of the label column is allowed to
    # fail quietly.  Anything that goes wrong afterwards is a real error.
    try:
        label_source = info['label']
    except (TypeError, KeyError, IndexError):
        return df

    label_df = pd.DataFrame(list(label_source.iloc[index]),columns=['labels'])
    return pd.concat([df,label_df],axis=1)

In [4]:
#this function is only applied to labeled training data
#SMOTE is not used on test data

def feature_SMOTE(dat_train, random_state=None):
    """Rebalance a labelled training frame with SMOTE followed by random under-sampling.

    Only meant for labelled training data; it must not be applied to test data.

    Parameters
    ----------
    dat_train : pandas.DataFrame
        Feature columns plus a ``labels`` column.
    random_state : optional
        Forwarded to both ``SMOTE`` and ``RandomUnderSampler`` so a run can be
        reproduced.  The default ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        Resampled features renamed to ``feature0 .. feature{k-1}`` followed by
        a ``labels`` column, with a fresh 0..N-1 index.
    """
    feature_train = dat_train.loc[:, dat_train.columns != 'labels']
    label_train = dat_train['labels']

    over = SMOTE(sampling_strategy='auto', random_state=random_state)
    under = RandomUnderSampler(sampling_strategy='auto', random_state=random_state)
    sm = Pipeline(steps = [('o', over), ('u', under)])

    feature_train_sm, label_train_sm = sm.fit_resample(feature_train,label_train)

    # Work on raw values.  NOTE(review): depending on the imbalanced-learn
    # version the resampler may hand back pandas objects that keep their own
    # index / column names - confirm.  In that case
    # pd.DataFrame(frame, columns=...) would select columns rather than rename
    # them, and pd.concat(axis=1) would align rows by index label instead of
    # by position, pairing features with the wrong labels.
    feature_values = np.asarray(feature_train_sm)

    colnames = ['feature'+str(i) for i in range(feature_values.shape[1])]
    df = pd.DataFrame(feature_values,columns=colnames)
    label_df = pd.DataFrame(list(label_train_sm),columns=['labels'])

    # Both frames carry a default RangeIndex, so this concat is positional.
    SMOTE_data = pd.concat([df,label_df],axis=1)
    return SMOTE_data

In [5]:
def feature_PCA(dat_train, dat_test, n_components=0.95):
    """Min-max scale and PCA-project train and test features, timing each stage.

    The scaler and the PCA are fitted on the training features only; the test
    features are then transformed with those already-fitted objects so both
    sets live in the same scaled/projected space.

    Parameters
    ----------
    dat_train, dat_test : pandas.DataFrame
        Feature columns, optionally with a ``labels`` column.  When present,
        ``labels`` is carried over unchanged to the corresponding output.
    n_components : optional
        Passed to ``PCA(n_components=...)``; the default 0.95 is the value the
        function previously hard-coded.

    Returns
    -------
    list
        ``[dat_train_PCA, dat_test_PCA, tm_feature_train_PCA, tm_feature_test_PCA]``
        where the frames have columns ``feature0 .. feature{k-1}`` (plus
        ``labels`` when the input had it) and the times are elapsed seconds
        for the train stage (fit + transform) and the test stage (transform).
    """

    def _split(dat):
        # Separate the feature columns from the optional label column.
        features = dat.loc[:, dat.columns != 'labels']
        labels = dat['labels'] if 'labels' in dat.columns else None
        return features, labels

    def _assemble(components, labels):
        # Wrap projected values in a frame and re-attach labels positionally.
        colnames = ['feature'+str(i) for i in range(components.shape[1])]
        frame = pd.DataFrame(components,columns=colnames)
        if labels is None:
            return frame
        label_df = pd.DataFrame(list(labels),columns=['labels'])
        return pd.concat([frame,label_df],axis = 1)

    #train data transformation
    start = time.time()

    feature_train, label_train = _split(dat_train)

    scaler = MinMaxScaler()
    feature_train_scaled = scaler.fit_transform(feature_train)
    pca = PCA(n_components = n_components, svd_solver='full').fit(feature_train_scaled)
    feature_train_PCA = pca.transform(feature_train_scaled)

    dat_train_PCA = _assemble(feature_train_PCA, label_train)

    end = time.time()
    tm_feature_train_PCA = end-start

    #test data transformation
    start = time.time()

    feature_test, label_test = _split(dat_test)
    # transform only: refitting the scaler here would rescale the test set
    # with its own min/max and feed the PCA data it was not fitted for.
    feature_test_scaled = scaler.transform(feature_test)
    feature_test_PCA = pca.transform(feature_test_scaled)

    dat_test_PCA = _assemble(feature_test_PCA, label_test)

    end = time.time()
    tm_feature_test_PCA = end-start

    return [dat_train_PCA, dat_test_PCA, tm_feature_train_PCA, tm_feature_test_PCA]