In [283]:
# Basic modules
import traceback
import numpy as np
import pandas as pd

# Data operation modules
from sklearn.model_selection import StratifiedKFold
from sklearn import cross_validation
from sklearn.model_selection import ParameterGrid

# Classifier models
from sklearn.svm import SVC

# Evaluation metrics
import sklearn.metrics as sm

In [284]:
# Oversampling by SMOTE
from sklearn.neighbors import NearestNeighbors as kNN
def run_SMOTE(X, y, k, r):
    """Oversample the minority class of a binary data set (SMOTE).

    The class with fewer samples is treated as the "positive" (minority)
    class; on a tie the second label returned by ``np.unique`` is used.
    For every positive sample and each of its ``k`` nearest positive
    neighbours, ``get_new_pos`` is asked for ``r`` synthetic samples.  The
    synthetic samples are appended to the original data and the result is
    shuffled.

    Parameters
    ----------
    X : numpy array, indexed as ``X[mask, :]``
        Feature matrix, one sample per row.
    y : numpy array
        Labels, one per row of ``X``; must contain exactly two distinct values.
    k : int
        Requested number of nearest neighbours per positive sample.
    r : int
        Requested number of synthetic samples per (sample, neighbour) pair.

    Returns
    -------
    X_new, y_new : numpy arrays
        Shuffled original data plus the synthetic positive samples.
    k, r : int
        The values actually used after being constrained (see below).

    Raises
    ------
    Exception
        If ``y`` does not contain exactly two distinct labels.
    """
    # obtain positive (minority) and negative (majority) data
    v = np.unique(y)
    if len(v) != 2:
        raise Exception("Not a binary-class!")
    if np.sum(y == v[0]) >= np.sum(y == v[1]):
        pos_value, neg_value = v[1], v[0]
    else:
        pos_value, neg_value = v[0], v[1]
    pos_mask = (y == pos_value)
    neg_mask = (y == neg_value)
    X_pos = X[pos_mask, :]
    X_neg = X[neg_mask, :]
    y_pos = y[pos_mask]
    y_neg = y[neg_mask]
    n_pos = X_pos.shape[0]
    n_neg = X_neg.shape[0]

    # constrain hyper-parameters to their suitable ranges
    if n_pos == 0:
        # Defensive only: two distinct labels imply at least one positive.
        raise Exception("No positive samples!")
    elif k + 1 > n_pos:
        # Not enough positives to have k distinct neighbours each
        # (r is left untouched in this branch).
        k = n_pos - 1
    else:
        # Shrink r first, then k (neither below 1), while the oversampled
        # minority would reach or exceed the majority class size.
        while k > 1:
            while r > 1:
                if n_pos + n_pos * k * r >= n_neg:
                    r = r - 1
                else:
                    break
            if n_pos + n_pos * k * r >= n_neg:
                k = k - 1
            else:
                break

    # find k nearest neighbors of each positive sample
    kNN_model = kNN(n_neighbors=k + 1)
    kNN_model.fit(X_pos)
    # kneighbors already returns neighbour *indices* ordered by increasing
    # distance.  They must be used as-is: arg-sorting them would replace the
    # indices by a permutation of 0..k and pick unrelated samples.
    i_neighbor = kNN_model.kneighbors(X_pos, return_distance=False)
    # Column 0 is the closest point to the query, i.e. normally the sample
    # itself (distance 0), so it is dropped.
    i_neighbor = i_neighbor[:, 1:]

    # generate new positive samples and corresponding labels
    new_pos_mat = []
    for i in range(n_pos):
        now_sample = X_pos[i, :]
        for j in range(k):
            new_sample_mat = get_new_pos(now_sample, X_pos[i_neighbor[i, j], :], r)
            new_pos_mat.extend(new_sample_mat)
    new_pos_label = y_pos[0] * np.ones((len(new_pos_mat)))
    # Force a 2-D (n_new, n_features) shape so that concatenation also works
    # when no synthetic sample was generated (e.g. a single positive sample).
    new_pos_arr = np.asarray(new_pos_mat).reshape(-1, X_pos.shape[1])

    # combine the newly-generated ones to the original data
    X_new = np.concatenate((X_neg, X_pos, new_pos_arr), axis=0)
    y_new = np.concatenate((y_neg, y_pos, new_pos_label), axis=0)
    i_shuffle = np.random.permutation(len(y_new))
    X_new = X_new[i_shuffle, :]
    y_new = y_new[i_shuffle]

    return X_new, y_new, k, r

In [285]:
def get_new_pos(point, neighbor, r):
    """Generate ``r`` synthetic samples between ``point`` and ``neighbor``.

    Each synthetic sample is ``point + gap * (neighbor - point)`` where
    ``gap`` is drawn independently per feature from ``np.random.rand``
    (uniform on [0, 1)), so every coordinate lies between the corresponding
    coordinates of ``point`` and ``neighbor``.

    Parameters
    ----------
    point : 1-D numpy array
        The sample to oversample.
    neighbor : 1-D numpy array
        One of its neighbours, same length as ``point``.
    r : int
        Number of synthetic samples to generate.

    Returns
    -------
    list of 1-D numpy arrays
        ``r`` new samples (an empty list when ``r`` is 0).
    """
    # The direction must be (neighbor - point): the reverse sign would
    # extrapolate away from the neighbour instead of interpolating towards it.
    direction = neighbor - point
    new_points = []
    for _ in range(r):
        gap = np.random.rand(point.shape[0])
        new_points.append(point + direction * gap)
    return new_points

In [286]:
if __name__ == '__main__':
    # Demo: build a heavily imbalanced random data set (10 positives vs
    # 3000 negatives, 4 features) and oversample it with run_SMOTE.

    # Seed the global RNG so the demo output is reproducible between runs.
    np.random.seed(0)

    N_pos = 10
    N_neg = 3000
    N = N_pos + N_neg
    # Random features in [0, 10], rounded to two decimals.
    a = np.around(np.random.rand(N, 4) * 1000) / 100
    # Labels: 0 for the majority class, 1 for the minority class.
    b = np.concatenate((np.zeros(N_neg), np.ones(N_pos)), axis=0)
    # Shuffle samples and labels with the same permutation.
    i = np.random.permutation(N)
    a = a[i, :]
    b = b[i]
    a_new, b_new, k_new, r_new = run_SMOTE(a, b, 3, 2)
    # A single pre-formatted argument prints the same text whether `print`
    # is the Python 2 statement or the Python 3 function.
    print("{0} {1} {2} {3}".format(a_new, b_new, k_new, r_new))
    

[[ 8.98  7.9   8.95  3.96]
 [ 8.25  3.01  5.94  5.6 ]
 [ 3.14  8.38  2.1   7.29]
 ..., 
 [ 8.9   1.76  4.86  3.82]
 [ 7.08  6.7   2.63  5.02]
 [ 3.63  8.19  8.8   4.  ]] [ 0.  0.  0. ...,  0.  0.  0.] 3 2
