In [98]:
import pandas as pd
import csv
import numpy as np
import sklearn as sk
from sklearn import metrics

In [129]:
def _max_min_selection(dist, k):
    """Pick up to ``k`` row indices from a square distance matrix (Kennard-Stone rule).

    The two samples that are furthest apart are taken first; after that, the
    sample whose distance to its nearest already-chosen sample is largest is
    added, until ``k`` indices have been chosen or no samples are left.

    Parameters
    ----------
    dist : numpy.ndarray
        Square matrix where ``dist[i, j]`` is the distance between samples i and j.
    k : int
        Number of indices wanted.

    Returns
    -------
    list of int
        The chosen indices, in the order they were selected.
    """
    n = dist.shape[0]

    # argmax works on the flattened matrix; unravel_index converts that flat
    # position back to the (row, column) pair, i.e. the two furthest samples.
    first, second = np.unravel_index(np.argmax(dist, axis=None), dist.shape)
    chosen = [int(first)]
    if second != first:
        # first == second only happens when every distance is zero.
        chosen.append(int(second))

    # nearest[j] = distance from sample j to the closest chosen sample.
    # Chosen samples are masked with -inf so argmax can never pick them again.
    nearest = np.minimum(dist[:, first], dist[:, second]).astype(float)
    nearest[chosen] = -np.inf

    while len(chosen) < min(k, n):
        # First index with the largest "distance to nearest chosen sample".
        # Zero-distance (duplicate) samples are still eligible, so the result
        # reaches the requested size whenever enough rows exist.
        candidate = int(np.argmax(nearest))
        chosen.append(candidate)
        nearest = np.minimum(nearest, dist[:, candidate])
        nearest[chosen] = -np.inf

    return chosen


def MDKS(DIR,HEADER,k,COV,VERBOSE,PRINTRESULTS):
    """Select ``k`` rows of a CSV file with Kennard-Stone sampling on Mahalanobis distance.

    Parameters
    ----------
    DIR : str
        Path of the ';'-separated input CSV. Observations are on rows and
        variables on columns. Output files are written next to it.
    HEADER : int or None
        Forwarded to ``pandas.read_csv(header=...)``.
    k : int
        Number of rows to select; must satisfy ``2 <= k <= number of rows``.
    COV : bool
        If true, append the covariance matrix of the data to ``<name>_COV.csv``.
    VERBOSE : bool
        If true, print the input size and the requested size.
    PRINTRESULTS : bool
        If true, append the selected rows to ``<name>_SELECTED_SAMPLES.csv``.

    Returns
    -------
    pandas.DataFrame
        The selected rows of the input, in ascending row order.

    Raises
    ------
    AssertionError
        If the input has fewer than 2 rows, ``k < 2`` or ``k`` exceeds the row count.
    """
    # `sep` must be passed by keyword: recent pandas rejects it positionally.
    M = pd.read_csv(DIR, sep=';', header=HEADER)

    if COV:
        Cov = M.cov()
        # mode='a' appends, so repeated runs accumulate blocks in the same file.
        Cov.to_csv(DIR.replace(".csv", "_COV") + '.csv', sep=';', header=True, mode='a')

    # n = number of observations = number of rows
    n = len(M)
    if VERBOSE:
        print("Input Size:", n, "Desired Size:", k)
    assert n >= 2 and n >= k and k >= 2, "Error: number of rows must be >= 2, k must be >= 2 and k must be <= number of rows"

    # pairwise_distances expects (n_samples, n_features), which is already the
    # layout of M. The result is an n x n matrix of distances between rows.
    MDist = sk.metrics.pairwise_distances(M, metric='mahalanobis', n_jobs=-1)

    # Sorted so the returned rows keep their original relative order and the
    # output does not depend on set/selection ordering.
    selected = sorted(_max_min_selection(MDist, k))
    subset = M.iloc[selected, :]

    if PRINTRESULTS:
        subset.to_csv(DIR.replace(".csv", "_SELECTED_SAMPLES") + '.csv', sep=';', header=True, mode='a')

    return subset
    
  


    




        
        
    



In [128]:
#MDKS(r'C:\Users\jvabd\Desktop\Projeto final\Banco de dados\Data Splitting Mahalanobis\TESTE.csv', HEADER = None ,k=2,COV=True,VERBOSE=True,PRINTRESULTS=True)

Input Size: 3 Desired Size: 2


Unnamed: 0,0,1
0,-2.1,3.0
1,-1.0,1.1


In [97]:
#M

array([[-2.1 ,  3.  ],
       [-1.  ,  1.1 ],
       [ 4.3 ,  0.12]])

In [None]:
#M = np.array([[-2.1,3],[-1,1.1],[4.3,0.12]])

In [113]:
#Cov

array([[11.71      , -4.286     ],
       [-4.286     ,  2.14413333]])