In [1]:
import pandas as pd
import csv
import numpy as np
import sklearn as sk
from sklearn import metrics

In [13]:
def MDKS(DIR,HEADER,k,COV,VERBOSE,PRINTRESULTS):
    #READ
    M = pd.read_csv(DIR, ';',header = HEADER)

    # M is our input matrix with observations placed on rows and variables on columns
    #M = np.array(M)
    if COV:
        Cov = pd.DataFrame.cov(M)
        Cov.to_csv(DIR.replace(".csv", str('COV')) + '.csv' ,sep = ';', header = True, mode = 'a')
    # n = number of observations = number of rows
    n=len(M)
    columns = len(M.columns)
    if VERBOSE:
            print("Input Size:", n, "Desired Size:", k)
    assert n >= 2 and n >= k and k >= 2, "Error: number of rows must >= 2, k must >= 2 and k must > number of rows"        

    MDist = sk.metrics.pairwise_distances(M.T, metric = 'mahalanobis', n_jobs = -1)
    #We made a matrix of Mahalanobis distances between pairs, its a square matrix by definition
    #for this, rows are observations and columns are variables
    #Now, selecting two samples furthest apart in Mahalanobis metric:

    s0, s1 = np.unravel_index(np.argmax(MDist, axis=None), MDist.shape)
    #ARGMAX takes your matrix, flattens it and returns the indice(int) of the max element
    #UNRAVEL_INDEX proceeds to find where that element would be in your matrix and returns (row,column) of that element
    # But we are talking about a Mahalanobis distance matrix, so, an Element[i][j] (i!=j) represents the distance from
    #Samples i and j (counting from 0)
    selected = set([s0, s1])
    k -= 2

    #Next we select new samples bases on the minimal distances from our selected samples until we reach K samples
    #This needs to be a distance > 0 because of the main diagonal, and there is no such thing as negative distance
    #Checking minimal distances sweeping rows (square matrix)
    minj = s0
    while k > 0 and len(selected) < n:
        mindist = 0.0
        for j in range(columns):
            if j not in selected:
                mindistj = min([MDist[j][i] for i in selected])
                if mindistj > mindist:
                    minj = j
                    mindist = mindistj
        selected.add(minj)
        k -= 1
        
    if PRINTRESULTS:
        M.iloc[list(selected), :].to_csv(DIR.replace(".csv", str('SELECTED_SAMPLES')) + '.csv',sep = ';', header = True, mode = 'a')
        

    return M.iloc[list(selected), :]
    
  


    




        
        
    



In [19]:
#MDKS(r'C:\Users\jvabd\Desktop\Projeto final\Banco de dados\Data Splitting Mahalanobis\TESTE.csv', HEADER = None ,k=39,COV=True,VERBOSE=True,PRINTRESULTS=True)

Input Size: 536 Desired Size: 39


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,0.0,2.0,311.15,255.0,56.0,14,289.15,256.15,1696000000000.0,...,40,75500,8.314462,3.5,3700,2.032484,18.037788,135.694981,0,0
1,1,0.606061,1.978296,310.158835,256.878295,56.0,14,289.15,256.15,1696000000000.0,...,40,75500,8.314462,3.5,3700,2.032484,16.253297,129.292939,0,0
2,1,1.212121,1.9579,309.208465,258.652581,56.0,14,289.15,256.15,1696000000000.0,...,40,75500,8.314462,3.5,3700,2.032484,14.701465,123.177082,0,0
4,1,2.424242,1.920547,307.426707,261.910131,56.0,14,289.15,256.15,1696000000000.0,...,40,75500,8.314462,3.5,3700,2.032484,12.164007,111.769557,0,0
5,1,3.030303,1.903389,306.593259,263.403326,56.0,14,289.15,256.15,1696000000000.0,...,40,75500,8.314462,3.5,3700,2.032484,11.125195,106.459466,0,0
6,1,3.636364,1.887129,305.796561,264.812378,56.0,14,289.15,256.15,1696000000000.0,...,40,75500,8.314462,3.5,3700,2.032484,10.21104,101.399063,0,0
7,1,4.242424,1.871694,305.035382,266.141701,56.0,14,289.15,256.15,1696000000000.0,...,40,75500,8.314462,3.5,3700,2.032484,9.404207,96.578633,0,0
8,1,4.848485,1.85702,304.308456,267.395497,56.0,14,289.15,256.15,1696000000000.0,...,40,75500,8.314462,3.5,3700,2.032484,8.69002,91.988462,0,0
10,1,6.060606,1.829736,302.952221,269.69231,56.0,14,289.15,256.15,1696000000000.0,...,40,75500,8.314462,3.5,3700,2.032484,7.491644,83.460548,0,0
11,1,6.666667,1.81703,302.320345,270.742748,56.0,14,289.15,256.15,1696000000000.0,...,40,75500,8.314462,3.5,3700,2.032484,6.987844,79.504048,0,0


In [97]:
#M

array([[-2.1 ,  3.  ],
       [-1.  ,  1.1 ],
       [ 4.3 ,  0.12]])

In [3]:
#M = np.array([[-2.1,3],[-1,1.1],[4.3,0.12]])

In [113]:
#Cov

array([[11.71      , -4.286     ],
       [-4.286     ,  2.14413333]])