In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
def _load_clean(path):
    """Read a tab-separated file, round it to one decimal, drop incomplete rows.

    Prints the frame's shape before and after the rows containing any
    missing value are removed.

    Parameters
    ----------
    path : str
        Location of the ``.tsv`` file to read.

    Returns
    -------
    pandas.DataFrame
        The rounded frame without missing values, re-indexed from 0.
    """
    frame = pd.read_csv(path, sep='\t').round(1)
    print('Before Clean:', frame.shape)
    # Re-indexing matters: the outlier scoring below addresses rows by position.
    frame = frame.dropna(how='any').reset_index(drop=True)
    print('After Clean:', frame.shape)
    return frame


# All four datasets go through the same pipeline. Routing them through one
# helper guarantees `c` is rounded like the others (its rounded copy used to
# be assigned to `d` by mistake and then discarded).
b = _load_clean('data/3nja-bsch.tsv')
c = _load_clean('data/avir-tzek.tsv')
d = _load_clean('data/h3zm-ta5h.tsv')
e = _load_clean('data/m59i-mqex.tsv')

Before Clean: (43, 22)
After Clean: (41, 22)
Before Clean: (385, 22)
After Clean: (385, 22)
Before Clean: (490, 16)
After Clean: (490, 16)
Before Clean: (251, 48)
After Clean: (217, 48)


In [4]:
# Build imputed copies of each cleaned frame: numeric gaps take the column
# median, whatever is still missing takes the column's most frequent value.
# The source frames themselves are left untouched.

# No frame `a` is loaded in the visible cells, so its imputation stays disabled.
#aa = a.fillna(a.median(numeric_only=True, skipna=True))
#aa = aa.fillna(aa.mode().iloc[0])

bb = b.fillna(b.median(numeric_only=True, skipna=True))
bb = bb.fillna(bb.mode().iloc[0])

cc = c.fillna(c.median(numeric_only=True, skipna=True))
cc = cc.fillna(cc.mode().iloc[0])

dd = d.fillna(d.median(numeric_only=True, skipna=True))
dd = dd.fillna(dd.mode().iloc[0])

ee = e.fillna(e.median(numeric_only=True, skipna=True))
ee = ee.fillna(ee.mode().iloc[0])

In [8]:
def getCounts(labels):
    """Tally how often each distinct value occurs in ``labels``.

    Parameters
    ----------
    labels : array-like
        Values of a single feature.

    Returns
    -------
    dict
        Maps every distinct value (in the sorted order produced by
        ``np.unique``) to its number of occurrences.
    """
    uniques, tallies = np.unique(labels, return_counts=True)
    return dict(zip(uniques, tallies))

def calcEntropy(labels):
    """Compute the weighted entropy of one feature from its value counts.

    Parameters
    ----------
    labels : dict
        Maps each value of the feature to its count. Entries whose count is
        zero are ignored.

    Returns
    -------
    int or float
        ``0`` when the dict holds exactly one entry; otherwise the natural-log
        entropy ``H`` of the counts multiplied by the weight
        ``2 * (1 - 1 / (1 + exp(-H)))``.
    """
    tallies = np.array(list(labels.values()))
    # A feature with a single distinct value carries no entropy.
    if len(tallies) == 1:
        return 0

    # Drop zero counts so that log() is never evaluated at 0.
    present = tallies[tallies != 0]
    probs = present / present.sum()

    raw_entropy = -np.sum(probs * np.log(probs))
    # The weight equals 1 at zero entropy and shrinks as the entropy grows.
    damping = 2 * (1 - 1 / (1 + np.exp(-raw_entropy)))

    return damping * raw_entropy

def adjustEntropy(data, dicts):
    """Weighted entropy of every feature with one row's values taken out.

    Parameters
    ----------
    data : iterable
        The values of a single row, one per feature, in the same order as
        ``dicts``.
    dicts : list of dict
        Per-feature value-count dicts. Each is decremented temporarily and
        restored before the function moves on to the next feature.

    Returns
    -------
    numpy.ndarray
        One weighted-entropy value per feature, computed without this row.
    """
    entropy_row = np.zeros(len(dicts))
    for idx, value in enumerate(data):
        table = dicts[idx]
        table[value] -= 1  # pretend the row is absent
        entropy_row[idx] = calcEntropy(table)
        table[value] += 1  # put the count back for the next row
    return entropy_row

def GreedyWeightedEntropy(data):
    """Score every row by how much its removal lowers the total weighted entropy.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are treated as categorical features. Rows are
        addressed by position, so any index (not only 0..n-1) is accepted.

    Returns
    -------
    tuple
        ``(outliers, o_factors)``. ``o_factors`` is a float array holding, for
        each row position, the total weighted entropy minus the total weighted
        entropy with that row removed. ``outliers`` is the ``np.where`` result
        (a one-element tuple of position arrays) for rows whose factor is
        positive.
    """
    d1 = [getCounts(data[col]) for col in data.columns]  # per-feature value counts
    tot_e = np.sum([calcEntropy(d) for d in d1])  # weighted entropy of the full data

    n_rows = data.shape[0]
    # Force a float array: when every feature is constant `tot_e` is an
    # integer 0, and an integer array would truncate the adjustments below.
    o_factors = np.full(n_rows, tot_e, dtype=float)

    # Walk rows by position so that `iloc` and `o_factors` stay aligned even
    # when the frame's index labels are not a clean 0..n-1 range.
    for pos in range(n_rows):
        o_factors[pos] -= adjustEntropy(data.iloc[pos], d1).sum()

    return np.where(o_factors > 0), o_factors
    

In [None]:
# Score each dataset, print the positions flagged as outliers, and save a
# scatter plot of the outlier factors (positive in red, the rest in blue).
for frame, tag in [(b, '3nja-bsch'), (c, 'avir-tzek'), (d, 'h3zm-ta5h'), (e, 'm59i-mqex')]:
    outliers, factors = GreedyWeightedEntropy(frame)
    print(outliers)

    fig, ax = plt.subplots(figsize=(8, 6))

    # Two masked copies of the factors so each group gets its own colour;
    # NaN entries are simply not drawn.
    factors_pos = factors.copy()
    factors_pos[factors_pos <= 0] = np.nan
    factors_neg = factors.copy()
    factors_neg[factors_neg > 0] = np.nan

    ax.scatter(frame.index, factors_pos, color='r')
    ax.scatter(frame.index, factors_neg, color='b')
    ax.grid()
    ax.set_xlabel('Row Index')
    ax.set_ylabel('Outlier Factor')
    fig.suptitle('GWE Results\n{}'.format(tag))
    fig.savefig('{}.png'.format(tag), bbox_inches='tight')

(array([39, 40], dtype=int64),)
