In [2]:
from math import exp, sqrt, log
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing

from dictances import jensen_shannon
import matplotlib.pyplot as plt
from pydiffmap import diffusion_map as dm
from pydiffmap.visualization import embedding_plot, data_plot

from ref.diffusion_maps import diffusion_mapping
from ref.Shir import utils as shir_utils
from utils import min_max_scaler, calc_mean_std, flatten, norm_by_dist_type, calculate_distance
from main import execute_distance_func, calc_dist, export_heatmaps, k_medoids_features, return_best_features_by_kmeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

# Data source: UCI Glass Identification data set - https://archive.ics.uci.edu/ml/datasets/glass+identification

In [3]:
# Never truncate wide frames: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None
# Load the glass data set from the local CSV file.
df_glass = pd.read_csv('data/glass.csv')
# Peek at the first five rows.
df_glass.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,label
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the data set.
df_glass.describe()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,label
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,1.518365,13.40785,2.684533,1.444907,72.650935,0.497056,8.956963,0.175047,0.057009,2.780374
std,0.003037,0.816604,1.442408,0.49927,0.774546,0.652192,1.423153,0.497219,0.097439,2.103739
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.516523,12.9075,2.115,1.19,72.28,0.1225,8.24,0.0,0.0,1.0
50%,1.51768,13.3,3.48,1.36,72.79,0.555,8.6,0.0,0.0,2.0
75%,1.519157,13.825,3.6,1.63,73.0875,0.61,9.1725,0.0,0.1,3.0
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,7.0


In [5]:
# Number of samples per class label, most frequent class first.
df_glass['label'].value_counts()

2    76
1    70
7    29
3    17
5    13
6     9
Name: label, dtype: int64

# Feature heatmaps with distance

# Feature heatmaps with distance normalization

In [6]:
# Every column except the last one is treated as a feature
# (the last column of the loaded frame is 'label').
features = list(df_glass.columns[:-1])
df_norm = min_max_scaler(df_glass, features)

# Each entry is the pair of distance-function names handed to export_heatmaps
# as its 3rd and 4th arguments.
# NOTE(review): the exact role of each name is defined in main.export_heatmaps - confirm there.
heatmap_distance_pairs = (
    ('wasserstein_dist', 'hellinger_dist'),
    ('wasserstein_dist', 'jensen_shannon_dist'),
    ('jm_dist', 'hellinger_dist'),
    ('bhattacharyya_dist', 'hellinger_dist'),
)

# One export per pair, in the order listed.
for first_dist_name, second_dist_name in heatmap_distance_pairs:
    export_heatmaps(df_norm, features, first_dist_name, second_dist_name)

In [7]:
# Run calc_dist with the 'wasserstein_dist' distance on the normalized frame,
# passing 'label' as the third argument (presumably the class column - confirm in main.calc_dist).
# NOTE(review): the displayed frame has 36 columns and rows 0-8, which looks like one
# flattened 6x6 class-by-class matrix per feature - verify against calc_dist.
df_dists_wasser, dist_dict_wasser = calc_dist('wasserstein_dist', df_norm, 'label')
df_dists_wasser.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35
0,0.0,0.057404,0.039554,0.077971,0.088249,0.074416,0.057404,0.0,0.044238,0.10449,0.11925,0.06595,0.039554,0.044238,0.0,0.091602,0.090029,0.049388,0.077971,0.10449,0.091602,0.0,0.077996,0.10609,0.088249,0.11925,0.090029,0.077996,0.0,0.101465,0.074416,0.06595,0.049388,0.10609,0.101465,0.0
1,0.0,0.023715,0.038668,0.062345,0.211185,0.18629,0.023715,0.0,0.051571,0.045014,0.23082,0.200054,0.038668,0.051571,0.0,0.091634,0.181896,0.153307,0.062345,0.045014,0.091634,0.0,0.27353,0.242763,0.211185,0.23082,0.181896,0.27353,0.0,0.066984,0.18629,0.200054,0.153307,0.242763,0.066984,0.0
2,0.0,0.126256,0.017595,0.618838,0.500417,0.671303,0.126256,0.0,0.12498,0.496272,0.377851,0.548737,0.017595,0.12498,0.0,0.616856,0.498435,0.669322,0.618838,0.496272,0.616856,0.0,0.127672,0.093866,0.500417,0.377851,0.498435,0.127672,0.0,0.210361,0.671303,0.548737,0.669322,0.093866,0.210361,0.0
3,0.0,0.076106,0.03326,0.271025,0.103264,0.298723,0.076106,0.0,0.064644,0.194918,0.062861,0.222617,0.03326,0.064644,0.0,0.259399,0.085476,0.287097,0.271025,0.194918,0.259399,0.0,0.207844,0.11048,0.103264,0.062861,0.085476,0.207844,0.0,0.235543,0.298723,0.222617,0.287097,0.11048,0.235543,0.0
4,0.0,0.031805,0.038436,0.097747,0.117018,0.097753,0.031805,0.0,0.053443,0.089486,0.113343,0.082499,0.038436,0.053443,0.0,0.110601,0.143207,0.128792,0.097747,0.089486,0.110601,0.0,0.157906,0.117947,0.117018,0.113343,0.143207,0.157906,0.0,0.121326,0.097753,0.082499,0.128792,0.117947,0.121326,0.0
5,0.0,0.012137,0.006609,0.168052,0.07205,0.074282,0.012137,0.0,0.018451,0.161298,0.083905,0.080751,0.006609,0.018451,0.0,0.17301,0.065454,0.069815,0.168052,0.161298,0.17301,0.0,0.236715,0.184352,0.07205,0.083905,0.065454,0.236715,0.0,0.052363,0.074282,0.080751,0.069815,0.184352,0.052363,0.0
6,0.0,0.072304,0.015714,0.191095,0.094874,0.030739,0.072304,0.0,0.086486,0.184064,0.11275,0.095329,0.015714,0.086486,0.0,0.200945,0.104672,0.029208,0.191095,0.184064,0.200945,0.0,0.099355,0.185745,0.094874,0.11275,0.104672,0.099355,0.0,0.09842,0.030739,0.095329,0.029208,0.185745,0.09842,0.0
7,0.0,0.012221,0.003663,0.055549,0.004036,0.326122,0.012221,0.0,0.013809,0.051565,0.015957,0.316458,0.003663,0.013809,0.0,0.056784,0.002801,0.327358,0.055549,0.051565,0.056784,0.0,0.059585,0.283929,0.004036,0.015957,0.002801,0.059585,0.0,0.330159,0.326122,0.316458,0.327358,0.283929,0.330159,0.0
8,0.0,0.044582,0.027072,0.092502,0.111765,0.085396,0.044582,0.0,0.05594,0.110264,0.156347,0.129978,0.027072,0.05594,0.0,0.066188,0.11188,0.085511,0.092502,0.110264,0.066188,0.0,0.119155,0.097779,0.111765,0.156347,0.11188,0.119155,0.0,0.026369,0.085396,0.129978,0.085511,0.097779,0.026369,0.0


In [None]:
# Epsilon-selection mode handed to diffusion_mapping.
# NOTE(review): the original note suggests 'mean' is the other accepted value - confirm.
eps_type = 'maxmin'
alpha = 1

# Diffusion map over the Wasserstein distance frame.
# NOTE(review): the positional arguments 8 and 1 are passed through unchanged;
# their meaning is defined in ref.diffusion_maps.diffusion_mapping.
(
    vec_wasser,
    egs_wasser,
    coordinates_wasser,
    dataList_wasser,
    epsilon_wasser,
) = diffusion_mapping(df_dists_wasser, alpha, eps_type, 8, 1, dim=2)

# Same fixed window on both axes, then plot the first coordinate against the second.
wasser_axis_limits = (-0.2, 0.03)
plt.xlim(*wasser_axis_limits)
plt.ylim(*wasser_axis_limits)
plt.scatter(coordinates_wasser[0], coordinates_wasser[1])

In [None]:
# Wrap the coordinates in a DataFrame and transpose it, so that
# coordinates_wasser[0] and coordinates_wasser[1] become columns 0 and 1.
df_coordinates_wasser = pd.DataFrame(coordinates_wasser).transpose()
df_coordinates_wasser.head(10)

In [None]:
# Pairwise distance table: entry [i][j] is calculate_distance(row i, row j)
# over the rows of the embedded-coordinates frame.
dists = [
    [
        calculate_distance(outer_row, inner_row)
        for _, inner_row in df_coordinates_wasser.iterrows()
    ]
    for _, outer_row in df_coordinates_wasser.iterrows()
]
df_DM_dists_wasser = pd.DataFrame(dists)
df_DM_dists_wasser.head(10)

In [None]:
# Run calc_dist with the 'jm_dist' distance on the normalized frame,
# passing 'label' as the third argument (presumably the class column - confirm in main.calc_dist).
df_dists_jm, dist_dict_jm = calc_dist('jm_dist', df_norm, 'label')
df_dists_jm.head(10)

In [None]:
# Epsilon-selection mode handed to diffusion_mapping.
# NOTE(review): the original note suggests 'mean' is the other accepted value - confirm.
eps_type = 'maxmin'
alpha = 1

# Diffusion map over the 'jm_dist' distance frame.
# Fix: the function imported from ref.diffusion_maps is `diffusion_mapping`; the
# former `diffusionMapping` spelling is not defined anywhere in this notebook and
# raised NameError on a fresh kernel.
# NOTE(review): the positional arguments 4 and 1 are passed through unchanged;
# their meaning is defined in ref.diffusion_maps.diffusion_mapping.
vec_jm, egs_jm, coordinates_jm, dataList_jm, epsilon_jm = diffusion_mapping(
    df_dists_jm, alpha, eps_type, 4, 1, dim=2
)

In [None]:
# Same fixed window on both axes, then plot the first coordinate against the second.
jm_axis_limits = (-0.5, 0.7)
plt.xlim(*jm_axis_limits)
plt.ylim(*jm_axis_limits)
plt.scatter(coordinates_jm[0], coordinates_jm[1])

In [None]:
# Wrap the coordinates in a DataFrame and transpose it, so that
# coordinates_jm[0] and coordinates_jm[1] become columns 0 and 1.
df_coordinates_jm = pd.DataFrame(coordinates_jm).transpose()
df_coordinates_jm.head(10)

In [None]:
# Pairwise distance table: entry [i][j] is calculate_distance(row i, row j)
# over the rows of the embedded-coordinates frame.
dists = [
    [
        calculate_distance(outer_row, inner_row)
        for _, inner_row in df_coordinates_jm.iterrows()
    ]
    for _, outer_row in df_coordinates_jm.iterrows()
]
df_DM_dists_jm = pd.DataFrame(dists)
df_DM_dists_jm.head(10)

In [None]:
# Options for the nearest-neighbour search used by pydiffmap.
neighbor_params = {'n_jobs': -1, 'algorithm': 'ball_tree'}

# Fix: neighbor_params was built but never handed to the constructor, so the
# settings above were silently ignored.
mydmap = dm.DiffusionMap.from_sklearn(
    n_evecs=2,
    epsilon=1.0,
    alpha=0.5,
    k=16,
    neighbor_params=neighbor_params,
)

# Fix: embed the feature columns only, as a plain NumPy array.
# df_norm is also passed to calc_dist together with 'label', so it appears to still
# carry the class column; fitting on the whole frame would leak the label into the
# embedding. NOTE(review): confirm that min_max_scaler keeps the original column names.
# An array (rather than a DataFrame) is stored on the map so that positional
# indexing of mydmap.data works.
dmap = mydmap.fit_transform(df_norm[features].to_numpy())

In [None]:
# Plot the embedding, coloured by the first column of the transformed data.
embedding_scatter_kwargs = {'c': dmap[:, 0], 'cmap': 'Spectral'}
embedding_plot(mydmap, scatter_kwargs=embedding_scatter_kwargs)

# Plot the data held by the fitted map in three dimensions with the same colour map.
# A separate dict is used so the two calls never share keyword state.
data_scatter_kwargs = {'cmap': 'Spectral'}
data_plot(mydmap, dim=3, scatter_kwargs=data_scatter_kwargs)

plt.show()

### PreProcess
Normalize the data

### Add a heatmap for each feature + heat scale

### 1st step - Embedding 
- Over the feature matrix (6x6 / 4x4 / 3x3) - CNN
- Over the feature row (1x36) - AutoEncoder
- Over the flattened matrix - Random Projections / DM

### 2nd step - Diffusion Maps
- DM over the encoded matrix (for example, 9x36 --> 9x2)
- DM implementation from datafold: https://datafold-dev.gitlab.io/datafold/intro.html

### 3rd step - plot

