1. regular classification - done
2. random columns classification - done
3. classification using PCA feature reduction - done
4. for dist in (Wasserstein, Hellinger, JM):
    1. ranked columns classification (for ranking, use the 1st eigenvector from the diffusion maps)
    2. ranked + K-means classification (for ranking, use the 1st eigenvector from the diffusion maps)
    3. K-medoids classification
    4. K-means + pick feature by distance from the origin (0, 0)
### we want to deploy over all distance functions!
    
* ask Neta if the ranking we wrote is correct
* Chen will write the function regarding the ranking values

classification
1. add cross validation k-folds (5-10 folds)
2. ranked + K-means: rank by mean row of the flattened matrix (the bigger the better).


In [1]:
from math import exp, sqrt, log
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing

from dictances import jensen_shannon
import matplotlib.pyplot as plt
from pydiffmap import diffusion_map as dm
from pydiffmap.visualization import embedding_plot, data_plot

from ref.diffusion_maps import diffusion_mapping
from ref.Shir import utils as shir_utils
from utils import min_max_scaler, calc_mean_std, flatten, norm_by_dist_type, calculate_distance
from main import execute_distance_func, calc_dist, export_heatmaps, k_medoids_features, return_best_features_by_kmeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold, KFold, StratifiedKFold
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Notebook-wide configuration: dataset, label column and the feature budget.
label_column = 'label'
data = pd.read_csv('data/glass.csv')
# Every column except the label is treated as a candidate feature.
features = data.columns.drop(label_column)
# Number of columns / components each selection method below keeps.
k = 4

In [3]:
def predict(X_train, X_test, y_train, y_test, n_splits=5, random_state=42):
    """Cross-validate a one-vs-rest random forest on the training split, then report on the test split.

    Args:
        X_train: training features; must support positional row selection via ``.iloc``.
        X_test: held-out features, same columns as ``X_train``.
        y_train: training labels; must support ``.iloc``.
        y_test: held-out labels.
        n_splits: number of stratified folds used on the training split.
        random_state: seed for the fold shuffling, so the returned scores are
            repeatable across runs (the folds were previously shuffled unseeded).

    Returns:
        list of float: validation accuracy of each fold, in fold order.

    Side effects:
        Prints the classification report of the model refit on the whole
        training split and evaluated on ``X_test`` / ``y_test``.
    """
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    clf = RandomForestClassifier(random_state=1)
    multi_target_forest = OneVsRestClassifier(clf, n_jobs=-1)
    train_acc = []

    for train_index, test_index in kf.split(X_train, y_train):
        # Fit on this fold's training rows and score on its validation rows.
        model = multi_target_forest.fit(X_train.iloc[train_index], y_train.iloc[train_index])
        train_preds = model.predict(X_train.iloc[test_index])

        train_acc.append(metrics.accuracy_score(y_train.iloc[test_index], train_preds))

    # Final model: fit on the full training split, evaluate once on the held-out split.
    model = multi_target_forest.fit(X_train, y_train)
    preds = model.predict(X_test)
    # zero_division=0 reports 0 for classes that were never predicted without
    # emitting the undefined-metric warning.
    print(metrics.classification_report(y_test, preds, digits=3, zero_division=0))
    return train_acc

## Regular classification

In [4]:
# Baseline: hold out 33% of the rows and classify with every original feature.
X_train, X_test, y_train, y_test = train_test_split(
    data[features],
    data[label_column],
    test_size=0.33,
    random_state=42,
)

# Per-fold CV accuracies on the training split; their mean is the headline number.
train_acc = predict(X_train, X_test, y_train, y_test)
train_avg_score = sum(train_acc) / len(train_acc)
print(f'{train_avg_score=}')

              precision    recall  f1-score   support

           1      0.769     0.909     0.833        22
           2      0.792     0.760     0.776        25
           3      1.000     0.500     0.667         4
           5      1.000     0.500     0.667         6
           6      0.800     1.000     0.889         4
           7      0.909     1.000     0.952        10

    accuracy                          0.817        71
   macro avg      0.878     0.778     0.797        71
weighted avg      0.831     0.817     0.809        71

train_avg_score=0.7556650246305419


## Random columns

In [5]:
# Random-feature baseline: keep k columns chosen at random.
# The draw is seeded so the selected columns (and therefore the scores below)
# are the same on every Restart & Run All.
sampled_data = data[features].sample(n=k, axis='columns', random_state=42)
new_features = sampled_data.columns
sampled_data[label_column] = data[label_column]
X_train, X_test, y_train, y_test = train_test_split(
    sampled_data[new_features],
    sampled_data[label_column],
    test_size=0.33,
    random_state=42,
)

train_acc = predict(X_train, X_test, y_train, y_test)
train_avg_score = sum(train_acc)/len(train_acc)
print(f'{train_avg_score=}')

              precision    recall  f1-score   support

           1      0.667     0.818     0.735        22
           2      0.682     0.600     0.638        25
           3      0.000     0.000     0.000         4
           5      0.667     0.333     0.444         6
           6      0.500     0.500     0.500         4
           7      0.643     0.900     0.750        10

    accuracy                          0.648        71
   macro avg      0.526     0.525     0.511        71
weighted avg      0.622     0.648     0.624        71

train_avg_score=0.6283251231527093


## PCA Classification

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    data[features],
    data[label_column],
    test_size=0.33,
    random_state=42,
)

# Normalize both splits with the project helper.
# NOTE(review): the meaning of the trailing False flag is defined in utils.min_max_scaler — confirm.
X_train_norm, X_test_norm = min_max_scaler(X_train, features, X_test, False)

# Fit PCA on the training split only, then project both splits onto the k components.
pca = PCA(n_components=k).fit(X_train_norm)
X_train_pca, X_test_pca = (pca.transform(part) for part in (X_train_norm, X_test_norm))

# predict_np indexes with plain integer arrays, so hand it NumPy labels.
y_train_arr, y_test_arr = y_train.to_numpy(), y_test.to_numpy()



In [7]:
def predict_np(X_train, X_test, y_train, y_test, n_splits=5, random_state=42):
    """Array-based variant of the CV + hold-out evaluation used in this notebook.

    Inputs are converted with ``np.asarray``, so NumPy arrays (the original
    contract) as well as DataFrames / Series are accepted.

    Args:
        X_train: training features, array-like of shape (rows, columns).
        X_test: held-out features, same column count as ``X_train``.
        y_train: training labels, array-like with one entry per training row.
        y_test: held-out labels.
        n_splits: number of stratified folds used on the training split.
        random_state: seed for the fold shuffling, so the returned scores are
            repeatable across runs (the folds were previously shuffled unseeded).

    Returns:
        list of float: validation accuracy of each fold, in fold order.

    Side effects:
        Prints the classification report of the model refit on the whole
        training split and evaluated on the held-out split.
    """
    X_train, X_test = np.asarray(X_train), np.asarray(X_test)
    y_train, y_test = np.asarray(y_train), np.asarray(y_test)

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    clf = RandomForestClassifier(random_state=1)
    multi_target_forest = OneVsRestClassifier(clf, n_jobs=-1)
    train_acc = []

    for train_index, test_index in kf.split(X_train, y_train):
        # Fit on this fold's training rows and score on its validation rows.
        model = multi_target_forest.fit(X_train[train_index], y_train[train_index])
        train_preds = model.predict(X_train[test_index])

        train_acc.append(metrics.accuracy_score(y_train[test_index], train_preds))

    # Final model: fit on the full training split, evaluate once on the held-out split.
    model = multi_target_forest.fit(X_train, y_train)
    preds = model.predict(X_test)
    # zero_division=0 reports 0 for never-predicted classes without the
    # undefined-metric warning that shows up in this notebook's saved output.
    print(metrics.classification_report(y_test, preds, digits=3, zero_division=0))
    return train_acc

In [8]:
# Evaluate the PCA-projected features with the array-based helper.
train_acc = predict_np(
    X_train_pca,
    X_test_pca,
    y_train_arr,
    y_test_arr,
)
train_avg_score = sum(train_acc) / len(train_acc)
print(f'{train_avg_score=}')

              precision    recall  f1-score   support

           1      0.720     0.818     0.766        22
           2      0.643     0.720     0.679        25
           3      0.000     0.000     0.000         4
           5      0.600     0.500     0.545         6
           6      1.000     0.500     0.667         4
           7      0.818     0.900     0.857        10

    accuracy                          0.704        71
   macro avg      0.630     0.573     0.586        71
weighted avg      0.672     0.704     0.681        71

train_avg_score=0.6571428571428571


  _warn_prf(average, modifier, msg_start, len(result))


# Our Methodology

In [9]:
# Disabled: whole-frame normalization. The sections below normalize after the
# train/test split instead (min_max_scaler(X_train, features, X_test)).
# NOTE(review): `df` is not defined in the visible cells — confirm before re-enabling.
# df_norm = min_max_scaler(df, features)

## Wasserstein Distance

In [9]:
# Same split arguments (test_size, seed) as the baseline sections above.
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[label_column], test_size=0.33, random_state=42
)

# Normalize both splits with the project helper (default options).
X_train_norm, X_test_norm = min_max_scaler(X_train, features, X_test)

# Distances are computed from the normalized training split and its labels only.
# NOTE(review): the saved output of this cell is "ValueError: Distribution can't be empty.";
# which helper raises it is not visible from this notebook — TODO investigate.
df_dists, dist_dict = calc_dist('wasserstein_dist', X_train_norm, y_train)

# Diffusion-map settings; the original note mentions 'mean' as the other eps_type option.
alpha = 1
eps_type = 'maxmin'
vec, egs, coordinates, dataList, epsilon, ranking = diffusion_mapping(
    df_dists, alpha, eps_type, 8, 1, dim=2
)

ValueError: Distribution can't be empty.

### ranked columns classification (for ranking, use the 1st eigenvector from the diffusion maps)

In [None]:
# Flatten the nested ranking into one score per column.
flat_ranking = [score for row in ranking for score in row]
# argsort is ascending, so the k highest-scored columns sit at the tail.
ranking_idx = np.argsort(flat_ranking)
print(f'{ranking_idx=}')
top_k_idx = ranking_idx[-k:]
predict(
    X_train.iloc[:, top_k_idx],
    X_test.iloc[:, top_k_idx],
    y_train,
    y_test,
)

### ranked + K-means classification (for ranking, use the 1st eigenvector from the diffusion maps)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[label_column], test_size=0.33, random_state=42
)

# Project helper picks features from the diffusion coordinates; its first
# return value is used below as positional column indices.
best_features, labels, features_rank = return_best_features_by_kmeans(coordinates, k)
print(f'{best_features=}')
predict(
    X_train.iloc[:, best_features],
    X_test.iloc[:, best_features],
    y_train,
    y_test,
)

### K-medoids classification

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[label_column], test_size=0.33, random_state=42
)

# Project helper returns the selected features, used below as positional column indices.
k_features = k_medoids_features(coordinates, k)
print(f'{k_features=}')
predict(
    X_train.iloc[:, k_features],
    X_test.iloc[:, k_features],
    y_train,
    y_test,
)

## JM Distance

In [10]:
# Same split arguments (test_size, seed) as the baseline sections above.
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[label_column], test_size=0.33, random_state=42
)

# Normalize both splits with the project helper (default options).
X_train_norm, X_test_norm = min_max_scaler(X_train, features, X_test)

# Distances are computed from the normalized training split and its labels only.
# NOTE(review): the saved output of this cell is "LinAlgError: SVD did not converge";
# which helper raises it is not visible from this notebook — TODO investigate.
df_dists, dist_dict = calc_dist('jm_dist', X_train_norm, y_train)

# Diffusion-map settings; the original note mentions 'mean' as the other eps_type option.
alpha = 1
eps_type = 'maxmin'
vec, egs, coordinates, dataList, epsilon, ranking = diffusion_mapping(
    df_dists, alpha, eps_type, 8, 1, dim=2
)

LinAlgError: SVD did not converge

### ranked columns classification (for ranking, use the 1st eigenvector from the diffusion maps)

In [None]:
# Flatten the nested ranking into one score per column.
flat_ranking = [score for row in ranking for score in row]
# argsort is ascending, so the k highest-scored columns sit at the tail.
ranking_idx = np.argsort(flat_ranking)
print(f'{ranking_idx=}')
top_k_idx = ranking_idx[-k:]
predict(
    X_train.iloc[:, top_k_idx],
    X_test.iloc[:, top_k_idx],
    y_train,
    y_test,
)

### ranked + K-means classification (for ranking, use the 1st eigenvector from the diffusion maps)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[label_column], test_size=0.33, random_state=42
)

# Project helper picks features from the diffusion coordinates; its first
# return value is used below as positional column indices.
best_features, labels, features_rank = return_best_features_by_kmeans(coordinates, k)
print(f'{best_features=}')
predict(
    X_train.iloc[:, best_features],
    X_test.iloc[:, best_features],
    y_train,
    y_test,
)

### K-medoids classification

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[label_column], test_size=0.33, random_state=42
)

# Project helper returns the selected features, used below as positional column indices.
k_features = k_medoids_features(coordinates, k)
print(f'{k_features=}')
predict(
    X_train.iloc[:, k_features],
    X_test.iloc[:, k_features],
    y_train,
    y_test,
)

## Hellinger Distance

In [12]:
# Same split arguments (test_size, seed) as the baseline sections above.
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[label_column], test_size=0.33, random_state=42
)

# Normalize both splits with the project helper (default options).
X_train_norm, X_test_norm = min_max_scaler(X_train, features, X_test)

# Distances are computed from the normalized training split and its labels only.
# NOTE(review): the saved output of this cell is a bare "AssertionError";
# which helper raises it is not visible from this notebook — TODO investigate.
df_dists, dist_dict = calc_dist('hellinger', X_train_norm, y_train)

# Diffusion-map settings; the original note mentions 'mean' as the other eps_type option.
alpha = 1
eps_type = 'maxmin'
vec, egs, coordinates, dataList, epsilon, ranking = diffusion_mapping(
    df_dists, alpha, eps_type, 8, 1, dim=2
)

AssertionError: 

### ranked columns classification (for ranking, use the 1st eigenvector from the diffusion maps)

In [None]:
# Flatten the nested ranking into one score per column.
flat_ranking = [score for row in ranking for score in row]
# argsort is ascending, so the k highest-scored columns sit at the tail.
ranking_idx = np.argsort(flat_ranking)
print(f'{ranking_idx=}')
top_k_idx = ranking_idx[-k:]
predict(
    X_train.iloc[:, top_k_idx],
    X_test.iloc[:, top_k_idx],
    y_train,
    y_test,
)

### ranked + K-means classification (for ranking, use the 1st eigenvector from the diffusion maps)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[label_column], test_size=0.33, random_state=42
)

# Project helper picks features from the diffusion coordinates; its first
# return value is used below as positional column indices.
best_features, labels, features_rank = return_best_features_by_kmeans(coordinates, k)
print(f'{best_features=}')
predict(
    X_train.iloc[:, best_features],
    X_test.iloc[:, best_features],
    y_train,
    y_test,
)

### K-medoids classification

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[label_column], test_size=0.33, random_state=42
)

# Project helper returns the selected features, used below as positional column indices.
k_features = k_medoids_features(coordinates, k)
print(f'{k_features=}')
predict(
    X_train.iloc[:, k_features],
    X_test.iloc[:, k_features],
    y_train,
    y_test,
)