1. regular classification
2. random columns classification
3. classification using PCA feature reduction

4. for dist in (Wasserstein, Hellinger, JM):
    1. ranked columns classification (for ranking, use the 1st eigenvector from the diffusion maps)
    2. ranked + K-means classification (for ranking, use the 1st eigenvector from the diffusion maps)
    3. K-medoids classification
    4. K-means + pick features by distance from the origin (0, 0)

classification
1. add cross validation k-folds (5-10 folds)
2. ranked + K-means: rank by the row mean of the flattened matrix (the bigger the better).

In [40]:
from math import exp, sqrt, log
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing

from dictances import jensen_shannon
import matplotlib.pyplot as plt
from pydiffmap import diffusion_map as dm
from pydiffmap.visualization import embedding_plot, data_plot

from ref.Diffusion_Maps import diffusionMapping
from ref.Shir import utils as shir_utils
from utils import min_max_scaler, calc_mean_std, flatten, norm_by_dist_type, calculate_distance
from main import execute_distance_func, calc_dist, export_heatmaps, k_medoids_features, return_best_features_by_kmeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold, KFold, StratifiedKFold
from sklearn import metrics

In [13]:
# Load the dataset; every column except the label column is used as a feature.
label_column = 'label'
df = pd.read_csv('data/glass.csv')
features = df.columns.drop(label_column)

In [61]:
def predict(df, features, label_column, n_splits=5, test_size=0.33, random_state=42):
    """Evaluate a one-vs-rest random forest on the given feature columns.

    The rows are split once into a training part and a hold-out part.
    Stratified k-fold cross-validation is run on the training part only; then
    a model fitted on the whole training part is scored on the hold-out part
    and that classification report is printed.

    Args:
        df: DataFrame containing the feature columns and the label column.
        features: Column labels of ``df`` used as model inputs.
        label_column: Name of the column of ``df`` holding the class label.
        n_splits: Number of stratified cross-validation folds.
        test_size: Fraction of rows held out for the final report.
        random_state: Seed used for the hold-out split and for shuffling the
            cross-validation folds, so the fold assignment is reproducible.

    Returns:
        list of float: accuracy on the validation part of each fold, in fold
        order. (These are validation scores measured inside the training
        part, not accuracies on the data the fold model was fitted on.)
    """
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[label_column], test_size=test_size, random_state=random_state
    )

    clf = RandomForestClassifier(random_state=1)
    multi_target_forest = OneVsRestClassifier(clf, n_jobs=-1)

    # shuffle=True without a seed draws different folds on every call, which
    # makes the returned scores impossible to reproduce; seed it explicitly.
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_acc = []
    for fit_index, val_index in kf.split(X_train, y_train):
        model = multi_target_forest.fit(X_train.iloc[fit_index], y_train.iloc[fit_index])
        val_preds = model.predict(X_train.iloc[val_index])
        fold_acc.append(metrics.accuracy_score(y_train.iloc[val_index], val_preds))

    # Refit on the full training part before scoring the untouched hold-out rows.
    model = multi_target_forest.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(metrics.classification_report(y_test, preds, digits=3))
    return fold_acc

In [63]:
# Baseline run on all features: predict() prints the hold-out report and
# returns the per-fold accuracies, which are averaged here.
train_acc = predict(df=df, features=features, label_column=label_column)
train_avg_score = sum(train_acc) / len(train_acc)
print(f'{train_avg_score=}')

              precision    recall  f1-score   support

           1      0.769     0.909     0.833        22
           2      0.792     0.760     0.776        25
           3      1.000     0.500     0.667         4
           5      1.000     0.500     0.667         6
           6      0.800     1.000     0.889         4
           7      0.909     1.000     0.952        10

    accuracy                          0.817        71
   macro avg      0.878     0.778     0.797        71
weighted avg      0.831     0.817     0.809        71

train_avg_score=0.7694581280788176


# Random columns classification

In [64]:
# Random-feature baseline: classify using 4 feature columns drawn at random.
# The draw is seeded so the same columns (and therefore the same report) are
# obtained on every run of the notebook.
new_features = df[features].sample(n=4, axis='columns', random_state=42).columns

# Build an independent frame (sampled features followed by the label) rather
# than assigning a new column onto the result of .sample().
sampled_df = df[[*new_features, label_column]].copy()

train_acc = predict(sampled_df, new_features, label_column)
train_avg_score = sum(train_acc)/len(train_acc)
print(f'{train_avg_score=}')

              precision    recall  f1-score   support

           1      0.682     0.682     0.682        22
           2      0.548     0.680     0.607        25
           3      0.000     0.000     0.000         4
           5      0.500     0.167     0.250         6
           6      0.500     0.500     0.500         4
           7      0.727     0.800     0.762        10

    accuracy                          0.606        71
   macro avg      0.493     0.471     0.467        71
weighted avg      0.577     0.606     0.582        71

train_avg_score=0.6495073891625616
