1. regular classification - done
2. random columns classification - done
3. classification using PCA feature reduction - done
4. for dist in (wesserstain, hellinger, JM):
    1. ranked columns classification (for ranking, use 1st eigen vector from the diffusion maps)
    2. ranked + K-means classification (for ranking, use 1st eigen vector from the diffusion maps)
    3. K-mediods classification
    4. K-means + pick feature by distance from axis (0, 0)
    

classification
1. add cross validation k-folds (5-10 folds)
2. ranked + K-means: rank by mean row of the flatten matrix. (the bigger the better)

In [1]:
from math import exp, sqrt, log
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing

from dictances import jensen_shannon
import matplotlib.pyplot as plt
from pydiffmap import diffusion_map as dm
from pydiffmap.visualization import embedding_plot, data_plot

from ref.Diffusion_Maps import diffusionMapping
from ref.Shir import utils as shir_utils
from utils import min_max_scaler, calc_mean_std, flatten, norm_by_dist_type, calculate_distance
from main import execute_distance_func, calc_dist, export_heatmaps, k_medoids_features, return_best_features_by_kmeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold, KFold, StratifiedKFold
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('data/glass.csv')
features = df.columns.drop('label')
label_column = 'label'


In [3]:
def predict(df, features, label_column, X_train, X_test, y_train, y_test):
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    clf = RandomForestClassifier(random_state=1)
    multi_target_forest = OneVsRestClassifier(clf, n_jobs=-1)
    train_acc = []
    
    for train_index, test_index in kf.split(X_train, y_train):
        model = multi_target_forest.fit(X_train.iloc[train_index], y_train.iloc[train_index])
        train_preds = model.predict(X_train.iloc[test_index])
        
        train_acc.append(metrics.accuracy_score(y_train.iloc[test_index], train_preds))        
    
    model = multi_target_forest.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(metrics.classification_report(y_test, preds, digits=3))
    return train_acc

## Regular classification

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df[features], df[label_column], test_size=0.33, random_state=42)
train_acc = predict(df, features, label_column, X_train, X_test, y_train, y_test)
train_avg_score = sum(train_acc)/len(train_acc)
print(f'{train_avg_score=}')

              precision    recall  f1-score   support

           1      0.769     0.909     0.833        22
           2      0.792     0.760     0.776        25
           3      1.000     0.500     0.667         4
           5      1.000     0.500     0.667         6
           6      0.800     1.000     0.889         4
           7      0.909     1.000     0.952        10

    accuracy                          0.817        71
   macro avg      0.878     0.778     0.797        71
weighted avg      0.831     0.817     0.809        71

train_avg_score=0.7413793103448276


## Random columns

In [12]:
sampled_df = df[features].sample(n=4, axis='columns')
new_features = sampled_df.columns
sampled_df[label_column] = df[label_column]
X_train, X_test, y_train, y_test = train_test_split(sampled_df[new_features], sampled_df[label_column], test_size=0.33, random_state=42)

train_acc = predict(sampled_df, new_features, label_column, X_train, X_test, y_train, y_test)
train_avg_score = sum(train_acc)/len(train_acc)
print(f'{train_avg_score=}')

              precision    recall  f1-score   support

           1      0.679     0.864     0.760        22
           2      0.708     0.680     0.694        25
           3      0.000     0.000     0.000         4
           5      0.333     0.167     0.222         6
           6      0.750     0.750     0.750         4
           7      0.750     0.900     0.818        10

    accuracy                          0.690        71
   macro avg      0.537     0.560     0.541        71
weighted avg      0.636     0.690     0.656        71

train_avg_score=0.6369458128078818


  _warn_prf(average, modifier, msg_start, len(result))


## PCA Classification

In [6]:
# Norm
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)
sample_data = X_train_norm

#PCA
pca = PCA(n_components=4)
pca.fit(X_train_norm)
X_train_pca = pca.transform(X_train_norm)
X_test_pca = pca.transform(X_test_norm)
y_train_arr = y_train.to_numpy()
y_test_arr = y_test.to_numpy()



In [7]:
def predict_np(df, features, label_column, X_train, X_test, y_train, y_test):
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    clf = RandomForestClassifier(random_state=1)
    multi_target_forest = OneVsRestClassifier(clf, n_jobs=-1)
    train_acc = []
    
    for train_index, test_index in kf.split(X_train, y_train):
        model = multi_target_forest.fit(X_train[train_index], y_train[train_index])
        train_preds = model.predict(X_train[test_index])
        
        train_acc.append(metrics.accuracy_score(y_train[test_index], train_preds))        
    
    model = multi_target_forest.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(metrics.classification_report(y_test, preds, digits=3))
    return train_acc

In [8]:
train_acc = predict_np(sampled_df, new_features, label_column, X_train_pca, X_test_pca, y_train_arr, y_test_arr)
train_avg_score = sum(train_acc)/len(train_acc)
print(f'{train_avg_score=}')

              precision    recall  f1-score   support

           1      0.652     0.682     0.667        22
           2      0.621     0.720     0.667        25
           3      0.500     0.250     0.333         4
           5      0.750     0.500     0.600         6
           6      0.667     0.500     0.571         4
           7      0.900     0.900     0.900        10

    accuracy                          0.676        71
   macro avg      0.682     0.592     0.623        71
weighted avg      0.677     0.676     0.670        71

train_avg_score=0.6573891625615764
