In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
with open('mpid_bgap_info.pkl', 'rb') as f:
    mpid_bgap_dict = pickle.load(f)
    
with open('mpid_fp_info.pkl', 'rb') as f:
    mpid_fp_dict = pickle.load(f)

In [4]:
len(mpid_bgap_dict), len(mpid_fp_dict)
mpid_bgap_dict['mp-5827'], mpid_fp_dict['mp-2929']

({'Metality': False,
  'Direct or not': False,
  'Gap value (eV)': 1.8284999999999996},
 array([0.77061369, 0.2127183 , 0.2127183 , ..., 0.        , 0.        ,
        0.        ]))

In [5]:
mpid_fp_dict['mp-5827'].shape, mpid_fp_dict['mp-2929'].shape

((6000,), (6000,))

In [6]:
fp_list = []
band_info_list = []
for mat_id in mpid_bgap_dict:
    if mpid_bgap_dict[mat_id]['Metality'] == True:
        band_info_list.append(int(0))
        fp_list.append(mpid_fp_dict[mat_id].tolist())
    else:
        if mpid_bgap_dict[mat_id]['Direct or not'] == False:
            band_info_list.append(int(1))
            fp_list.append(mpid_fp_dict[mat_id].tolist())
        else:
            band_info_list.append(int(2))
            fp_list.append(mpid_fp_dict[mat_id].tolist())
            
            


In [7]:
FP_ARR = np.array(fp_list, dtype=float)
Y_ARR = np.array(band_info_list, dtype=float)

In [8]:
FP_ARR.shape, Y_ARR.shape

((968, 6000), (968,))

In [9]:
names = [
    "Nearest Neighbors",
    "Logistic Regression",
    "RBF SVM",
    "Polynomial SVM",
    "Neural Net",
    "Gaussian Process",
    "Gaussian Naive Bayes",
    "Decision Tree",
    "Random Forest",
    "AdaBoost",
    "Gradient Boosting"
]

classifiers = [
    KNeighborsClassifier(n_neighbors=20, weights="distance", algorithm="auto"),
    LogisticRegression(penalty="l2", solver="newton-cg", class_weight="balanced", 
                       dual=False, C=1.0, max_iter=5000),
    SVC(kernel="rbf", gamma="scale", C=1.0, class_weight="balanced", probability=True),
    SVC(kernel="poly", gamma="scale", C=1.0, class_weight="balanced", probability=True),
    MLPClassifier(hidden_layer_sizes=(1000,1000,1000), activation="relu", 
                  solver="adam", alpha=0.0001, 
                  learning_rate="adaptive", max_iter=2000),
    GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0, 
                                               length_scale_bounds=(1.0E-5, 1.0E8))),
    GaussianNB(),
    DecisionTreeClassifier(criterion="gini", max_depth=2, min_samples_leaf=20, 
                           class_weight="balanced"),
    RandomForestClassifier(criterion="gini", max_depth=70, min_samples_leaf=1, 
                           min_samples_split=2, class_weight="balanced", 
                           n_estimators=1000, max_features=1, bootstrap=False),
    AdaBoostClassifier(n_estimators=500, learning_rate=0.01, algorithm='SAMME'),
    GradientBoostingClassifier(loss="log_loss", learning_rate=0.01, 
                               n_estimators=500, max_depth=1)
]


# k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for name, clf in zip(names, classifiers):
    # ds_cnt = 0 
    # ax = plt.subplot(1, len(classifiers) + 1, 1)
    clf = make_pipeline(StandardScaler(), clf)
    print("-" * 120)
    print("Classifier: {0}".format(name))
    for k, (train, test) in enumerate(skf.split(FP_ARR, Y_ARR)):
        X_train = FP_ARR[train]
        y_train = Y_ARR[train]
        X_test = FP_ARR[test]
        y_test = Y_ARR[test]

        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y_pred = clf.predict(X_test)
        y_score = clf.predict_proba(X_test)

        # acc_score = accuracy_score(y_test, y_pred, normalize=True)
        # prec_score = precision_score(y_test, y_pred, average="weighted")
        rec_score = recall_score(y_test, y_pred, average="weighted")
        f_score = f1_score(y_test, y_pred, average="weighted")
        ovr_ROC = roc_auc_score(y_test, y_score, average="weighted", multi_class="ovr")
        ovo_ROC = roc_auc_score(y_test, y_score, average="weighted", multi_class="ovo")
        
        '''
        print(
            "[fold {0}] Classifier: {1}, Accuracy: {2:.8f}".format(k, name, acc_score)
        )
        print("-" * 80)
        print(
            "Precision score: {0:.6f}, Recall score: {1:.6f}, f1 score: {2:.6f}".format(
                prec_score, rec_score, f_score)
        )
        '''
        print(
            "[fold {0}] Recall score: {1:.8f}, F1 score: {2:.8f}, One-Over-Rest ROC: {3:.8f}, One-Over-One ROC: {4:.8f}".format(
                k, rec_score, f_score, ovr_ROC, ovo_ROC)
        )
        # print("-" * 80)
    

------------------------------------------------------------------------------------------------------------------------
Classifier: Nearest Neighbors
[fold 0] Recall score: 0.82474227, F1 score: 0.75699558, One-Over-Rest ROC: 0.61846047, One-Over-One ROC: 0.73061487
[fold 1] Recall score: 0.83505155, F1 score: 0.76522962, One-Over-Rest ROC: 0.62697985, One-Over-One ROC: 0.73199098
[fold 2] Recall score: 0.83505155, F1 score: 0.78240905, One-Over-Rest ROC: 0.61122714, One-Over-One ROC: 0.64927359
[fold 3] Recall score: 0.82474227, F1 score: 0.75456063, One-Over-Rest ROC: 0.60531829, One-Over-One ROC: 0.66141862
[fold 4] Recall score: 0.81443299, F1 score: 0.74667615, One-Over-Rest ROC: 0.89859694, One-Over-One ROC: 0.87379290
[fold 5] Recall score: 0.81443299, F1 score: 0.74822254, One-Over-Rest ROC: 0.62979487, One-Over-One ROC: 0.62627468
[fold 6] Recall score: 0.81443299, F1 score: 0.76267646, One-Over-Rest ROC: 0.66325896, One-Over-One ROC: 0.72716183
[fold 7] Recall score: 0.80412

[fold 4] Recall score: 0.10309278, F1 score: 0.11247064, One-Over-Rest ROC: 0.52733381, One-Over-One ROC: 0.53495694
[fold 5] Recall score: 0.07216495, F1 score: 0.06063533, One-Over-Rest ROC: 0.51483353, One-Over-One ROC: 0.51999753
[fold 6] Recall score: 0.06185567, F1 score: 0.04293710, One-Over-Rest ROC: 0.51075269, One-Over-One ROC: 0.51609893
[fold 7] Recall score: 0.05154639, F1 score: 0.04120312, One-Over-Rest ROC: 0.46494263, One-Over-One ROC: 0.49516290
[fold 8] Recall score: 0.03125000, F1 score: 0.00191327, One-Over-Rest ROC: 0.49927878, One-Over-One ROC: 0.49981870
[fold 9] Recall score: 0.05208333, F1 score: 0.04263254, One-Over-Rest ROC: 0.50858904, One-Over-One ROC: 0.50792787
------------------------------------------------------------------------------------------------------------------------
Classifier: Decision Tree
[fold 0] Recall score: 0.10309278, F1 score: 0.03664935, One-Over-Rest ROC: 0.52735899, One-Over-One ROC: 0.59334499
[fold 1] Recall score: 0.11340206,