In [1]:
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
# Ask the inline backend for high-resolution ("retina") figure output.
%config InlineBackend.figure_format='retina'

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Band-gap records, keyed by material id.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('mpid_bgap_info.pkl', mode='rb') as bgap_file:
    mpid_bgap_dict = pickle.load(bgap_file)

# Fingerprint vectors, keyed by material id.
with open('mpid_fp_info.pkl', mode='rb') as fp_file:
    mpid_fp_dict = pickle.load(fp_file)

In [4]:
# A notebook cell only echoes its final expression, so the entry counts are
# printed explicitly instead of being silently discarded.
print(len(mpid_bgap_dict), len(mpid_fp_dict))
# Peek at one band-gap record and one fingerprint vector.
mpid_bgap_dict['mp-4748'], mpid_fp_dict['mp-3378']

({'Metality': False,
  'Direct or not': False,
  'Gap value (eV)': 1.8237999999999999},
 array([0.7279952 , 0.26307087, 0.25634488, ..., 0.        , 0.        ,
        0.        ]))

In [5]:
# Shapes of the fingerprint vectors of two sample materials.
tuple(mpid_fp_dict[sample_id].shape for sample_id in ('mp-4748', 'mp-3378'))

((72000,), (72000,))

In [6]:
# Build the per-material fingerprint list and the matching class labels:
#   0 -> metal
#   1 -> non-metal whose 'Direct or not' flag is false
#   2 -> non-metal whose 'Direct or not' flag is true
fp_list = []
band_info_list = []
for mat_id, band_info in mpid_bgap_dict.items():
    if band_info['Metality']:
        label = 0
    elif band_info['Direct or not']:
        label = 2
    else:
        label = 1
    band_info_list.append(label)
    # Keep the fingerprint as an ndarray: converting every vector to a Python
    # list with .tolist() boxes each float and multiplies memory use, while
    # np.array() builds the same 2-D matrix from a list of equal-length arrays.
    fp_list.append(mpid_fp_dict[mat_id])
            
            


In [7]:
# Feature matrix: one row per material.
FP_ARR = np.asarray(fp_list, dtype=np.float64)
# Class labels, stored as floating-point values.
Y_ARR = np.asarray(band_info_list, dtype=np.float64)

In [8]:
# Sanity check: the feature matrix and the label vector must have the same
# number of rows.
tuple(arr.shape for arr in (FP_ARR, Y_ARR))

((2488, 72000), (2488,))

In [9]:
# Seed shared by every estimator that draws random numbers, so that re-running
# the notebook reproduces the same models (the CV folds are seeded separately).
RANDOM_STATE = 42

# Each display name sits next to its estimator so the two cannot drift out of
# order; `names` and `classifiers` are derived from this single list.
named_classifiers = [
    ("Nearest Neighbors",
     KNeighborsClassifier(n_neighbors=20, weights="distance", algorithm="auto")),
    ("Logistic Regression",
     LogisticRegression(penalty="l2", solver="newton-cg", class_weight="balanced",
                        dual=False, C=1.0, max_iter=5000)),
    # probability=True is required for predict_proba; its internal
    # cross-validation is randomised, hence the seed.
    ("RBF SVM",
     SVC(kernel="rbf", gamma="scale", C=1.0, class_weight="balanced",
         probability=True, random_state=RANDOM_STATE)),
    ("Polynomial SVM",
     SVC(kernel="poly", gamma="scale", C=1.0, class_weight="balanced",
         probability=True, random_state=RANDOM_STATE)),
    ("Neural Net",
     MLPClassifier(hidden_layer_sizes=(1000, 1000, 1000), activation="relu",
                   solver="adam", alpha=0.0001,
                   learning_rate="adaptive", max_iter=2000,
                   random_state=RANDOM_STATE)),
    ("Gaussian Process",
     GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0,
                                                length_scale_bounds=(1.0E-5, 1.0E8)))),
    ("Gaussian Naive Bayes",
     GaussianNB()),
    ("Decision Tree",
     DecisionTreeClassifier(criterion="gini", max_depth=2, min_samples_leaf=20,
                            class_weight="balanced", random_state=RANDOM_STATE)),
    # NOTE(review): max_features=1 (an int) means a single feature is considered
    # per split; confirm this was intended rather than the fraction 1.0.
    ("Random Forest",
     RandomForestClassifier(criterion="gini", max_depth=70, min_samples_leaf=1,
                            min_samples_split=2, class_weight="balanced",
                            n_estimators=1000, max_features=1, bootstrap=False,
                            random_state=RANDOM_STATE)),
    ("AdaBoost",
     AdaBoostClassifier(n_estimators=500, learning_rate=0.01, algorithm='SAMME',
                        random_state=RANDOM_STATE)),
    ("Gradient Boosting",
     GradientBoostingClassifier(loss="log_loss", learning_rate=0.01,
                                n_estimators=500, max_depth=1,
                                random_state=RANDOM_STATE)),
]

names = [name for name, _ in named_classifiers]
classifiers = [estimator for _, estimator in named_classifiers]


# 10-fold stratified cross-validation with a fixed shuffle seed, so every
# classifier is evaluated on exactly the same splits.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for name, clf in zip(names, classifiers):
    # Scaling lives inside the pipeline so the scaler is fitted on the
    # training fold only and then applied to the held-out fold.
    clf = make_pipeline(StandardScaler(), clf)
    print("-" * 120)
    print("Classifier: {0}".format(name))
    for k, (train, test) in enumerate(skf.split(FP_ARR, Y_ARR)):
        X_train, y_train = FP_ARR[train], Y_ARR[train]
        X_test, y_test = FP_ARR[test], Y_ARR[test]

        # fit() retrains the whole pipeline from scratch on this fold.
        clf.fit(X_train, y_train)
        # Predict once; the metrics below reuse these outputs instead of
        # paying for an extra prediction pass through clf.score().
        y_pred = clf.predict(X_test)
        y_score = clf.predict_proba(X_test)

        # Support-weighted averages over the classes.
        rec_score = recall_score(y_test, y_pred, average="weighted")
        f_score = f1_score(y_test, y_pred, average="weighted")
        # Multiclass ROC AUC from the class probabilities, in both the
        # one-vs-rest ("ovr") and one-vs-one ("ovo") formulations.
        ovr_ROC = roc_auc_score(y_test, y_score, average="weighted", multi_class="ovr")
        ovo_ROC = roc_auc_score(y_test, y_score, average="weighted", multi_class="ovo")

        print(
            "[fold {0}] Recall score: {1:.8f}, F1 score: {2:.8f}, One-Over-Rest ROC: {3:.8f}, One-Over-One ROC: {4:.8f}".format(
                k, rec_score, f_score, ovr_ROC, ovo_ROC)
        )
    

------------------------------------------------------------------------------------------------------------------------
Classifier: Nearest Neighbors
[fold 0] Recall score: 0.63453815, F1 score: 0.62455319, One-Over-Rest ROC: 0.79503161, One-Over-One ROC: 0.79635318
[fold 1] Recall score: 0.65060241, F1 score: 0.64821958, One-Over-Rest ROC: 0.80300732, One-Over-One ROC: 0.79555843
[fold 2] Recall score: 0.69879518, F1 score: 0.68357424, One-Over-Rest ROC: 0.83076154, One-Over-One ROC: 0.80858889
[fold 3] Recall score: 0.68674699, F1 score: 0.68022198, One-Over-Rest ROC: 0.82079616, One-Over-One ROC: 0.81396637
[fold 4] Recall score: 0.63855422, F1 score: 0.63430254, One-Over-Rest ROC: 0.81354205, One-Over-One ROC: 0.80667520
[fold 5] Recall score: 0.69477912, F1 score: 0.68112429, One-Over-Rest ROC: 0.81805262, One-Over-One ROC: 0.79059609
[fold 6] Recall score: 0.64257028, F1 score: 0.63563302, One-Over-Rest ROC: 0.81418917, One-Over-One ROC: 0.79930955
[fold 7] Recall score: 0.65863

[fold 2] Recall score: 0.49799197, F1 score: 0.35977981, One-Over-Rest ROC: 0.52538981, One-Over-One ROC: 0.52168825
[fold 3] Recall score: 0.48995984, F1 score: 0.33386609, One-Over-Rest ROC: 0.51134318, One-Over-One ROC: 0.50762760
[fold 4] Recall score: 0.49397590, F1 score: 0.36289245, One-Over-Rest ROC: 0.52220510, One-Over-One ROC: 0.51962183
[fold 5] Recall score: 0.48995984, F1 score: 0.34588980, One-Over-Rest ROC: 0.51210389, One-Over-One ROC: 0.51420266
[fold 6] Recall score: 0.50200803, F1 score: 0.37562056, One-Over-Rest ROC: 0.52381522, One-Over-One ROC: 0.52265785
[fold 7] Recall score: 0.48192771, F1 score: 0.33172723, One-Over-Rest ROC: 0.50965881, One-Over-One ROC: 0.50482134
[fold 8] Recall score: 0.36693548, F1 score: 0.21070112, One-Over-Rest ROC: 0.50211301, One-Over-One ROC: 0.50079954
[fold 9] Recall score: 0.49193548, F1 score: 0.34879905, One-Over-Rest ROC: 0.52301617, One-Over-One ROC: 0.52046243
----------------------------------------------------------------