In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for the 'retina' figure format.
%config InlineBackend.figure_format='retina'

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code while
    # deserialising; only point this at files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Per-material band-gap records and fingerprint vectors; both are indexed by
# material id in the cells that follow.
mpid_bgap_dict = _read_pickle('mpid_bgap_info.pkl')
mpid_fp_dict = _read_pickle('mpid_fp_info.pkl')

In [4]:
# A notebook cell only renders its *last* expression, so the band-gap tuple
# used to be built and silently thrown away. Show it explicitly with
# display() (provided by the IPython kernel) and keep the fingerprint tuple
# as the cell's final expression.
display((mpid_bgap_dict['4H-Si-1'], mpid_bgap_dict['Si24-1'],
         mpid_bgap_dict['Si46-1'], mpid_bgap_dict['Si34-1']))
mpid_fp_dict['4H-Si-1'], mpid_fp_dict['Si24-1'], mpid_fp_dict['Si46-1'], mpid_fp_dict['Si34-1']

(array([0.80884361, 0.29976806, 0.29843747, ..., 0.        , 0.        ,
        0.        ]),
 array([0.79606863, 0.32157304, 0.3078777 , ..., 0.        , 0.        ,
        0.        ]),
 array([0.80920501, 0.310002  , 0.30455711, ..., 0.        , 0.        ,
        0.        ]),
 array([0.7698507 , 0.28575593, 0.28575554, ..., 0.        , 0.        ,
        0.        ]))

In [5]:
# Shapes of the four sample fingerprints, in the same order as the ids listed.
tuple(
    mpid_fp_dict[sample_id].shape
    for sample_id in ('4H-Si-1', 'Si24-1', 'Si46-1', 'Si34-1')
)

((13800,), (13800,), (13800,), (13800,))

In [6]:
# Build two parallel lists, one entry per material, in the iteration order of
# mpid_bgap_dict: the fingerprint vector and an integer class label.
#   0 -> record['Metality'] == True
#   1 -> not flagged as metal and record['Direct or not'] == False
#   2 -> everything else
# (presumably metal / indirect gap / direct gap -- confirm against the code
# that produced the pickle files).
fp_list = []
band_info_list = []
for mat_id, gap_info in mpid_bgap_dict.items():
    # The explicit == True / == False comparisons are kept on purpose: a
    # non-boolean flag such as None matches neither and keeps landing in
    # class 2, exactly as before.
    if gap_info['Metality'] == True:
        label = 0
    elif gap_info['Direct or not'] == False:
        label = 1
    else:
        label = 2
    band_info_list.append(label)
    # Append the ndarray itself. The previous .tolist() boxed every value as
    # a Python float only for np.array(..., dtype=float) to unbox it again in
    # the next cell; stacking the arrays directly yields the same matrix with
    # far less memory and time.
    fp_list.append(mpid_fp_dict[mat_id])
            
            


In [7]:
# Stack the per-material lists into float64 arrays: FP_ARR has one row per
# material, Y_ARR holds the matching class label (stored as a float).
FP_ARR = np.asarray(fp_list, dtype=np.float64)
Y_ARR = np.asarray(band_info_list, dtype=np.float64)

In [8]:
# Sanity check: feature matrix and label vector must agree on the row count.
tuple(arr.shape for arr in (FP_ARR, Y_ARR))

((7913, 13800), (7913,))

In [9]:
# Benchmark a set of scikit-learn classifiers on the fingerprint matrix with
# stratified 10-fold cross-validation and print per-fold weighted recall,
# weighted F1 and weighted one-vs-rest / one-vs-one ROC AUC.

# One seed for the CV splitter and every stochastic estimator, so that
# Restart Kernel -> Run All reproduces the printed scores.
SEED = 42

names = [
    "Nearest Neighbors",
    "Logistic Regression",
    "RBF SVM",
    "Polynomial SVM",
    "Neural Net",
    "Gaussian Process",
    "Gaussian Naive Bayes",
    "Decision Tree",
    "Random Forest",
    "AdaBoost",
    "Gradient Boosting"
]

# Same order as `names`. Estimators with internal randomness receive
# random_state=SEED; the remaining ones do not need a seed here.
classifiers = [
    KNeighborsClassifier(n_neighbors=20, weights="distance", algorithm="auto"),
    LogisticRegression(penalty="l2", solver="newton-cg", class_weight="balanced",
                       dual=False, C=1.0, max_iter=5000),
    # probability=True is required for predict_proba (used for ROC AUC below).
    SVC(kernel="rbf", gamma="scale", C=1.0, class_weight="balanced", probability=True,
        random_state=SEED),
    SVC(kernel="poly", gamma="scale", C=1.0, class_weight="balanced", probability=True,
        random_state=SEED),
    MLPClassifier(hidden_layer_sizes=(1000,1000,1000), activation="relu",
                  solver="adam", alpha=0.0001,
                  learning_rate="adaptive", max_iter=2000,
                  random_state=SEED),
    GaussianProcessClassifier(kernel=1.0 * RBF(1.0), random_state=SEED),
    GaussianNB(),
    DecisionTreeClassifier(criterion="gini", max_depth=2, min_samples_leaf=20,
                           class_weight="balanced", random_state=SEED),
    RandomForestClassifier(criterion="gini", max_depth=70, min_samples_leaf=1,
                           min_samples_split=2, class_weight="balanced",
                           n_estimators=1000, max_features=1, bootstrap=False,
                           random_state=SEED),
    AdaBoostClassifier(n_estimators=500, learning_rate=0.01, algorithm='SAMME',
                       random_state=SEED),
    GradientBoostingClassifier(loss="log_loss", learning_rate=0.01,
                               n_estimators=500, max_depth=1,
                               random_state=SEED)
]

# zip() would silently drop trailing entries if the two lists drifted apart.
assert len(names) == len(classifiers), "names and classifiers must have equal length"

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

for name, clf in zip(names, classifiers):
    # The scaler lives inside the pipeline, so it is re-fitted on the training
    # part of every fold and never sees the held-out samples.
    clf = make_pipeline(StandardScaler(), clf)
    print("-" * 120)
    print("Classifier: {0}".format(name))
    for k, (train, test) in enumerate(skf.split(FP_ARR, Y_ARR)):
        X_train, y_train = FP_ARR[train], Y_ARR[train]
        X_test, y_test = FP_ARR[test], Y_ARR[test]

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_score = clf.predict_proba(X_test)

        # Same value clf.score(X_test, y_test) returned, but computed from the
        # predictions already at hand instead of a second prediction pass.
        score = accuracy_score(y_test, y_pred)
        rec_score = recall_score(y_test, y_pred, average="weighted")
        f_score = f1_score(y_test, y_pred, average="weighted")
        ovr_ROC = roc_auc_score(y_test, y_score, average="weighted", multi_class="ovr")
        ovo_ROC = roc_auc_score(y_test, y_score, average="weighted", multi_class="ovo")

        print(
            "[fold {0}] Recall score: {1:.8f}, F1 score: {2:.8f}, One-Over-Rest ROC: {3:.8f}, One-Over-One ROC: {4:.8f}".format(
                k, rec_score, f_score, ovr_ROC, ovo_ROC)
        )
    

------------------------------------------------------------------------------------------------------------------------
Classifier: Nearest Neighbors
[fold 0] Recall score: 0.96717172, F1 score: 0.96612196, One-Over-Rest ROC: 0.99398349, One-Over-One ROC: 0.98796760
[fold 1] Recall score: 0.96464646, F1 score: 0.96231918, One-Over-Rest ROC: 0.98802323, One-Over-One ROC: 0.87431593
[fold 2] Recall score: 0.96717172, F1 score: 0.96640963, One-Over-Rest ROC: 0.99242890, One-Over-One ROC: 0.87725696
[fold 3] Recall score: 0.95701643, F1 score: 0.95638325, One-Over-Rest ROC: 0.97774731, One-Over-One ROC: 0.81693717
[fold 4] Recall score: 0.96713021, F1 score: 0.96533465, One-Over-Rest ROC: 0.99143741, One-Over-One ROC: 0.87103265
[fold 5] Recall score: 0.97345133, F1 score: 0.97162323, One-Over-Rest ROC: 0.99037931, One-Over-One ROC: 0.82279545
[fold 6] Recall score: 0.96460177, F1 score: 0.96274174, One-Over-Rest ROC: 0.99214211, One-Over-One ROC: 0.98919600
[fold 7] Recall score: 0.96333

[fold 2] Recall score: 0.88636364, F1 score: 0.91564390, One-Over-Rest ROC: 0.93318944, One-Over-One ROC: 0.89289591
[fold 3] Recall score: 0.89001264, F1 score: 0.91823716, One-Over-Rest ROC: 0.93131211, One-Over-One ROC: 0.82728069
[fold 4] Recall score: 0.89759798, F1 score: 0.92229370, One-Over-Rest ROC: 0.93315113, One-Over-One ROC: 0.77789263
[fold 5] Recall score: 0.90771176, F1 score: 0.93695563, One-Over-Rest ROC: 0.94605248, One-Over-One ROC: 0.85316502
[fold 6] Recall score: 0.88748420, F1 score: 0.92248840, One-Over-Rest ROC: 0.93588718, One-Over-One ROC: 0.92405765
[fold 7] Recall score: 0.92920354, F1 score: 0.93764671, One-Over-Rest ROC: 0.95029851, One-Over-One ROC: 0.85904227
[fold 8] Recall score: 0.90897598, F1 score: 0.93289319, One-Over-Rest ROC: 0.94229027, One-Over-One ROC: 0.82254411
[fold 9] Recall score: 0.91403287, F1 score: 0.92973741, One-Over-Rest ROC: 0.95414189, One-Over-One ROC: 0.82530167
----------------------------------------------------------------