In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for the 'retina' (high-resolution) figure format.
%config InlineBackend.figure_format='retina'

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the two lookup tables that the rest of the notebook indexes by
# material id: band-gap records and fingerprint vectors.
# NOTE: pickle.load can execute arbitrary code while deserialising, so these
# files must come from a trusted source.
with open('mpid_bgap_info.pkl', mode='rb') as bgap_file:
    mpid_bgap_dict = pickle.load(bgap_file)

with open('mpid_fp_info.pkl', mode='rb') as fp_file:
    mpid_fp_dict = pickle.load(fp_file)

In [4]:
# NOTE(review): a cell only displays its last expression, so the two lengths
# computed on the next line are evaluated and then discarded. Wrap them in
# print()/display() or move them to their own cell if they should be shown.
len(mpid_bgap_dict), len(mpid_fp_dict)
# Spot-check one band-gap record and three fingerprint vectors.
# NOTE(review): the band-gap lookup uses '4H-Si-1' while the fingerprint
# lookups use three other ids -- confirm this mix is intentional.
mpid_bgap_dict['4H-Si-1'], mpid_fp_dict['Si24-1'], mpid_fp_dict['Si46-1'], mpid_fp_dict['Si34-1']

({'Metality': True, 'Direct or not': False, 'Gap value (eV)': -0.03},
 array([0.79606863, 0.32157304, 0.3078777 , ..., 0.        , 0.        ,
        0.        ]),
 array([0.80920501, 0.310002  , 0.30455711, ..., 0.        , 0.        ,
        0.        ]),
 array([0.7698507 , 0.28575593, 0.28575554, ..., 0.        , 0.        ,
        0.        ]))

In [5]:
# Show the fingerprint vector shape for a handful of materials, in one tuple.
tuple(mpid_fp_dict[sample_id].shape for sample_id in ('4H-Si-1', 'Si24-1', 'Si46-1', 'Si34-1'))

((13800,), (13800,), (13800,), (13800,))

In [6]:
# Build two parallel lists, one entry per material in mpid_bgap_dict:
#   band_info_list -- integer class label derived from the band-gap record
#   fp_list        -- that material's fingerprint vector as a plain list
#
# Label encoding (taken from the record's flags):
#   0 : 'Metality' is true
#   1 : 'Metality' is false and 'Direct or not' is false
#   2 : 'Metality' is false and 'Direct or not' is true
fp_list = []
band_info_list = []
for mat_id, band_info in mpid_bgap_dict.items():
    # Both flags are expected to be booleans, so plain truthiness is used
    # instead of comparing against True / False.
    if band_info['Metality']:
        label = 0
    elif not band_info['Direct or not']:
        label = 1
    else:
        label = 2

    band_info_list.append(label)
    # The fingerprint is appended once for every label; a material that has a
    # band-gap record but no fingerprint raises KeyError here.
    fp_list.append(mpid_fp_dict[mat_id].tolist())
            
            


In [7]:
# Convert the collected Python lists into float NumPy arrays: FP_ARR holds one
# fingerprint row per material, Y_ARR the matching class labels.
# NOTE(review): the class labels are stored as floats here; an integer dtype
# would be more conventional for classification targets -- confirm nothing
# downstream relies on the float dtype before changing it.
FP_ARR = np.asarray(fp_list, dtype=float)
Y_ARR = np.asarray(band_info_list, dtype=float)

In [8]:
# Sanity check: the feature matrix and the label vector should have the same
# number of rows.
np.shape(FP_ARR), np.shape(Y_ARR)

((7913, 13800), (7913,))

In [9]:
# Benchmark a panel of scikit-learn classifiers on the fingerprint features
# (FP_ARR) against the class labels (Y_ARR) using stratified 10-fold
# cross-validation, printing accuracy / precision / recall / F1 for every fold.

# One seed for the CV splitter and for every estimator that draws random
# numbers, so a fresh "Restart & Run All" reproduces the same figures.
SEED = 42

# Averaging mode passed to precision_score / recall_score / f1_score.
# NOTE(review): for single-label multiclass targets, "micro" averaging makes
# precision, recall and F1 all equal to accuracy (the printed values are
# identical on every fold). Use "macro" or "weighted" here if per-class
# behaviour is what should be reported.
METRIC_AVERAGE = "micro"

names = [
    "Nearest Neighbors",
    "Polynomial SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Gradient Boosting",
    "Logistic Regression",
]

# Same order as `names`; the two lists are zipped together below.
classifiers = [
    KNeighborsClassifier(5),
    SVC(kernel="poly", gamma=0.001, C=1.0),
    SVC(kernel="rbf", gamma=0.001, C=1.0),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=20,
                           random_state=SEED),
    RandomForestClassifier(criterion="entropy", max_depth=2, min_samples_leaf=20,
                           n_estimators=10, max_features=1, random_state=SEED),
    MLPClassifier(hidden_layer_sizes=(100, 100, 100), activation="tanh",
                  solver="sgd", alpha=0.0001,
                  learning_rate="adaptive", max_iter=1000, random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    GradientBoostingClassifier(loss="log_loss", learning_rate=0.01,
                               n_estimators=500, max_depth=1, random_state=SEED),
    LogisticRegression(penalty="l2", dual=False, C=1.0, max_iter=5000),
]

# zip() silently drops unmatched trailing items, so fail loudly instead.
assert len(names) == len(classifiers), "names and classifiers must have the same length"

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

for name, clf in zip(names, classifiers):
    # The scaler is part of the pipeline, so it is re-fitted on each training
    # fold only and never sees the held-out fold.
    clf = make_pipeline(StandardScaler(), clf)
    for k, (train, test) in enumerate(skf.split(FP_ARR, Y_ARR)):
        X_train = FP_ARR[train]
        y_train = Y_ARR[train]
        X_test = FP_ARR[test]
        y_test = Y_ARR[test]

        clf.fit(X_train, y_train)
        # Predict once and derive every metric from the same predictions; a
        # separate clf.score() call would repeat the whole prediction pass
        # just to recompute the accuracy reported below.
        y_pred = clf.predict(X_test)

        prec_score = precision_score(y_test, y_pred, average=METRIC_AVERAGE)
        rec_score = recall_score(y_test, y_pred, average=METRIC_AVERAGE)
        f_score = f1_score(y_test, y_pred, average=METRIC_AVERAGE)
        acc_score = accuracy_score(y_test, y_pred, normalize=True)
        print(
            "[fold {0}] Classifier: {1}, Accuracy: {2:.8f}".format(k, name, acc_score)
        )
        print("-" * 80)
        print(
            "Precision score: {0:.6f}, Recall score: {1:.6f}, f1 score: {2:.6f}".format(
                prec_score, rec_score, f_score)
        )
        print("-" * 80)
    

[fold 0] Classifier: Nearest Neighbors, Accuracy: 0.96212121
--------------------------------------------------------------------------------
Precision score: 0.962121, Recall score: 0.962121, f1 score: 0.962121
--------------------------------------------------------------------------------
[fold 1] Classifier: Nearest Neighbors, Accuracy: 0.96590909
--------------------------------------------------------------------------------
Precision score: 0.965909, Recall score: 0.965909, f1 score: 0.965909
--------------------------------------------------------------------------------
[fold 2] Classifier: Nearest Neighbors, Accuracy: 0.96464646
--------------------------------------------------------------------------------
Precision score: 0.964646, Recall score: 0.964646, f1 score: 0.964646
--------------------------------------------------------------------------------
[fold 3] Classifier: Nearest Neighbors, Accuracy: 0.95069532
------------------------------------------------------------

[fold 9] Classifier: RBF SVM, Accuracy: 0.96207332
--------------------------------------------------------------------------------
Precision score: 0.962073, Recall score: 0.962073, f1 score: 0.962073
--------------------------------------------------------------------------------
[fold 0] Classifier: Gaussian Process, Accuracy: 0.96843434
--------------------------------------------------------------------------------
Precision score: 0.968434, Recall score: 0.968434, f1 score: 0.968434
--------------------------------------------------------------------------------
[fold 1] Classifier: Gaussian Process, Accuracy: 0.96590909
--------------------------------------------------------------------------------
Precision score: 0.965909, Recall score: 0.965909, f1 score: 0.965909
--------------------------------------------------------------------------------
[fold 2] Classifier: Gaussian Process, Accuracy: 0.97348485
-------------------------------------------------------------------------

[fold 8] Classifier: Random Forest, Accuracy: 0.96460177
--------------------------------------------------------------------------------
Precision score: 0.964602, Recall score: 0.964602, f1 score: 0.964602
--------------------------------------------------------------------------------
[fold 9] Classifier: Random Forest, Accuracy: 0.94816688
--------------------------------------------------------------------------------
Precision score: 0.948167, Recall score: 0.948167, f1 score: 0.948167
--------------------------------------------------------------------------------
[fold 0] Classifier: Neural Net, Accuracy: 0.97348485
--------------------------------------------------------------------------------
Precision score: 0.973485, Recall score: 0.973485, f1 score: 0.973485
--------------------------------------------------------------------------------
[fold 1] Classifier: Neural Net, Accuracy: 0.96464646
--------------------------------------------------------------------------------
P

[fold 7] Classifier: Gradient Boosting, Accuracy: 0.96207332
--------------------------------------------------------------------------------
Precision score: 0.962073, Recall score: 0.962073, f1 score: 0.962073
--------------------------------------------------------------------------------
[fold 8] Classifier: Gradient Boosting, Accuracy: 0.97092288
--------------------------------------------------------------------------------
Precision score: 0.970923, Recall score: 0.970923, f1 score: 0.970923
--------------------------------------------------------------------------------
[fold 9] Classifier: Gradient Boosting, Accuracy: 0.96713021
--------------------------------------------------------------------------------
Precision score: 0.967130, Recall score: 0.967130, f1 score: 0.967130
--------------------------------------------------------------------------------
[fold 0] Classifier: Logistic Regression, Accuracy: 0.96717172
----------------------------------------------------------