In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
with open('mpid_bgap_info.pkl', 'rb') as f:
    mpid_bgap_dict = pickle.load(f)
    
with open('mpid_fp_info.pkl', 'rb') as f:
    mpid_fp_dict = pickle.load(f)

In [4]:
len(mpid_bgap_dict), len(mpid_fp_dict)
mpid_bgap_dict['mp-1199894'], mpid_fp_dict['mp-1199894']

({'Metality': False,
  'Direct or not': False,
  'Gap value (eV)': 0.9984999999999999},
 array([0.78848698, 0.32446369, 0.31716203, ..., 0.        , 0.        ,
        0.        ]))

In [5]:
mpid_fp_dict['mp-1199894'].shape, mpid_fp_dict['mp-9947'].shape

((108000,), (108000,))

In [6]:
fp_list = []
band_info_list = []
for mat_id in mpid_bgap_dict:
    if mpid_bgap_dict[mat_id]['Metality'] == True:
        band_info_list.append(int(0))
        fp_list.append(mpid_fp_dict[mat_id].tolist())
    else:
        if mpid_bgap_dict[mat_id]['Direct or not'] == False:
            band_info_list.append(int(1))
            fp_list.append(mpid_fp_dict[mat_id].tolist())
        else:
            band_info_list.append(int(2))
            fp_list.append(mpid_fp_dict[mat_id].tolist())
            
            


In [7]:
FP_ARR = np.array(fp_list, dtype=float)
Y_ARR = np.array(band_info_list, dtype=float)

In [8]:
FP_ARR.shape, Y_ARR.shape

((785, 108000), (785,))

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    FP_ARR, Y_ARR, test_size = 0.2, random_state = 42 )
x_min, x_max = FP_ARR[:, 0].min() - 0.5, FP_ARR[:, 0].max() + 0.5
y_min, y_max = FP_ARR[:, 1].min() - 0.5, FP_ARR[:, 1].max() + 0.5

In [10]:
names = [
    "Nearest Neighbors",
    "Polynomial SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Gradient Boosting",
    "Logistic Regression",
]

classifiers = [
    KNeighborsClassifier(5),
    SVC(kernel="poly", gamma=0.001, C=1.0),
    SVC(kernel="rbf", gamma=0.001, C=1.0),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=20),
    RandomForestClassifier(criterion="entropy", max_depth=2, min_samples_leaf=20,
                           n_estimators=10, max_features=1),
    MLPClassifier(hidden_layer_sizes=(100,100,100), activation="tanh", 
                  solver="sgd", alpha=0.0001, 
                  learning_rate="adaptive", max_iter=1000),
    AdaBoostClassifier(),
    GradientBoostingClassifier(loss="log_loss", learning_rate=0.01, 
                               n_estimators=500, max_depth=1),
    LogisticRegression(penalty="l2", dual=False, C=1.0, max_iter=5000),
]


k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

for name, clf in zip(names, classifiers):
    # ds_cnt = 0 
    # ax = plt.subplot(1, len(classifiers) + 1, 1)
    clf = make_pipeline(StandardScaler(), clf)
    for k, (train, test) in enumerate(k_fold.split(FP_ARR, Y_ARR)):
        X_train = FP_ARR[train]
        y_train = Y_ARR[train]
        X_test = FP_ARR[test]
        y_test = Y_ARR[test]

        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        y_pred = clf.predict(X_test)

        prec_score = precision_score(y_test, y_pred, average="micro")
        rec_score = recall_score(y_test, y_pred, average="micro")
        f_score = f1_score(y_test, y_pred, average="micro")
        acc_score = accuracy_score(y_test, y_pred, normalize=True)
        print(
            "[fold {0}] Classifier: {1}, Accuracy: {2:.8f}".format(k, name, acc_score)
        )
        print("-" * 80)
        print(
            "Precision score: {0:.6f}, Recall score: {1:.6f}, f1 score: {2:.6f}".format(
                prec_score, rec_score, f_score)
        )
        print("-" * 80)
    

[fold 0] Classifier: Nearest Neighbors, Accuracy: 0.49367089
--------------------------------------------------------------------------------
Precision score: 0.493671, Recall score: 0.493671, f1 score: 0.493671
--------------------------------------------------------------------------------
[fold 1] Classifier: Nearest Neighbors, Accuracy: 0.51898734
--------------------------------------------------------------------------------
Precision score: 0.518987, Recall score: 0.518987, f1 score: 0.518987
--------------------------------------------------------------------------------
[fold 2] Classifier: Nearest Neighbors, Accuracy: 0.49367089
--------------------------------------------------------------------------------
Precision score: 0.493671, Recall score: 0.493671, f1 score: 0.493671
--------------------------------------------------------------------------------
[fold 3] Classifier: Nearest Neighbors, Accuracy: 0.53164557
------------------------------------------------------------

[fold 9] Classifier: RBF SVM, Accuracy: 0.47435897
--------------------------------------------------------------------------------
Precision score: 0.474359, Recall score: 0.474359, f1 score: 0.474359
--------------------------------------------------------------------------------
[fold 0] Classifier: Gaussian Process, Accuracy: 0.48101266
--------------------------------------------------------------------------------
Precision score: 0.481013, Recall score: 0.481013, f1 score: 0.481013
--------------------------------------------------------------------------------
[fold 1] Classifier: Gaussian Process, Accuracy: 0.67088608
--------------------------------------------------------------------------------
Precision score: 0.670886, Recall score: 0.670886, f1 score: 0.670886
--------------------------------------------------------------------------------
[fold 2] Classifier: Gaussian Process, Accuracy: 0.44303797
-------------------------------------------------------------------------

[fold 8] Classifier: Random Forest, Accuracy: 0.50000000
--------------------------------------------------------------------------------
Precision score: 0.500000, Recall score: 0.500000, f1 score: 0.500000
--------------------------------------------------------------------------------
[fold 9] Classifier: Random Forest, Accuracy: 0.44871795
--------------------------------------------------------------------------------
Precision score: 0.448718, Recall score: 0.448718, f1 score: 0.448718
--------------------------------------------------------------------------------
[fold 0] Classifier: Neural Net, Accuracy: 0.59493671
--------------------------------------------------------------------------------
Precision score: 0.594937, Recall score: 0.594937, f1 score: 0.594937
--------------------------------------------------------------------------------
[fold 1] Classifier: Neural Net, Accuracy: 0.54430380
--------------------------------------------------------------------------------
P

[fold 7] Classifier: Gradient Boosting, Accuracy: 0.56410256
--------------------------------------------------------------------------------
Precision score: 0.564103, Recall score: 0.564103, f1 score: 0.564103
--------------------------------------------------------------------------------
[fold 8] Classifier: Gradient Boosting, Accuracy: 0.58974359
--------------------------------------------------------------------------------
Precision score: 0.589744, Recall score: 0.589744, f1 score: 0.589744
--------------------------------------------------------------------------------
[fold 9] Classifier: Gradient Boosting, Accuracy: 0.48717949
--------------------------------------------------------------------------------
Precision score: 0.487179, Recall score: 0.487179, f1 score: 0.487179
--------------------------------------------------------------------------------
[fold 0] Classifier: Logistic Regression, Accuracy: 0.50632911
----------------------------------------------------------