In [43]:
from pymatgen.core.composition import *
import numpy as np
import pandas as pd
import utils
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import pickle


# Reading Features - Atomic properties for elements in the periodic table.
### These properties were calculated using Density Functional Theory. For more informations see :

    Exploring Two-Dimensional Materials Thermodynamic Stability via Machine Learning
    Gabriel R. Schleder*, Carlos Mera Acosta, and Adalberto Fazzio
    ACS Appl. Mater. Interfaces 2020, 12, 18, 20149–20157.

In [44]:
df_atoms = pd.read_csv('Schleder2019_AtomicTable.csv')
#df_atoms


# Reading Database - This is a database of two-dimensional materials (C2DB)
### The majority of materials in this database are theoretical predictions. For more informations see :



    1) The Computational 2D Materials Database: High-Throughput Modeling and Discovery of Atomically Thin Crystals. 2D Materials 5, 042002 (2018).
    

    2) Recent Progress of the Computational 2D Materials Database (C2DB). 2D Materials 8, 044002 (2021).

In [45]:
# All possible properties included in the database

keys=['folder', 'uid', 'plasmafrequency_x', 'plasmafrequency_y', 'asr_id', 'cell_area', 
      'has_inversion_symmetry', 'stoichiometry', 'spacegroup', 'spgnum', 'pointgroup', 
      'crystal_type', 'dos_at_ef_nosoc', 'dos_at_ef_soc', 'alphax_el', 'alphay_el', 'alphaz_el', 
      'ehull', 'hform', 'thermodynamic_stability_level', 'spin_axis', 'E_x', 'E_y', 'E_z', 'dE_zx', 
      'dE_zy', 'dim_primary', 'dim_primary_score', 'dim_nclusters_0D', 'dim_nclusters_1D', 'dim_nclusters_2D', 
      'dim_nclusters_3D', 'dim_threshold_0D', 'dim_threshold_1D', 'dim_threshold_2D', 'dim_score_0D', 
      'dim_score_1D', 'dim_score_2D', 'dim_score_3D', 'dim_score_01D', 'dim_score_02D', 'dim_score_03D', 
      'dim_score_12D', 'dim_score_13D', 'dim_score_23D', 'dim_score_012D', 'dim_score_013D', 'dim_score_023D', 
      'dim_score_123D', 'dim_score_0123D', 'first_class_material', 'minhessianeig', 'dynamic_stability_phonons', 
      'speed_of_sound_x', 'speed_of_sound_y', 'dynamic_stability_stiffness', 'c_11', 'c_12', 'c_13', 
      'c_21', 'c_22', 'c_23', 'c_31', 'c_32', 'c_33', 'magstate', 'is_magnetic', 'nspins', 'evac', 
      'evacdiff', 'dipz', 'efermi', 'gap', 'vbm', 'cbm', 'gap_dir', 'gap_dir_nosoc', 'gap_nosoc', 
      'workfunction', 'emass_vb_dir1', 'emass_vb_dir2', 'emass_cb_dir1', 'emass_cb_dir2', 
      'has_asr_hse_calculate', 'has_asr_plasmafrequency', 'has_asr_phonons_calculate', 
      'has_asr_database_material_fingerprint', 'has_asr_structureinfo', 'has_asr_pdos', 
      'has_asr_setup_strains', 'has_asr_polarizability', 'has_asr_convex_hull', 'has_asr_magnetic_anisotropy', 
      'has_asr_dimensionality', 'has_asr_setinfo', 'has_asr_phonons', 'has_asr_bader', 
      'has_asr_stiffness', 'has_asr_magstate', 'has_asr_gs_calculate', 'has_asr_gs', 
      'has_asr_bandstructure_calculate', 'has_asr_bandstructure', 'has_asr_projected_bandstructure', 
      'has_asr_pdos_calculate', 'has_asr_emasses_refine', 'has_asr_emasses', 'has_asr_emasses_validate']

In [46]:
def evaluate_models(X_train, y_train, X_test, y_test):
    # Initialize classifiers
    classifiers = {
        'Logistic Regression': LogisticRegression(),
        'Decision Trees': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'Gradient Boosting': GradientBoostingClassifier(),
        'Support Vector Machines': SVC()
    }
    
    # Parameters for Grid Search
    params = {
        'Logistic Regression': {'C': [0.1, 1, 10]},
        'Decision Trees': {'criterion': ['gini', 'entropy']},
        'Random Forest': {'n_estimators': [10, 50]},
        'Gradient Boosting': {'n_estimators': [50, 100]},
        'Support Vector Machines': {'C': [0.1, 1, 10]}
    }
    
    # Store results
    results = {}
    
    for name, classifier in classifiers.items():
        print(f"Evaluating {name}...")
        
        # Cross Validation
        cv_score = cross_val_score(classifier, X_train, y_train, cv=5).mean()
        
        # Grid Search for Parameter Tuning
        grid_search = GridSearchCV(classifier, params[name], cv=5)
        grid_search.fit(X_train, y_train)
        
        # Bagging
        bagging = BaggingClassifier(classifier)
        bagging.fit(X_train, y_train)
        
        # Test the best estimator from Grid Search
        best_classifier = grid_search.best_estimator_
        y_pred = best_classifier.predict(X_test)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        
        # Store results
        results[name] = {
            'CV Score': cv_score,
            'Best Parameters': grid_search.best_params_,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        }
        
        # Print results
        print(f"CV Score: {cv_score}")
        print(f"Best Parameters: {grid_search.best_params_}")
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 Score: {f1}\n")
        
    return results

# Create the dataframes

In [47]:
# Create DataFrames for different stoichiometries
df_materials_AB1 = utils.create_df_stoichiometry('AB1')
df_materials_AB2 = utils.create_df_stoichiometry('AB2')
df_materials_AB3 = utils.create_df_stoichiometry('AB3')
df_materials_ABC = utils.create_df_stoichiometry('ABC')

In [48]:
df_materials_AB1.head(5)

Unnamed: 0,Material,Space_Group,ehull,Atom1,Atom2
0,CuBr,P-6m2,0.06486,Cu,Br
1,GaAs,P3m1,0.412799,Ga,As
2,BSb,P-6m2,0.806255,B,Sb
3,GaC,Amm2,1.452133,Ga,C
4,CP,Pm,0.984882,C,P


In [49]:
df_materials_AB2.head(5)

Unnamed: 0,Material,Space_Group,ehull,Atom1,Atom2
0,AgBr2,P-4m2,0.102879,Ag,Br
1,AgLi2,P-6m2,0.230152,Ag,Li
2,AlI2,P-6m2,0.399248,Al,I
3,AsCl2,P-6m2,0.55428,As,Cl
4,AsS2,P-3m1,0.154443,As,S


In [50]:
df_materials_AB3.head(5)

Unnamed: 0,Material,Space_Group,ehull,Atom1,Atom2


In [51]:
df_materials_ABC
df_materials_ABC.columns

df_materials_ABC.columns

Index(['Material', 'Space_Group', 'ehull', 'Atom1', 'Atom2', 'Atom3'], dtype='object')

## Calculating the statistical features for each DF

In [52]:
df_list = [df_materials_AB1, df_materials_AB2, df_materials_AB3, df_materials_ABC]
features_list = []

for df in df_list:

    if('Atom3' in df.columns):
        atoms = [ 'Atom1', 'Atom2', 'Atom3']
    else:
        atoms = ['Atom1', 'Atom2']

    features = utils.calculate_statistical_features(df, df_atoms, atoms)
    features_list.append(features)

In [53]:
features_list

[   Material Space_Group     ehull Atom1 Atom2  media_Z  max_Z  min_Z  \
 0      CuBr       P-6m2  0.064860    Cu    Br     32.0     35     29   
 1      GaAs        P3m1  0.412799    Ga    As     32.0     33     31   
 2       BSb       P-6m2  0.806255     B    Sb     28.0     51      5   
 3       GaC        Amm2  1.452133    Ga     C     18.5     31      6   
 4        CP          Pm  0.984882     C     P     10.5     15      6   
 5      GeTe        P3m1  0.087270    Ge    Te     42.0     52     32   
 6      HgSe        P3m1  0.156879    Hg    Se     57.0     80     34   
 7       InP        P3m1  0.432339    In     P     32.0     49     15   
 8       PbS        P3m1  0.231254    Pb     S     49.0     82     16   
 9      SnSe        P3m1  0.098331    Sn    Se     42.0     50     34   
 10     SnTe        P3m1  0.119079    Sn    Te     51.0     52     50   
 11     AlAs       P-6m2  0.521705    Al    As     23.0     33     13   
 12      AsB       P-6m2  0.467483    As     B     

In [54]:
df_materials_final = pd.concat(features_list,axis=0).fillna('0')

In [55]:
df_materials_final

Unnamed: 0,Material,Space_Group,ehull,Atom1,Atom2,media_Z,max_Z,min_Z,deviation_Z,media_Electronegativity,...,deviation_PeriodicColumn_upto18,media_NumberUnfilledOrbitals,max_NumberUnfilledOrbitals,min_NumberUnfilledOrbitals,deviation_NumberUnfilledOrbitals,media_Polarizability,max_Polarizability,min_Polarizability,deviation_Polarizability,Atom3
0,CuBr,P-6m2,0.064860,Cu,Br,32.000000,35.0,29.0,3.000000,2.430000,...,3.000000,1.000000,1.0,1.0,0.000000,37.620000,53.44,21.80,15.820000,0
1,GaAs,P3m1,0.412799,Ga,As,32.000000,33.0,31.0,1.000000,1.995000,...,1.000000,4.000000,5.0,3.0,1.000000,40.600000,51.40,29.80,10.800000,0
2,BSb,P-6m2,0.806255,B,Sb,28.000000,51.0,5.0,23.000000,2.045000,...,1.000000,4.000000,5.0,3.0,1.000000,31.540000,42.55,20.53,11.010000,0
3,GaC,Amm2,1.452133,Ga,C,18.500000,31.0,6.0,12.500000,2.180000,...,0.500000,4.500000,5.0,4.0,0.500000,31.330000,51.40,11.26,20.070000,0
4,CP,Pm,0.984882,C,P,10.500000,15.0,6.0,4.500000,2.370000,...,0.500000,3.500000,4.0,3.0,0.500000,18.095000,24.93,11.26,6.835000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,WSSe,P3m1,0.010168,W,S,41.333333,74.0,16.0,24.239545,2.496667,...,4.714045,3.333333,6.0,2.0,1.885618,40.203333,75.00,19.37,24.764292,Se
187,TaSTe,P3m1,0.066750,Ta,S,47.000000,73.0,16.0,23.537205,2.060000,...,5.185450,3.666667,7.0,2.0,2.357023,48.123333,88.00,19.37,29.101153,Te
188,ZrSTe,P3m1,0.122010,Zr,S,36.000000,52.0,16.0,14.966630,2.003333,...,5.656854,4.000000,8.0,2.0,2.828427,59.123333,121.00,19.37,44.341445,Te
189,TiSeTe,P3m1,0.104380,Ti,Se,36.000000,52.0,22.0,12.328828,2.063333,...,5.656854,4.000000,8.0,2.0,2.828427,51.746667,92.00,26.24,28.800377,Te


### Preparing our data

In [56]:
Y = df_materials_final['ehull']
X = df_materials_final.drop(columns='ehull')

Uma vez que temos colunas que possuem dados não numéricos, precisamos fazer o encode dessas colunas para dados numéricos, para que eles possam ser utilizados no treinamento do modelo

In [57]:
labelencoder = LabelEncoder()

# Apply LabelEncoder on each of the categorical columns
X['Material'] = labelencoder.fit_transform(X['Material'])
X['Space_Group'] = labelencoder.fit_transform(X['Space_Group'])
X['Atom1'] = labelencoder.fit_transform(X['Atom1'])
X['Atom2'] = labelencoder.fit_transform(X['Atom2'])
X['Atom3'] = labelencoder.fit_transform(X['Atom3'])

scaler = MinMaxScaler()
X = scaler.fit_transform(X)
Y = scaler.fit_transform(Y.to_frame())

### Transform points below an tolerance to zero 

In [58]:
tolerance = 0.005
Y = np.where(np.abs(Y) < tolerance, 1, 0)

In [59]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
results = evaluate_models(X_train, Y_train.ravel(), X_test, Y_test.ravel())
#save_best_model(results, X_train, y_train, metric='Accuracy')

Evaluating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CV Score: 0.8994181741246166
Best Parameters: {'C': 0.1}
Accuracy: 0.936046511627907
Precision: 0.8761830719307735
Recall: 0.936046511627907
F1 Score: 0.905126056288847

Evaluating Decision Trees...
CV Score: 0.8513487781656617
Best Parameters: {'criterion': 'gini'}
Accuracy: 0.8895348837209303
Precision: 0.9100327966607036
Recall: 0.8895348837209303
F1 Score: 0.8988919592323598

Evaluating Random Forest...
CV Score: 0.8411086427589124
Best Parameters: {'n_estimators': 10}
Accuracy: 0.8837209302325582
Precision: 0.8728777286346127
Recall: 0.8837209302325582
F1 Score: 0.8782658627619869

Evaluating Gradient Boosting...
CV Score: 0.8585845763249763
Best Parameters: {'n_estimators': 50}
Accuracy: 0.9069767441860465
Precision: 0.874390753376967
Recall: 0.9069767441860465
F1 Score: 0.8903857061826432

Evaluating Support Vector Machines...
CV Score: 0.8994181741246166
Best Parameters: {'C': 0.1}
Accuracy: 0.936046511627907
Precision: 0.8761830719307735
Recall: 0.936046511627907
F1 Score: 0.9

In [60]:
best_model_name = max(results, key=lambda k: results[k]['Accuracy'])
best_model_params = results[best_model_name]['Best Parameters']
    
    # Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Trees': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Support Vector Machines': SVC()
}

# Create and train the best model with best parameters
best_model = classifiers[best_model_name]
best_model.set_params(**best_model_params)
best_model.fit(X_train, Y_train)

# Save the best model to a pickle file
with open(f'{best_model_name}.pkl', 'wb') as f:
    pickle.dump(best_model, f)
    
print(f"Saved {best_model_name} with {best_model_params} as a pickle file.")

Saved Logistic Regression with {'C': 0.1} as a pickle file.


  y = column_or_1d(y, warn=True)
