In [1]:
import pandas as pd


In [2]:
df = pd.read_csv('./features/feature_merge_1219/csv/merged_features_original_18_0108_51_11.csv')

print(df.head())

     filename  A_HOMO_LUMO  B_HOMO_LUMO  A_Dipole  B_Dipole   mean_CN  \
0  CADPAB.cif     2.721574     7.285991    5.4464    4.2160  7.559311   
1  CECPUY.cif     2.773650     5.426899    4.8892    2.7691  7.168179   
2  CECRIO.cif     2.793827     5.501169    4.8046    0.7412  7.984630   
3  CECXEQ.cif     2.775525     5.427855    4.9502    2.2394  6.967348   
4  DOJKIY.cif     2.701374     6.494500    5.1755    2.5365  7.159820   

   mean_NDV  max packing efficiency   density        vpa  packing fraction  \
0  0.290453                0.072656  1.560219  10.056343          0.091649   
1  0.313178                0.077633  1.560587  10.851112          0.096749   
2  0.284820                0.075917  1.320394  10.020311          0.083520   
3  0.313415                0.079289  1.648445  10.272777          0.102196   
4  0.323809                0.073894  1.558056  11.213218          0.094581   

    SC_atom      SC_cell  E_els    E_x  E_rep  E_orb  E_DFTc  E_dc  polarity  
0  5.169925  

In [3]:
df_reset = df.reset_index(drop=True)

features = df.iloc[0:, 1:-1]  
polarity = df.iloc[:, -1]

print(features.head())

print(polarity.head())

   A_HOMO_LUMO  B_HOMO_LUMO  A_Dipole  B_Dipole   mean_CN  mean_NDV  \
0     2.721574     7.285991    5.4464    4.2160  7.559311  0.290453   
1     2.773650     5.426899    4.8892    2.7691  7.168179  0.313178   
2     2.793827     5.501169    4.8046    0.7412  7.984630  0.284820   
3     2.775525     5.427855    4.9502    2.2394  6.967348  0.313415   
4     2.701374     6.494500    5.1755    2.5365  7.159820  0.323809   

   max packing efficiency   density        vpa  packing fraction   SC_atom  \
0                0.072656  1.560219  10.056343          0.091649  5.169925   
1                0.077633  1.560587  10.851112          0.096749  4.954196   
2                0.075917  1.320394  10.020311          0.083520  5.700440   
3                0.079289  1.648445  10.272777          0.102196  4.954196   
4                0.073894  1.558056  11.213218          0.094581  4.857981   

       SC_cell  E_els    E_x  E_rep  E_orb  E_DFTc  E_dc  
0   372.234600 -49.08  -1.00   7.40  -4.05   

In [4]:
import matplotlib.pyplot as plt
import shap

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Create an instance of the SVM classifier
clf = SVC()

# Set the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Type of kernel function
    'gamma': ['scale', 'auto'],  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid' kernels
    'degree': [2, 3, 4],  # Degree of the kernel function, used only when kernel is 'poly'
    'coef0': [0.0, 0.5, 1.0]  # Independent term in kernel function, used only when kernel is 'poly' or 'sigmoid'
}

# Create an instance of GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=5, verbose=1)
grid_search.fit(features, polarity)

# Output the best parameters and the corresponding best score
print('Best parameters:', grid_search.best_params_)
print('Best accuracy score:', grid_search.best_score_)

# Make predictions on the entire dataset using the model with the best parameters
best_model = grid_search.best_estimator_
predictions = best_model.predict(features)

# Calculate the accuracy on the entire dataset
accuracy = accuracy_score(polarity, predictions)
print('Accuracy score on the entire dataset:', accuracy)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters: {'C': 0.1, 'coef0': 0.0, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
Best accuracy score: 0.8564102564102564
Accuracy score on the entire dataset: 1.0


In [35]:
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
best_params =  {'C': 0.1, 'coef0': 0.0, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
clf = SVC(**best_params)

# Use cross-validation to compute the model's accuracy
scores = cross_val_score(clf, features, polarity, cv=5, scoring='accuracy')

# Print the accuracy
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Train the model and evaluate its performance on the entire dataset
clf.fit(features, polarity)  # Train the model
predictions = clf.predict(features)  # Make predictions

# Calculate the accuracy on the entire dataset
accuracy = accuracy_score(polarity, predictions)
print('Accuracy score on the entire dataset:', accuracy)

Accuracy: 0.86 (+/- 0.23)
Accuracy score on the entire dataset: 1.0


In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Create an instance of the RandomForestClassifier
clf = RandomForestClassifier()

# Set the parameter grid for the RandomForestClassifier
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [3, 4, 5, 6, None],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4, 8],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used when constructing trees
}

# Create an instance of GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=5, verbose=1)
grid_search.fit(features, polarity)

# Output the best parameters and the corresponding best score
print('Best parameters:', grid_search.best_params_)
print('Best accuracy score:', grid_search.best_score_)

# Make predictions on the entire dataset using the model with the best parameters
best_model = grid_search.best_estimator_
predictions = best_model.predict(features)

# Calculate the accuracy on the entire dataset
accuracy = accuracy_score(polarity, predictions)
print('Accuracy score on the entire dataset:', accuracy)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best parameters: {'bootstrap': True, 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
Best accuracy score: 0.9217948717948719
Accuracy score on the entire dataset: 0.9838709677419355


  _data = np.array(data, dtype=dtype, copy=copy,


In [32]:
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np

# Define the best parameters
# best_params = {'bootstrap': True, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
best_params = {'bootstrap': True, 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}

# Create an instance of the RandomForestClassifier
clf = RandomForestClassifier(**best_params)

# Use cross-validation to compute the model's accuracy
scores = cross_val_score(clf, features, polarity, cv=5, scoring='accuracy')

# Print the accuracy
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Train the model and evaluate its performance on the entire dataset
clf.fit(features, polarity)  # Train the model
predictions = clf.predict(features)  # Make predictions

# Calculate the accuracy on the entire dataset
accuracy = accuracy_score(polarity, predictions)
print('Accuracy score on the entire dataset:', accuracy)

Accuracy: 0.87 (+/- 0.16)
Accuracy score on the entire dataset: 0.9838709677419355
