In [1]:
import os
import pandas as pd
from tqdm import tqdm
from collections import Counter
from pycm import ConfusionMatrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import balanced_accuracy_score as bal_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
import pickle
import joblib
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
# Directory that holds the input CSV files (placeholder value; point it at the real data folder).
path = "file_path"
# CSV files to read, in order; select_classification treats the last loaded table as the label table.
filename = ['global.csv', 'metal_properties.csv', 'fr_descriptors.csv', 'label_sel.csv']

# One DataFrame per CSV, kept in the same order as `filename`.
load_data = [
    pd.read_csv(filepath_or_buffer=os.path.join(path, csv_name))
    for csv_name in filename
]

# Descriptor name -> position of its table inside `load_data`.
load_data_content = dict(Global=0, metal=1, Linker_fr=2)

### Load Classifier

In [4]:
def select_classification(data, select_data_content, select_descriptor):
    """
    Build the numeric feature matrix for the classifier.

    The first three columns of the label table (``data[-1]``) form the base
    frame; each requested descriptor table is then left-joined onto it using
    the 'mof' column, so every row of the label table is kept and MOFs missing
    from a descriptor table get NaN in that table's columns.

    Args:
        data: Sequence of DataFrames; the last element is the label table,
            whose first three columns must include a 'mof' column.
        select_data_content: Dict mapping a descriptor name to the index of
            its DataFrame inside ``data``.
        select_descriptor: Descriptor names to merge, in merge order.

    Returns:
        A new DataFrame holding every merged column except 'mof', cast to
        float. All remaining columns must therefore be convertible to float.

    Raises:
        KeyError: If a name in ``select_descriptor`` is not a key of
            ``select_data_content``.
    """
    label_table = data[-1]

    # Base frame: the first three columns of the label table, copied so the
    # caller's DataFrame is never touched.
    merged = label_table.iloc[:, [0, 1, 2]].copy()

    # Left-join each requested descriptor table on the MOF identifier.
    for name in select_descriptor:
        descriptor_table = data[select_data_content[name]]
        merged = merged.merge(descriptor_table, on='mof', how='left')

    # The identifier is not a feature: drop it and force a float dtype.
    return merged.drop(columns='mof').astype(float)

### Dataset

In [None]:
# Merge all three descriptor tables onto the label table to build the feature matrix.
select_descriptor = ['Global', 'metal', "Linker_fr"]
data_X = select_classification(
    data=load_data,
    select_data_content=load_data_content,
    select_descriptor=select_descriptor,
)

In [7]:
# Columns fed to the model, in this exact order. The column order determines
# the order of the values passed to the scaler and model below, so do not
# reorder or rename entries (the spellings must match the CSV headers).
features = [
    # pore-geometry columns (presumably from global.csv -- confirm)
    'LCD',
    'PLD',
    'desity(g/cm^3)',
    'VSA(m^2/cm^3)',
    'GSA(m^2/g)',
    'Vp(cm^3/g)',
    'void_fraction',
    # metal property columns (presumably from metal_properties.csv -- confirm)
    'atomic_radius',
    'electronegativity',
    'electron_affinity',
    'ionization_energy',
    'atomic_weight',
    'oxistates',
    # fr_* fragment columns (presumably from fr_descriptors.csv -- confirm)
    'fr_Al_COO',
    'fr_Al_OH',
    'fr_Al_OH_noTert',
    'fr_ArN',
    'fr_Ar_COO',
    'fr_Ar_N',
    'fr_Ar_NH',
    'fr_COO',
    'fr_COO2',
    'fr_C_O',
    'fr_C_O_noCOO',
    'fr_NH0',
    'fr_NH1',
    'fr_NH2',
    'fr_Nhpyrrole',
    'fr_aniline',
    'fr_aryl_methyl',
    'fr_azo',
    'fr_benzene',
    'fr_bicyclic',
    'fr_ether',
    'fr_furan',
    'fr_halogen',
    'fr_methoxy',
    'fr_piperzine',
    'fr_pyridine',
    'fr_sulfone',
]
# Keep only the listed columns, arranged in the order above.
data_X = data_X.loc[:, features]

### Prediction

In [9]:
# Load the fitted classifier and its matching feature scaler.
# The paths are built with os.path.join instead of a hard-coded backslash:
# 'Model\selectivity.pkl' relied on the invalid escape sequence '\s' (a
# warning on current Python, scheduled to become an error) and only resolved
# on Windows -- on Linux/macOS the backslash is part of the file name.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
MODEL_DIR = 'Model'
model = joblib.load(os.path.join(MODEL_DIR, 'selectivity.pkl'))
model_scaler = joblib.load(os.path.join(MODEL_DIR, 'selectivity_scaler.pkl'))

In [None]:
# Scale the raw feature values with the loaded scaler, then run the classifier.
X = model_scaler.transform(data_X.values)
y = model.predict(X)
print('y:', y)