In [18]:
# General (standard library)
import glob
import pathlib
import warnings

# General (third-party)
import numpy as np
import pandas as pd

# ML
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from xgboost import XGBRegressor

# Cp
from cp_app.utils import select_structures
from cp_app.descriptors import cv_features

# others
warnings.filterwarnings('ignore')

## Setting up the model, target, and features

In [19]:

# Column used as the regression target (indexed as df[TARGET] / X[TARGET] below).
TARGET = 'pCv_300.00'

# Model input columns: the `cv_features` object imported from cp_app.descriptors.
FEATURES = cv_features

In [20]:
def xgb_model(best_params, X, FEATURES, TARGET):
    """Build, configure and fit the scaling -> variance filter -> XGBoost pipeline.

    Parameters
    ----------
    best_params : mapping
        Must provide the keys ``xgb__reg_lambda``, ``xgb__reg_alpha``,
        ``xgb__n_estimators``, ``xgb__max_depth`` and ``xgb__learning_rate``;
        a missing key raises ``KeyError``. Other keys are ignored.
    X : table indexable as ``X[FEATURES]`` and ``X[TARGET]``
        Training data (presumably a pandas DataFrame -- confirm against callers).
    FEATURES
        Column selector for the model inputs.
    TARGET
        Column selector for the regression target.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline after fitting on ``X[FEATURES]`` / ``X[TARGET]``.
    """
    # NOTE(review): StandardScaler rescales non-constant columns to unit
    # variance, so the 0.95 variance cutoff applied afterwards presumably only
    # removes constant columns -- confirm this is the intended filter.
    steps = [
        ('scaling', StandardScaler()),
        ('variance_threshold', VarianceThreshold(threshold=0.95)),
        ('xgb', XGBRegressor()),
    ]
    model = Pipeline(steps)

    # Only these regressor settings are copied over, in this order.
    tuned_keys = (
        "xgb__reg_lambda",
        "xgb__reg_alpha",
        "xgb__n_estimators",
        "xgb__max_depth",
        "xgb__learning_rate",
    )
    model.set_params(**{key: best_params[key] for key in tuned_keys})

    model.fit(X[FEATURES], X[TARGET])
    return model
    

In [21]:
## Tuned XGBoost settings (obtained using hyperopt), keyed by the pipeline
## parameter names that xgb_model looks up.
## NOTE(review): reg_lambda and reg_alpha carry exactly the same value --
## confirm this is the optimiser's result and not a copy-paste slip.
xgb_hyperparams = dict(
    xgb__reg_lambda=0.049238826317067365,
    xgb__reg_alpha=0.049238826317067365,
    xgb__n_estimators=300,
    xgb__max_depth=10,
    xgb__learning_rate=0.1,
)

## Data loading and train/test split

In [29]:
# Seed passed to the stratified split below.
RANDOM_SEED = 3982

# Input directory and the CSV file names read from it.
DATA_DIR = 'data/'
DATA_site = 'alldata.csv'
DATA_structure = 'structures_data.csv'

# Run label, formatted as name_#TrainSize_#EnsembleSize; the two sizes are
# parsed from its second and third underscore-separated fields.
flag = "smallML_120_10"
N_train, N_ensemble = (int(field) for field in flag.split("_")[1:3])

In [30]:
# Read the two input tables from DATA_DIR (structures first, then all sites).
df_structures = pd.read_csv(f"{DATA_DIR}{DATA_structure}")
df_allsites = pd.read_csv(f"{DATA_DIR}{DATA_site}")

# Two spaces after the colon reproduce the output of the two-argument print.
print(f"Total structures:  {len(df_structures)}")

Total structures:  232


### Train/test split, making sure we have a diverse training set

In [31]:
# Index labels of the structures picked for training by select_structures
# (its selection criterion is defined in cp_app.utils, not visible here).
inds = select_structures(N_train, df_structures)

# Rows whose index label was picked go to training; all others go to test.
df_train_structures = df_structures[df_structures.index.isin(inds)]
df_test_structures = df_structures[~df_structures.index.isin(inds)]

# Persist both partitions next to the input data (index column included).
df_train_structures.to_csv(f"{DATA_DIR}train_structures.csv")
df_test_structures.to_csv(f"{DATA_DIR}test_structures.csv")

print(f"{len(df_train_structures)} structures in training set")
print(f"{len(df_test_structures)} structures in test set")

120 structures in training set
112 structures in test set


In [32]:
# Flag the site rows that belong to a training structure. The structure
# identifiers are matched against column "Unnamed: 0" of the structure table
# (presumably the unnamed first CSV column holding structure names -- confirm).
is_train_site = df_allsites["structure_name"].isin(df_train_structures["Unnamed: 0"])

# Take explicit copies: `df` gains a column in a later cell, and mutating a
# boolean-filtered slice of df_allsites is a chained-assignment hazard that the
# global warnings filter would otherwise hide.
df = df_allsites.loc[is_train_site].copy()
df_site_test = df_allsites.loc[~is_train_site].copy()

# Save the training sites without the index column.
df.to_csv(DATA_DIR + "site_train.csv", index=False)

In [33]:
# Binarise the target at its median so the split can be stratified on it.
THRESHOLD = df[TARGET].median()

# NOTE(review): the original expression was
#   min(int(0.95*len(df)), max(10000, len(df)))
# which always evaluates to int(0.95*len(df)) because max(10000, len(df)) is
# never smaller than len(df). If a cap of 10000 rows was intended, it was never
# in effect -- confirm the intended rule.
train_size = int(0.95 * len(df))
test_size = len(df) - train_size

# Rebind `df` to a new frame instead of assigning a column in place: `df` may
# be a filtered slice of another frame, and the in-place form is a
# chained-assignment hazard. Re-running this cell gives the same result.
df = df.assign(target_binned=(df[TARGET] > THRESHOLD).astype("int64"))

df_train_stratified, df_test_stratified = train_test_split(
    df,
    train_size=train_size,
    test_size=test_size,
    random_state=RANDOM_SEED,
    stratify=df['target_binned'],
)

### Generating bootstrapped training sets for quantifying uncertainty

In [34]:
# Collect N_ensemble bootstrap replicates of the stratified training frame
# (sampling with replacement), one per ensemble member.
Xs = []
for i in range(N_ensemble):
    # The loop index doubles as the resampling seed, so replicate i is
    # reproducible across runs. Note that `i` and `X` stay bound in the
    # notebook namespace after the loop.
    X = resample(df_train_stratified, replace=True,random_state=i) 
    Xs.append(X)

In [None]:
# Every entry directly under cifs/. glob returns matches in arbitrary order,
# so sort them to keep downstream results reproducible between runs.
# (`glob` is imported in the notebook's import cell.)
cif_list = sorted(glob.glob("cifs/*"))

In [None]:
# Featurize every path in cif_list and write the result to the given CSV path.
# NOTE(review): featurize_dataset is not imported in the cells visible here --
# confirm which module provides it and add it to the import cell.
df_features = featurize_dataset(
    cif_list,
    verbos=False,
    saveto="data/features_example1.csv",
)

In [None]:
# Show the head of the featurized table as this cell's output.
df_features.head()