In [None]:
import os
import shutil

import GPy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

# Set pandas view options
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# filter warnings messages from the notebook
import warnings
warnings.filterwarnings('ignore')

from monty.os import cd
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

from camd import CAMD_CACHE
from camd.agent.base import RandomAgent
from camd.campaigns.base import Campaign
from camd.experiment.base import ATFSampler
from multi import Generic_MultiAgent, GP_MultiAgent, MultiAnalyzer

In [None]:
# Load the featurized dataset from the CAMD cache; the first CSV column is used as the index.
featurized_data = pd.read_csv(os.path.join(CAMD_CACHE, "brgoch_unittest_data.csv"), index_col=0)

# Hold out 80% of the rows as candidates; the fixed random_state keeps the split reproducible.
seed_data, candidate_data = train_test_split(featurized_data, test_size=0.8, random_state=42)

# Candidates whose reduced formula already appears in the seed data are moved
# into the seed set, so no candidate composition is already "verified".
seed_data_chemsys = list(seed_data.reduced_formula)
already_in_seed = candidate_data.reduced_formula.isin(seed_data_chemsys)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same row-wise result.
seed_data = pd.concat([seed_data, candidate_data.loc[already_in_seed]])
candidate_data = candidate_data[~already_in_seed]

# Sanity check: the two sets together must still account for every row.
print(len(seed_data), len(candidate_data))
print(len(seed_data)+len(candidate_data)==len(featurized_data))

# Campaign

In [None]:
# Settings shared by every campaign below: N_query is handed to each agent as
# n_query, and iterations is handed to auto_loop as n_iterations.
N_query, iterations = 200, 10

### 1. Random Agent

In [None]:
# Start from an empty results folder so output from earlier runs is not mixed in.
# shutil/os replace the `rm -rf` / `mkdir -p` shell commands so the cell does
# not depend on a POSIX shell being available.
shutil.rmtree('random', ignore_errors=True)
os.makedirs('random', exist_ok=True)

# Baseline agent (RandomAgent comes from camd.agent.base).
random_agent = RandomAgent(candidate_data=candidate_data, n_query=N_query)
random_experiment = ATFSampler(dataframe=featurized_data)
analyzer = MultiAnalyzer(target_prop='bandgap', prop_range=[1.6, 2.0])

# Run inside the results folder; presumably the campaign writes its output
# files (e.g. history.pickle) to the current working directory.
with cd('random'):
    random_campaign = Campaign(candidate_data=candidate_data, seed_data=seed_data,
                               agent=random_agent, experiment=random_experiment,
                               analyzer=analyzer)

    random_campaign.auto_loop(n_iterations=iterations, initialize=True)

In [None]:
# Load the history saved in the random campaign's folder. The side-by-side
# analysis cells at the end of the notebook read `random_history`, so this
# load must actually run for Restart & Run All to succeed.
random_history = pd.read_pickle('random/history.pickle')
random_history

In [None]:
# random_history[['new_exp_discovery', 'total_exp_discovery']].plot(subplots=True)

### 2. MultiAgent

#### 2.1 SVR (Generic Agent)

In [None]:
# Start from an empty results folder so output from earlier runs is not mixed in.
# shutil/os replace the `rm -rf` / `mkdir -p` shell commands so the cell does
# not depend on a POSIX shell being available.
shutil.rmtree('svr', ignore_errors=True)
os.makedirs('svr', exist_ok=True)

# The class imported from `multi` is Generic_MultiAgent (with an underscore);
# the previous spelling `GenericMultiAgent` raised a NameError.
svr_agent = Generic_MultiAgent(target_prop='bandgap', ideal_prop_val=1.8,
                   candidate_data=candidate_data, seed_data=seed_data, n_query=N_query,
                   model=SVR(C=10), preprocessor=preprocessing.StandardScaler()
                   )
svr_experiment = ATFSampler(dataframe=featurized_data)
svr_analyzer = MultiAnalyzer(target_prop='bandgap', prop_range=[1.6, 2.0])

# Run inside the results folder; presumably the campaign writes its output
# files (e.g. history.pickle) to the current working directory.
with cd('svr'):
    # Named svr_campaign so it no longer overwrites the random baseline's `random_campaign`.
    svr_campaign = Campaign(candidate_data=candidate_data, seed_data=seed_data,
                            agent=svr_agent, experiment=svr_experiment,
                            analyzer=svr_analyzer)
    svr_campaign.auto_loop(n_iterations=iterations, initialize=True)


In [None]:
# Load the history saved in the SVR campaign's folder. The side-by-side
# analysis cells at the end of the notebook read `svr_history`, so this load
# must actually run for Restart & Run All to succeed.
svr_history = pd.read_pickle('svr/history.pickle')
svr_history

In [None]:
# svr_history[['new_exp_discovery', 'total_exp_discovery']].plot(subplots=True)

#### 2.2 RandomForest Regressor (Generic Agent)

In [None]:
# Start from an empty results folder so output from earlier runs is not mixed in.
# shutil/os replace the `rm -rf` / `mkdir -p` shell commands so the cell does
# not depend on a POSIX shell being available.
shutil.rmtree('rf', ignore_errors=True)
os.makedirs('rf', exist_ok=True)

# the hyper-params in rf are optimized (per the original author's note)
rf_regressor = RandomForestRegressor(min_samples_split=5, n_estimators=30, n_jobs=-1)

rf_agent = Generic_MultiAgent(target_prop='bandgap', ideal_prop_val=1.8,
                   candidate_data=candidate_data, seed_data=seed_data, n_query=N_query,
                   model=rf_regressor, preprocessor=preprocessing.StandardScaler()
                   )
# Dedicated rf_* objects: this cell previously reassigned svr_experiment and
# svr_analyzer instead of creating its own.
rf_experiment = ATFSampler(dataframe=featurized_data)
rf_analyzer = MultiAnalyzer(target_prop='bandgap', prop_range=[1.6, 2.0])

# Run inside the results folder; presumably the campaign writes its output
# files (e.g. history.pickle) to the current working directory.
with cd('rf'):
    # The campaign must be driven by rf_agent. It was previously given
    # svr_agent, so the "RF" run silently repeated the SVR campaign.
    rf_campaign = Campaign(candidate_data=candidate_data, seed_data=seed_data,
                           agent=rf_agent, experiment=rf_experiment,
                           analyzer=rf_analyzer)
    rf_campaign.auto_loop(n_iterations=iterations, initialize=True)


In [None]:
# Load the history saved in the random-forest campaign's folder. The
# side-by-side analysis cells at the end of the notebook read `rf_history`, so
# this load must actually run for Restart & Run All to succeed.
rf_history = pd.read_pickle('rf/history.pickle')
rf_history

In [None]:
# rf_history[['new_exp_discovery', 'total_exp_discovery']].plot(subplots=True)

#### 2.3 GP

In [None]:
# Start from an empty results folder so output from earlier runs is not mixed in.
# shutil/os replace the `rm -rf` / `mkdir -p` shell commands so the cell does
# not depend on a POSIX shell being available.
shutil.rmtree('GP', ignore_errors=True)
os.makedirs('GP', exist_ok=True)

GP_agent = GP_MultiAgent(target_prop='bandgap', ideal_prop_val=1.8,
                      candidate_data=candidate_data, seed_data=seed_data, n_query=N_query,
                      preprocessor=preprocessing.StandardScaler()
                   )
GP_experiment = ATFSampler(dataframe=featurized_data)
GP_analyzer = MultiAnalyzer(target_prop='bandgap', prop_range=[1.6, 2.0])

# Run inside the results folder; the next cell reads GP/history.pickle, which
# is presumably written by the campaign into the current working directory.
with cd('GP'):
    # Named gp_campaign so it no longer overwrites the random baseline's `random_campaign`.
    gp_campaign = Campaign(candidate_data=candidate_data, seed_data=seed_data,
                           agent=GP_agent, experiment=GP_experiment,
                           analyzer=GP_analyzer)
    gp_campaign.auto_loop(n_iterations=iterations, initialize=True)

In [None]:
# Read back the history pickled in the GP campaign's results folder and show it.
gp_history_file = 'GP/history.pickle'
gp_history = pd.read_pickle(gp_history_file)
gp_history

#### Side by Side Analysis

* Total budget is 1000 (300 is the experimental allocation; the rest is the theory allocation).

* We acquired only 89 good experimental data points, a ~29% success rate.

In [None]:
# One figure per agent, each with two stacked subplots (new and cumulative
# experimental discoveries). The figure title names the agent, since the four
# figures are otherwise indistinguishable.
discovery_columns = ['new_exp_discovery', 'total_exp_discovery']
labelled_histories = [
    ('Random agent', random_history),
    ('SVR agent', svr_history),
    ('Random forest agent', rf_history),
    ('GP agent', gp_history),
]
for agent_label, history in labelled_histories:
    # With subplots=True a string title is drawn once at the top of the figure.
    history[discovery_columns].plot(subplots=True, title=agent_label)

In [None]:
# Collect each agent's per-iteration 'new_exp_discovery' values into one table,
# one column per agent, in the order random, svr, rf, gp.
new_discovery_sources = [
    ('random_new_discovery', random_history),
    ('svr_new_discovery', svr_history),
    ('rf_new_discovery', rf_history),
    ('gp_new_discovery', gp_history),
]
new_exp_discoveries = pd.DataFrame({
    column_name: np.array(agent_history['new_exp_discovery'])
    for column_name, agent_history in new_discovery_sources
})
new_exp_discoveries

In [None]:
# Collect each agent's per-iteration 'total_exp_discovery' values into one
# table, one column per agent, in the order random, svr, rf, gp.
total_discovery_sources = [
    ('random_tot_discovery', random_history),
    ('svr_tot_discovery', svr_history),
    ('rf_tot_discovery', rf_history),
    ('gp_tot_discovery', gp_history),
]
total_exp_discoveries = pd.DataFrame({
    column_name: np.array(agent_history['total_exp_discovery'])
    for column_name, agent_history in total_discovery_sources
})

# 228 expt candidates is what was in seed data (author's figure); subtracting
# it leaves the discoveries made during the campaigns.
n_seed_expt_candidates = 228
total_exp_discoveries - n_seed_expt_candidates