Merge pull request #498 from JosephMontoya-TRI/remove_tensorflow
Remove tensorflow
JosephMontoya-TRI committed Jan 24, 2022
2 parents 6b0cd0f + b68d063 commit 697704b
Showing 10 changed files with 23 additions and 278 deletions.
1 change: 0 additions & 1 deletion .github/workflows/camd-test-main.yml
@@ -26,7 +26,6 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install `grep numpy requirements.txt`
           pip install -r requirements.txt
           pip install -e .[tests]
       - name: pytest
2 changes: 1 addition & 1 deletion Dockerfile
@@ -18,7 +18,7 @@ COPY setup.py requirements.txt /home/camd/
 
 # Install package
 RUN source /opt/conda/bin/activate camd && \
-    pip install `grep numpy requirements.txt` && \
+    # pip install `grep numpy requirements.txt` && \
     pip install -r requirements.txt
 
 COPY camd /home/camd/camd
2 changes: 1 addition & 1 deletion camd/__init__.py
@@ -11,7 +11,7 @@
 from tqdm import tqdm as _tqdm
 
 
-__version__ = "2021.6.11-post6"
+__version__ = "2021.6.11-post9"
 
 CAMD_ROOT = os.path.dirname(os.path.abspath(__file__))
 CAMD_TEST_FILES = os.path.join(CAMD_ROOT, "tests", "test_files")
209 changes: 1 addition & 208 deletions camd/agent/stability.py
@@ -4,18 +4,13 @@
 and hypothesizing
 """
 
-import time
 import abc
 import json
 import os
 from copy import deepcopy
 from multiprocessing import cpu_count
 from collections import OrderedDict
 
 import pandas as pd
-import tensorflow as tf
-import gpflow
-from gpflow.ci_utils import ci_niter
 import numpy as np
 from qmpy.analysis.thermodynamics.phase import Phase, PhaseData
 from camd.analysis import PhaseSpaceAL, ELEMENTS
@@ -26,11 +26,10 @@
 from sklearn.gaussian_process.kernels import RBF, ConstantKernel
 from sklearn.gaussian_process import GaussianProcessRegressor
 from sklearn.linear_model import LinearRegression
-from sklearn.model_selection import cross_val_score, KFold, train_test_split
+from sklearn.model_selection import cross_val_score, KFold
 from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import Pipeline
 from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
-from sklearn.cluster import MiniBatchKMeans
 from sklearn.metrics import pairwise_distances
 
 # TODO: Adaptive N_query and subsampling of candidate space
@@ -471,207 +465,6 @@ def get_hypotheses(self, candidate_data, seed_data=None):
         return within_hull.head(self.n_query)
 
 
-class SVGProcessStabilityAgent(StabilityAgent):
-    """
-    Stochastic variational gaussian process stability agent for Big Data.
-    The computational complexity of this algorithm scales as O(M^3)
-    compared to O(N^3) of standard GP, where N is the number of data points
-    and M is the number of inducing points (M<<N).
-    The default parameters are optimized to deliver a compromise between
-    compute-time and model accuracy for data sets with up to 25 to 40
-    thousand examples (e.g. the ICSD seed data). For bigger systems,
-    parameter M may need to be reduced. For smaller systems, it can be
-    increased, if higher accuracy is desired. Inducing point locations
-    are determined using k-means clustering.
-    References:
-        Hensman, James, Nicolo Fusi, and Neil D. Lawrence. "Gaussian
-            processes for big data." Uncertainty in Artificial
-            Intelligence (2013).
-        Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
-            optimization." arXiv preprint arXiv:1412.6980 (2014).
-    """
-
-    def __init__(
-        self,
-        candidate_data=None,
-        seed_data=None,
-        n_query=1,
-        hull_distance=0.0,
-        parallel=cpu_count(),
-        alpha=0.5,
-        M=600,
-        maxiter=20000
-    ):
-        """
-        Args:
-            candidate_data (DataFrame): data about the candidates
-            seed_data (DataFrame): data which to fit the Agent to
-            n_query (int): number of hypotheses to generate
-            hull_distance (float): hull distance as a criteria for
-                which to deem a given material as "stable"
-            parallel (bool): whether to use multiprocessing
-                for phase stability analysis
-            alpha (float): weighting factor for the stdev in making
-                best-case predictions of the stability
-            M (int): number of inducing points associated with the
-                SVGProcess
-            maxiter (int): number of maximum iterations of the SVG
-                process
-        """
-        super(SVGProcessStabilityAgent, self).__init__(
-            candidate_data=candidate_data,
-            seed_data=seed_data,
-            n_query=n_query,
-            hull_distance=hull_distance,
-            parallel=parallel,
-        )
-        self.alpha = alpha
-        self.M = M
-        self.maxiter = maxiter
-
-        # Define non-argument SVG-specific attributes
-        self.kernel = gpflow.kernels.RBF(273) * gpflow.kernels.Constant(273)
-        self.mean_f = gpflow.mean_functions.Constant()
-        self.logger = None
-        self.model = None
-        self.pred_y = None
-        self.pred_std = None
-
-    def get_hypotheses(self, candidate_data, seed_data=None):
-        """
-        Get hypotheses method for SVGProcessStabilityAgent.
-        Code used from gpflow examples for big data, see:
-        https://github.com/GPflow/GPflow/blob/develop/doc/source/notebooks/advanced/gps_for_big_data.pct.py
-        Args:
-            candidate_data (pandas.DataFrame): dataframe of candidates
-            seed_data (pandas.DataFrame): dataframe of prior data on
-                which to fit model
-        Returns:
-            (pandas.DataFrame): top candidates from the algorithm
-        """
-        X_cand, X_seed, y_seed = self.update_data(candidate_data, seed_data)
-
-        # Test model performance first. Note we avoid doing CV to
-        # reduce compute time. We simply do a 1-way split 80:20 (train:test)
-        X_train, X_test, y_train, y_test = train_test_split(
-            X_seed, y_seed, test_size=0.2, random_state=42
-        )
-        scaler = StandardScaler()
-        X_train_scaled = scaler.fit_transform(X_train)
-
-        # Do a k-means clustering and use cluster centers as inducing points.
-        cls = MiniBatchKMeans(n_clusters=self.M, batch_size=200)
-        cls.fit(X_train_scaled)
-        Z = cls.cluster_centers_
-        _y = np.array(y_train.to_list())
-        mu = np.mean(_y)
-        sig = np.std(_y)
-        print(Z, _y.shape)
-        print(sig, mu)
-        model = gpflow.models.SVGP(
-            self.kernel,
-            gpflow.likelihoods.Gaussian(),
-            Z,
-            mean_function=self.mean_f,
-        )
-        elbo = tf.function(model.elbo)
-
-        # TensorFlow re-traces & compiles a `tf.function`-wrapped method at *every* call
-        # if the arguments are numpy arrays instead of tf.Tensors. Hence:
-        tensor_data = tuple(map(tf.convert_to_tensor, (X_seed, pd.DataFrame(y_seed))))
-        elbo(tensor_data)  # run it once to trace & compile
-        minibatch_size = 100
-
-        # We turn off training for inducing point locations
-        gpflow.utilities.set_trainable(model.inducing_variable, False)
-
-        train_dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, pd.DataFrame(y_train))) \
-            .repeat() \
-            .shuffle(len(X_train_scaled))
-
-        def run_adam(model, iterations):
-            """
-            Utility function running the Adam optimizer
-            :param model: GPflow model
-            :param iterations: number of iterations
-            """
-            # Create an Adam Optimizer action
-            logf = []
-            train_iter = iter(train_dataset.batch(minibatch_size))
-            training_loss = model.training_loss_closure(train_iter, compile=True)
-            optimizer = tf.optimizers.Adam()
-
-            @tf.function
-            def optimization_step():
-                optimizer.minimize(training_loss, model.trainable_variables)
-
-            for step in range(iterations):
-                optimization_step()
-                if step % 10 == 0:
-                    elbo = -training_loss().numpy()
-                    logf.append(elbo)
-            return logf
-
-        print("training")
-        t0 = time.time()
-        run_adam(model, ci_niter(self.maxiter))
-        print("elapsed time: ", time.time() - t0)
-
-        pred_y, pred_v = model.predict_y(scaler.transform(X_test))
-        pred_y = pred_y * sig + mu
-        self.cv_score = np.mean(np.abs(pred_y - y_test.to_numpy().reshape(-1, 1)))
-        print("cv score", self.cv_score)
-        self.model = model
-
-        # Overall model
-        scaler = StandardScaler()
-        X_scaled = scaler.fit_transform(X_seed)
-        cls = MiniBatchKMeans(n_clusters=self.M, batch_size=200)
-        cls.fit(X_scaled)
-        Z = cls.cluster_centers_
-        _y = np.array(y_seed.to_list())
-        mu = np.mean(_y)
-        sig = np.std(_y)
-        model = gpflow.models.SVGP(
-            self.kernel,
-            gpflow.likelihoods.Gaussian(),
-            Z,
-            mean_function=self.mean_f,
-        )
-        maxiter = gpflow.ci_utils.ci_niter(self.maxiter)
-        run_adam(model, maxiter)
-        print(self.model)
-        self.model = model
-
-        # GP makes predictions for Hf and uncertainty*alpha on candidate data
-        pred_y, pred_v = model.predict_y(scaler.transform(X_cand))
-        pred_y = pred_y * sig + mu
-        self.pred_y = np.array(pred_y).reshape(-1,)
-        self.pred_std = (np.array(pred_v) ** 0.5).reshape(-1,)
-
-        expected = self.pred_y - self.pred_std * self.alpha
-        print("expected improv", expected)
-
-        # Update candidate data dataframe with predictions
-        self.update_candidate_stabilities(expected, sort=True, floor=-6.0)
-
-        # Find the most stable ones up to n_query within hull_distance
-        stability_filter = self.candidate_data["pred_stability"] <= self.hull_distance
-        within_hull = self.candidate_data[stability_filter]
-
-        return within_hull.head(self.n_query)
-
-
 class BaggedGaussianProcessStabilityAgent(StabilityAgent):
     """
     An ensemble GP learner that can handle relatively large
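The docstring of the removed SVGProcessStabilityAgent above describes the core trick: pick M << N inducing points via k-means and pay O(M^3) instead of O(N^3) for GP training. For reference, that selection step needs nothing beyond scikit-learn; the sketch below mirrors the deleted code's MiniBatchKMeans(n_clusters=M, batch_size=200) call, with synthetic stand-in data (the array shapes and variable names here are illustrative, not part of the codebase):

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler

# Synthetic stand-in for the featurized seed data the agent received
rng = np.random.default_rng(0)
X_seed = rng.normal(size=(5000, 10))

M = 600  # number of inducing points, M << N (the removed agent's default)
X_scaled = StandardScaler().fit_transform(X_seed)

# Cluster centers become the inducing-point locations Z that the
# (now removed) gpflow SVGP model was initialized with
kmeans = MiniBatchKMeans(n_clusters=M, batch_size=200)
kmeans.fit(X_scaled)
Z = kmeans.cluster_centers_  # shape (M, 10)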
10 changes: 2 additions & 8 deletions camd/agent/tests/test_stability.py
@@ -4,8 +4,8 @@
 from sklearn.model_selection import train_test_split
 from camd import CAMD_TEST_FILES
 from camd.agent.stability import QBCStabilityAgent, AgentStabilityML5, \
-    GaussianProcessStabilityAgent, SVGProcessStabilityAgent, \
-    BaggedGaussianProcessStabilityAgent, AgentStabilityAdaBoost
+    GaussianProcessStabilityAgent, BaggedGaussianProcessStabilityAgent, \
+    AgentStabilityAdaBoost
 
 
 class StabilityAgentsTest(unittest.TestCase):
@@ -31,12 +31,6 @@ def test_gp_stability_agent(self):
         hypotheses = agent.get_hypotheses(
             candidate_data=self.candidate_data, seed_data=self.seed_data)
 
-    def test_svg_process_stability_agent(self):
-        agent = SVGProcessStabilityAgent(M=100, maxiter=6)
-        hypotheses = agent.get_hypotheses(
-            candidate_data=self.candidate_data, seed_data=self.seed_data,
-        )
-
     def test_bagged_gp_stability_agent(self):
         agent = BaggedGaussianProcessStabilityAgent(max_samples=100)
         hypotheses = agent.get_hypotheses(
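With SVGProcessStabilityAgent and its test gone, the exact-GP agent exercised just above covers the same role for moderately sized seed data. A hypothetical usage sketch (the constructor arguments are assumed to follow the shared StabilityAgent signature seen in the removed class's __init__; candidate_data and seed_data stand for featurized DataFrames like the test fixtures):

from camd.agent.stability import GaussianProcessStabilityAgent

# candidate_data / seed_data: featurized pandas DataFrames (as in setUp above)
agent = GaussianProcessStabilityAgent(n_query=10, hull_distance=0.05)
hypotheses = agent.get_hypotheses(candidate_data=candidate_data,
                                  seed_data=seed_data)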
6 changes: 3 additions & 3 deletions camd/analysis.py
@@ -272,10 +272,10 @@ def analyze_vaspqmpy_jobs(self, jobs, against_icsd=False, use_energies=False):
         self.energies = []
         for j, r in jobs.iterrows():
             if r["status"] == "SUCCEEDED":
-                rdict = r['result'].as_dict()
-                self.structures.append(r['result'].final_structure)
+                final_structure = r['result'].final_structure
+                self.structures.append(final_structure)
                 self.structure_ids.append(j)
-                self.energies.append(rdict["output"]["final_energy_per_atom"])
+                self.energies.append(r['result'].final_energy / len(final_structure))
         if use_energies:
             return self.analyze(
                 self.structures, self.structure_ids, against_icsd, self.energies
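The change above reads the per-atom energy straight off the pymatgen result object instead of its serialized dict, avoiding a full as_dict() call per job. A minimal equivalence sketch (the file path is illustrative; for a converged run the two expressions should agree):

from pymatgen.io.vasp.outputs import Vasprun

vr = Vasprun("vasprun.xml")  # illustrative path
# Total final energy divided by the number of sites in the final structure...
per_atom = vr.final_energy / len(vr.final_structure)
# ...matches the field the old code read from the serialized dict:
# vr.as_dict()["output"]["final_energy_per_atom"]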
3 changes: 3 additions & 0 deletions camd/campaigns/tests/test_structure_discovery.py
@@ -29,6 +29,9 @@ def teardown_s3():
 
 class ProtoDFTCampaignTest(unittest.TestCase):
     def test_simulated(self):
+        # Note that there's a small issue with pickled results here that may not have
+        # certain spin and magnetization flags set - pickled Vasprun objects may not
+        # be completely compatible with latest version of pymatgen
         exp_dataframe = pd.read_pickle(os.path.join(CAMD_TEST_FILES, "mn-ni-o-sb.pickle"))
         experiment = ATFSampler(exp_dataframe)
         candidate_data = exp_dataframe.iloc[:, :-11]
31 changes: 1 addition & 30 deletions camd/tests/test_atf_examples.py
@@ -8,7 +8,7 @@
 from sklearn.neural_network import MLPRegressor
 from sklearn.gaussian_process.kernels import RBF, ConstantKernel
 
-from camd.agent.stability import QBCStabilityAgent, GaussianProcessStabilityAgent, SVGProcessStabilityAgent, \
+from camd.agent.stability import QBCStabilityAgent, GaussianProcessStabilityAgent, \
     BaggedGaussianProcessStabilityAgent, AgentStabilityAdaBoost
 from camd.agent.base import RandomAgent
 from camd.agent.generic import GenericGPUCB, GPBatchUCB
@@ -211,35 +211,6 @@ def test_mp_loop(self):
         self.assertEqual(new_loop.iteration, 7)
 
 
-@unittest.skipUnless(CAMD_LONG_TESTS, SKIP_MSG)
-class AtfSVGPLoopTest(unittest.TestCase):
-    def setUp(self):
-        self.pwd = os.getcwd()
-        self.tempdir = tempfile.mkdtemp()
-        os.chdir(self.tempdir)
-
-    def tearDown(self):
-        os.chdir(self.pwd)
-        shutil.rmtree(self.tempdir)
-
-    def test_svgp_loop(self):
-        df = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))
-        df_sub = df[df['N_species'] <= 3]
-        n_seed = 200  # Starting sample size
-        agent = SVGProcessStabilityAgent(n_query=10, hull_distance=0.05, alpha=0.5, M=100)
-        analyzer = StabilityAnalyzer(hull_distance=0.05, parallel=False)
-        experiment = ATFSampler(df_sub)
-        candidate_data = df_sub
-
-        new_loop = Campaign(candidate_data, agent, experiment, analyzer,
-                            create_seed=n_seed)
-        new_loop.initialize()
-        self.assertTrue(new_loop.initialized)
-
-        new_loop.auto_loop(3)
-        self.assertTrue(True)
-
-
 class GPBatchUCBAgent(unittest.TestCase):
     def setUp(self):
         self.pwd = os.getcwd()
19 changes: 5 additions & 14 deletions requirements.txt
@@ -1,28 +1,19 @@
-numpy==1.19.2
 Django==3.1.13
-gpflow==2.1.5
 python-dateutil==2.8.1
 networkx==2.5.1
 matplotlib==3.4.2
 pandas==1.2.4
-matminer==0.6.5
+matminer==0.7.4
 autologging
-awscli==1.19.95
-boto3==1.17.93
+boto3==1.20.41
 docopt==0.6.2
 scikit-learn==0.24.1
 taburu==2020.5.9
 protosearch==2020.5.10
-GPy==1.9.9
+GPy==1.10.0
 watchtower==1.0.6
+awscli
 
-# Pinned dependencies to help pip
-tensorflow==2.7.0
-sympy==1.8
-sqlparse==0.4.2
-spglib==1.16.1
-scipy==1.6.3
-pymatgen==2020.12.31
 
 # Temporary qmpy3 dependency
-qmpy-tri==2021.6.11
+qmpy-tri>=2021.6.11