Merge pull request #498 from JosephMontoya-TRI/remove_tensorflow
Remove tensorflow
JosephMontoya-TRI committed Jan 24, 2022
2 parents 6b0cd0f + b68d063 commit 697704b
Showing 10 changed files with 23 additions and 278 deletions.
1 change: 0 additions & 1 deletion .github/workflows/camd-test-main.yml
@@ -26,7 +26,6 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install `grep numpy requirements.txt`
           pip install -r requirements.txt
           pip install -e .[tests]
       - name: pytest
2 changes: 1 addition & 1 deletion Dockerfile
@@ -18,7 +18,7 @@ COPY setup.py requirements.txt /home/camd/
 
 # Install package
 RUN source /opt/conda/bin/activate camd && \
-    pip install `grep numpy requirements.txt` && \
+    # pip install `grep numpy requirements.txt` && \
     pip install -r requirements.txt
 
 COPY camd /home/camd/camd
2 changes: 1 addition & 1 deletion camd/__init__.py
@@ -11,7 +11,7 @@
 from tqdm import tqdm as _tqdm
 
 
-__version__ = "2021.6.11-post6"
+__version__ = "2021.6.11-post9"
 
 CAMD_ROOT = os.path.dirname(os.path.abspath(__file__))
 CAMD_TEST_FILES = os.path.join(CAMD_ROOT, "tests", "test_files")
209 changes: 1 addition & 208 deletions camd/agent/stability.py
@@ -4,18 +4,13 @@
 and hypothesizing
 """
 
-import time
 import abc
 import json
 import os
 from copy import deepcopy
 from multiprocessing import cpu_count
 from collections import OrderedDict
 
 import pandas as pd
-import tensorflow as tf
-import gpflow
-from gpflow.ci_utils import ci_niter
 import numpy as np
 from qmpy.analysis.thermodynamics.phase import Phase, PhaseData
 from camd.analysis import PhaseSpaceAL, ELEMENTS
@@ -26,11 +26,10 @@
 from sklearn.gaussian_process.kernels import RBF, ConstantKernel
 from sklearn.gaussian_process import GaussianProcessRegressor
 from sklearn.linear_model import LinearRegression
-from sklearn.model_selection import cross_val_score, KFold, train_test_split
+from sklearn.model_selection import cross_val_score, KFold
 from sklearn.preprocessing import StandardScaler
 from sklearn.pipeline import Pipeline
 from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
-from sklearn.cluster import MiniBatchKMeans
 from sklearn.metrics import pairwise_distances
 
 # TODO: Adaptive N_query and subsampling of candidate space
@@ -471,207 +465,6 @@ def get_hypotheses(self, candidate_data, seed_data=None):
         return within_hull.head(self.n_query)
 
 
-class SVGProcessStabilityAgent(StabilityAgent):
-    """
-    Stochastic variational gaussian process stability agent for Big Data.
-    The computational complexity of this algorithm scales as O(M^3)
-    compared to O(N^3) of standard GP, where N is the number of data points
-    and M is the number of inducing points (M<<N).
-    The default parameters are optimized to deliver a compromise between
-    compute-time and model accuracy for data sets with up to 25 to 40
-    thousand examples (e.g. the ICSD seed data). For bigger systems,
-    parameter M may need to be reduced. For smaller systems, it can be
-    increased, if higher accuracy is desired. Inducing point locations
-    are determined using k-means clustering.
-    References:
-        Hensman, James, Nicolo Fusi, and Neil D. Lawrence. "Gaussian
-            processes for big data." Uncertainty in Artificial
-            Intelligence (2013).
-        Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic
-            optimization." arXiv preprint arXiv:1412.6980 (2014).
-    """
-
-    def __init__(
-        self,
-        candidate_data=None,
-        seed_data=None,
-        n_query=1,
-        hull_distance=0.0,
-        parallel=cpu_count(),
-        alpha=0.5,
-        M=600,
-        maxiter=20000
-    ):
-        """
-        Args:
-            candidate_data (DataFrame): data about the candidates
-            seed_data (DataFrame): data which to fit the Agent to
-            n_query (int): number of hypotheses to generate
-            hull_distance (float): hull distance as a criteria for
-                which to deem a given material as "stable"
-            parallel (bool): whether to use multiprocessing
-                for phase stability analysis
-            alpha (float): weighting factor for the stdev in making
-                best-case predictions of the stability
-            M (int): number of inducing points associated with the
-                SVGProcess
-            maxiter (int): number of maximum iterations of the SVG
-                process
-        """
-        super(SVGProcessStabilityAgent, self).__init__(
-            candidate_data=candidate_data,
-            seed_data=seed_data,
-            n_query=n_query,
-            hull_distance=hull_distance,
-            parallel=parallel,
-        )
-        self.alpha = alpha
-        self.M = M
-        self.maxiter = maxiter
-
-        # Define non-argument SVG-specific attributes
-        self.kernel = gpflow.kernels.RBF(273) * gpflow.kernels.Constant(273)
-        self.mean_f = gpflow.mean_functions.Constant()
-        self.logger = None
-        self.model = None
-        self.pred_y = None
-        self.pred_std = None
-
-    def get_hypotheses(self, candidate_data, seed_data=None):
-        """
-        Get hypotheses method for SVGProcessStabilityAgent.
-        Code used from gpflow examples for big data, see:
-        https://github.com/GPflow/GPflow/blob/develop/doc/source/notebooks/advanced/gps_for_big_data.pct.py
-        Args:
-            candidate_data (pandas.DataFrame): dataframe of candidates
-            seed_data (pandas.DataFrame): dataframe of prior data on
-                which to fit model
-        Returns:
-            (pandas.DataFrame): top candidates from the algorithm
-        """
-        X_cand, X_seed, y_seed = self.update_data(candidate_data, seed_data)
-
-        # Test model performance first. Note we avoid doing CV to
-        # reduce compute time. We simply do a 1-way split 80:20 (train:test)
-        X_train, X_test, y_train, y_test = train_test_split(
-            X_seed, y_seed, test_size=0.2, random_state=42
-        )
-        scaler = StandardScaler()
-        X_train_scaled = scaler.fit_transform(X_train)
-
-        # Do a k-means clustering and use cluster centers as inducing points.
-        cls = MiniBatchKMeans(n_clusters=self.M, batch_size=200)
-        cls.fit(X_train_scaled)
-        Z = cls.cluster_centers_
-        _y = np.array(y_train.to_list())
-        mu = np.mean(_y)
-        sig = np.std(_y)
-        print(Z, _y.shape)
-        print(sig, mu)
-        model = gpflow.models.SVGP(
-            self.kernel,
-            gpflow.likelihoods.Gaussian(),
-            Z,
-            mean_function=self.mean_f,
-        )
-        elbo = tf.function(model.elbo)
-
-        # TensorFlow re-traces & compiles a `tf.function`-wrapped method at *every* call
-        # if the arguments are numpy arrays instead of tf.Tensors. Hence:
-        tensor_data = tuple(map(tf.convert_to_tensor, (X_seed, pd.DataFrame(y_seed))))
-        elbo(tensor_data)  # run it once to trace & compile
-        minibatch_size = 100
-
-        # We turn off training for inducing point locations
-        gpflow.utilities.set_trainable(model.inducing_variable, False)
-
-        train_dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, pd.DataFrame(y_train))) \
-            .repeat() \
-            .shuffle(len(X_train_scaled))
-
-        def run_adam(model, iterations):
-            """
-            Utility function running the Adam optimizer
-            :param model: GPflow model
-            :param iterations: number of iterations
-            """
-            # Create an Adam Optimizer action
-            logf = []
-            train_iter = iter(train_dataset.batch(minibatch_size))
-            training_loss = model.training_loss_closure(train_iter, compile=True)
-            optimizer = tf.optimizers.Adam()
-
-            @tf.function
-            def optimization_step():
-                optimizer.minimize(training_loss, model.trainable_variables)
-
-            for step in range(iterations):
-                optimization_step()
-                if step % 10 == 0:
-                    elbo = -training_loss().numpy()
-                    logf.append(elbo)
-            return logf
-
-        print("training")
-        t0 = time.time()
-        run_adam(model, ci_niter(self.maxiter))
-        print("elapsed time: ", time.time() - t0)
-
-        pred_y, pred_v = model.predict_y(scaler.transform(X_test))
-        pred_y = pred_y * sig + mu
-        self.cv_score = np.mean(np.abs(pred_y - y_test.to_numpy().reshape(-1, 1)))
-        print("cv score", self.cv_score)
-        self.model = model
-
-        # Overall model
-        scaler = StandardScaler()
-        X_scaled = scaler.fit_transform(X_seed)
-        cls = MiniBatchKMeans(n_clusters=self.M, batch_size=200)
-        cls.fit(X_scaled)
-        Z = cls.cluster_centers_
-        _y = np.array(y_seed.to_list())
-        mu = np.mean(_y)
-        sig = np.std(_y)
-        model = gpflow.models.SVGP(
-            self.kernel,
-            gpflow.likelihoods.Gaussian(),
-            Z,
-            mean_function=self.mean_f,
-        )
-        maxiter = gpflow.ci_utils.ci_niter(self.maxiter)
-        run_adam(model, maxiter)
-        print(self.model)
-        self.model = model
-
-        # GP makes predictions for Hf and uncertainty*alpha on candidate data
-        pred_y, pred_v = model.predict_y(scaler.transform(X_cand))
-        pred_y = pred_y * sig + mu
-        self.pred_y = np.array(pred_y).reshape(-1,)
-        self.pred_std = (np.array(pred_v) ** 0.5).reshape(-1,)
-
-        expected = self.pred_y - self.pred_std * self.alpha
-        print("expected improv", expected)
-
-        # Update candidate data dataframe with predictions
-        self.update_candidate_stabilities(expected, sort=True, floor=-6.0)
-
-        # Find the most stable ones up to n_query within hull_distance
-        stability_filter = self.candidate_data["pred_stability"] <= self.hull_distance
-        within_hull = self.candidate_data[stability_filter]
-
-        return within_hull.head(self.n_query)
-
-
 class BaggedGaussianProcessStabilityAgent(StabilityAgent):
     """
     An ensemble GP learner that can handle relatively large
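The docstring of the removed SVGProcessStabilityAgent above describes the core trick: pick M << N inducing points via k-means and pay O(M^3) instead of O(N^3) for GP training. For reference, that selection step needs nothing beyond scikit-learn; the sketch below mirrors the deleted code's MiniBatchKMeans(n_clusters=M, batch_size=200) call, with synthetic stand-in data (the array shapes and variable names here are illustrative, not part of the codebase):

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler

# Synthetic stand-in for the featurized seed data the agent received
rng = np.random.default_rng(0)
X_seed = rng.normal(size=(5000, 10))

M = 600  # number of inducing points, M << N (the removed agent's default)
X_scaled = StandardScaler().fit_transform(X_seed)

# Cluster centers become the inducing-point locations Z that the
# (now removed) gpflow SVGP model was initialized with
kmeans = MiniBatchKMeans(n_clusters=M, batch_size=200)
kmeans.fit(X_scaled)
Z = kmeans.cluster_centers_  # shape (M, 10)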
10 changes: 2 additions & 8 deletions camd/agent/tests/test_stability.py
@@ -4,8 +4,8 @@
 from sklearn.model_selection import train_test_split
 from camd import CAMD_TEST_FILES
 from camd.agent.stability import QBCStabilityAgent, AgentStabilityML5, \
-    GaussianProcessStabilityAgent, SVGProcessStabilityAgent, \
-    BaggedGaussianProcessStabilityAgent, AgentStabilityAdaBoost
+    GaussianProcessStabilityAgent, BaggedGaussianProcessStabilityAgent, \
+    AgentStabilityAdaBoost
 
 
 class StabilityAgentsTest(unittest.TestCase):
@@ -31,12 +31,6 @@ def test_gp_stability_agent(self):
         hypotheses = agent.get_hypotheses(
             candidate_data=self.candidate_data, seed_data=self.seed_data)
 
-    def test_svg_process_stability_agent(self):
-        agent = SVGProcessStabilityAgent(M=100, maxiter=6)
-        hypotheses = agent.get_hypotheses(
-            candidate_data=self.candidate_data, seed_data=self.seed_data,
-        )
-
     def test_bagged_gp_stability_agent(self):
         agent = BaggedGaussianProcessStabilityAgent(max_samples=100)
         hypotheses = agent.get_hypotheses(
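With SVGProcessStabilityAgent and its test gone, the exact-GP agent exercised just above covers the same role for moderately sized seed data. A hypothetical usage sketch (the constructor arguments are assumed to follow the shared StabilityAgent signature seen in the removed class's __init__; candidate_data and seed_data stand for featurized DataFrames like the test fixtures):

from camd.agent.stability import GaussianProcessStabilityAgent

# candidate_data / seed_data: featurized pandas DataFrames (as in setUp above)
agent = GaussianProcessStabilityAgent(n_query=10, hull_distance=0.05)
hypotheses = agent.get_hypotheses(candidate_data=candidate_data,
                                  seed_data=seed_data)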
6 changes: 3 additions & 3 deletions camd/analysis.py
@@ -272,10 +272,10 @@ def analyze_vaspqmpy_jobs(self, jobs, against_icsd=False, use_energies=False):
         self.energies = []
         for j, r in jobs.iterrows():
             if r["status"] == "SUCCEEDED":
-                rdict = r['result'].as_dict()
-                self.structures.append(r['result'].final_structure)
+                final_structure = r['result'].final_structure
+                self.structures.append(final_structure)
                 self.structure_ids.append(j)
-                self.energies.append(rdict["output"]["final_energy_per_atom"])
+                self.energies.append(r['result'].final_energy / len(final_structure))
         if use_energies:
             return self.analyze(
                 self.structures, self.structure_ids, against_icsd, self.energies
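The change above reads the per-atom energy straight off the pymatgen result object instead of its serialized dict, avoiding a full as_dict() call per job. A minimal equivalence sketch (the file path is illustrative; for a converged run the two expressions should agree):

from pymatgen.io.vasp.outputs import Vasprun

vr = Vasprun("vasprun.xml")  # illustrative path
# Total final energy divided by the number of sites in the final structure...
per_atom = vr.final_energy / len(vr.final_structure)
# ...matches the field the old code read from the serialized dict:
# vr.as_dict()["output"]["final_energy_per_atom"]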
3 changes: 3 additions & 0 deletions camd/campaigns/tests/test_structure_discovery.py
@@ -29,6 +29,9 @@ def teardown_s3():
 
 class ProtoDFTCampaignTest(unittest.TestCase):
     def test_simulated(self):
+        # Note that there's a small issue with pickled results here that may not have
+        # certain spin and magnetization flags set - pickled Vasprun objects may not
+        # be completely compatible with latest version of pymatgen
         exp_dataframe = pd.read_pickle(os.path.join(CAMD_TEST_FILES, "mn-ni-o-sb.pickle"))
         experiment = ATFSampler(exp_dataframe)
         candidate_data = exp_dataframe.iloc[:, :-11]
31 changes: 1 addition & 30 deletions camd/tests/test_atf_examples.py
@@ -8,7 +8,7 @@
 from sklearn.neural_network import MLPRegressor
 from sklearn.gaussian_process.kernels import RBF, ConstantKernel
 
-from camd.agent.stability import QBCStabilityAgent, GaussianProcessStabilityAgent, SVGProcessStabilityAgent, \
+from camd.agent.stability import QBCStabilityAgent, GaussianProcessStabilityAgent, \
     BaggedGaussianProcessStabilityAgent, AgentStabilityAdaBoost
 from camd.agent.base import RandomAgent
 from camd.agent.generic import GenericGPUCB, GPBatchUCB
@@ -211,35 +211,6 @@ def test_mp_loop(self):
         self.assertEqual(new_loop.iteration, 7)
 
 
-@unittest.skipUnless(CAMD_LONG_TESTS, SKIP_MSG)
-class AtfSVGPLoopTest(unittest.TestCase):
-    def setUp(self):
-        self.pwd = os.getcwd()
-        self.tempdir = tempfile.mkdtemp()
-        os.chdir(self.tempdir)
-
-    def tearDown(self):
-        os.chdir(self.pwd)
-        shutil.rmtree(self.tempdir)
-
-    def test_svgp_loop(self):
-        df = pd.read_csv(os.path.join(CAMD_TEST_FILES, 'test_df.csv'))
-        df_sub = df[df['N_species'] <= 3]
-        n_seed = 200  # Starting sample size
-        agent = SVGProcessStabilityAgent(n_query=10, hull_distance=0.05, alpha=0.5, M=100)
-        analyzer = StabilityAnalyzer(hull_distance=0.05, parallel=False)
-        experiment = ATFSampler(df_sub)
-        candidate_data = df_sub
-
-        new_loop = Campaign(candidate_data, agent, experiment, analyzer,
-                            create_seed=n_seed)
-        new_loop.initialize()
-        self.assertTrue(new_loop.initialized)
-
-        new_loop.auto_loop(3)
-        self.assertTrue(True)
-
-
 class GPBatchUCBAgent(unittest.TestCase):
     def setUp(self):
         self.pwd = os.getcwd()
19 changes: 5 additions & 14 deletions requirements.txt
@@ -1,28 +1,19 @@
-numpy==1.19.2
 Django==3.1.13
-gpflow==2.1.5
 python-dateutil==2.8.1
 networkx==2.5.1
 matplotlib==3.4.2
 pandas==1.2.4
-matminer==0.6.5
+matminer==0.7.4
 autologging
-awscli==1.19.95
-boto3==1.17.93
+boto3==1.20.41
 docopt==0.6.2
 scikit-learn==0.24.1
 taburu==2020.5.9
 protosearch==2020.5.10
-GPy==1.9.9
+GPy==1.10.0
 watchtower==1.0.6
+awscli
 
-# Pinned dependencies to help pip
-tensorflow==2.7.0
-sympy==1.8
-sqlparse==0.4.2
-spglib==1.16.1
-scipy==1.6.3
-pymatgen==2020.12.31
 
 # Temporary qmpy3 dependency
-qmpy-tri==2021.6.11
+qmpy-tri>=2021.6.11