Merge db76ab4 into f4720da

TRI-AMDD · Jul 26, 2022 · f4995b3 · f4995b3
2 parents f4720da + db76ab4
commit f4995b3
Show file tree

Hide file tree

Showing 65 changed files with 7,870 additions and 2,694 deletions.
diff --git a/.github/workflows/camd-docker.yml b/.github/workflows/camd-docker.yml
@@ -14,7 +14,7 @@ jobs:
         os: [
           ubuntu-latest,
         ]
-        python-version: [3.7]
+        python-version: [3.8]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v2

diff --git a/.github/workflows/camd-test-main.yml b/.github/workflows/camd-test-main.yml
@@ -13,7 +13,7 @@ jobs:
           macos-latest,
           # windows-latest
         ]
-        python-version: [3.7]
+        python-version: [3.9]
 
     runs-on: ${{ matrix.os }}
 
@@ -27,7 +27,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install -r requirements.txt
-        pip install -e .[tests]
+        pip install -e .[tests,m3gnet,atomate,proto_dft]
     - name: pytest
       env:
         MPLBACKEND: "Agg"

diff --git a/Dockerfile b/Dockerfile
@@ -5,7 +5,7 @@ SHELL ["/bin/bash", "-c"]
 ENV PATH="/opt/conda/bin/:$PATH"
 
 RUN mkdir -p /home/camd && \
-    conda create -n camd python=3.7 && \
+    conda create -n camd python=3.8 && \
     apt-get update && \
     apt-get -y install gcc g++
 
@@ -22,4 +22,4 @@ RUN source /opt/conda/bin/activate camd && \
     pip install -r requirements.txt
 
 COPY camd /home/camd/camd
-RUN pip install -e .
+RUN pip install -e .[proto_dft,m3gnet,atomate]
diff --git a/README.md b/README.md
@@ -2,6 +2,7 @@
 ![Testing - main](https://github.com/TRI-AMDD/CAMD/workflows/Testing%20-%20main/badge.svg)
 ![Linting](https://github.com/TRI-AMDD/CAMD/workflows/Linting/badge.svg)
 [![Coverage Status](https://coveralls.io/repos/github/TRI-AMDD/CAMD/badge.svg)](https://coveralls.io/github/TRI-AMDD/CAMD)
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/TRI-AMDD/camd/blob/master/examples/main_tutorial.ipynb)
 
 CAMD provides a flexible software framework for sequential / Bayesian optimization type campaigns for materials discovery. Its key features include:
 * **Agents**: Decision making entities which select experiments to run from pre-determined candidate sets. Agents can combine machine learning with physical or chemical constructs, logic, heuristics, exploration-exploitation strategies and so on. CAMD comes with several generic and structure-discovery focused agents, which can be used by the users as templates to derive new ones.

diff --git a/camd/agent/generic.py b/camd/agent/generic.py
@@ -13,6 +13,8 @@
 from sklearn.gaussian_process import GaussianProcessRegressor
 from sklearn.gaussian_process.kernels import RBF, ConstantKernel
 from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.neural_network import MLPRegressor
 
 from camd.agent.base import HypothesisAgent
 
@@ -61,7 +63,7 @@ def get_hypotheses(self, candidate_data, seed_data=None):
 
         self.candidate_data = candidate_data.drop(columns=["target"], axis=1)
         self.seed_data = seed_data
-        X_seed = seed_data.drop(columns=["target"], axis=1, errors='ignore')
+        X_seed = seed_data.drop(columns=["target"], axis=1, errors="ignore")
         y_seed = seed_data["target"]
         steps = [
             ("scaler", StandardScaler()),
@@ -203,8 +205,8 @@ def get_hypotheses(self, candidate_data, seed_data=None):
                         2
                         * np.log(
                             len(self.candidate_data)
-                            * _t ** 2
-                            * np.pi ** 2
+                            * _t**2
+                            * np.pi**2
                             / 6
                             / self.kwargs.get("delta", 0.1)
                         )
@@ -247,36 +249,37 @@ def get_hypotheses(self, candidate_data, seed_data=None):
         return self.candidate_data.loc[batch]
 
 
-class LinearAgent(HypothesisAgent):
+class RegressorAgent(HypothesisAgent):
     """
-    Linear regression based agent that tries to maximize a target.
+    Agent that fits a supplied regressor and tries to maximize a target.
     Best for simple checks and benchmarks.
     """
 
     def __init__(
         self,
+        model,
+        features=None,
+        target="target",
         candidate_data=None,
         seed_data=None,
         n_query: int = None,
-        fit_intercept: bool = True,
-        positive: bool = False,
     ):
 
         """
         Args:
+            model (sklearn.base.RegressorMixin): regressor with "fit" and "predict" methods
             candidate_data (pandas.DataFrame): data about the candidates to search over. Must have a "target" column,
                     and at least one additional column that can be used as descriptors.
             seed_data (pandas.DataFrame):  data which to fit the Agent to.
-            n_query (int): number of queries in allowed. Defaults to 1.
+            n_query (int): number of queries allowed. Defaults to 1.
-            fit_intercept (bool): if the intercept is fit for the linear regression
-            positive (bool): if true, constraint coefficients to be positive for the linear regression
         """
+        self.model = model
+        self.features = features
+        self.target = target
         self.candidate_data = candidate_data
         self.seed_data = seed_data
         self.n_query = n_query if n_query else 1
-        self.fit_intercept = fit_intercept
-        self.positive = positive
-        super(LinearAgent).__init__()
+        super(RegressorAgent).__init__()
 
     def get_hypotheses(self, candidate_data, seed_data=None):
         """
@@ -291,31 +294,91 @@ def get_hypotheses(self, candidate_data, seed_data=None):
 
         """
         # Fit on known data
-        self.candidate_data = candidate_data.drop(
-            columns=["target"], axis=1, errors="ignore"
-        )
+        self.candidate_data = candidate_data
 
         if seed_data is not None:
             self.seed_data = seed_data
         else:
             raise ValueError(
-                "Linear Agent requires a finite seed as input. "
+                "RegressorAgent requires a finite seed as input. "
                 "If you are using this as part of a Campaign, consider "
                 "the create_seed option."
             )
 
-        X_seed = seed_data.drop(columns=["target"], axis=1)
-        y_seed = seed_data["target"]
-        steps = [
-            ("scaler", StandardScaler()),
-            (
-                "linear",
-                LinearRegression(),
-            ),
-        ]
-        self.pipeline = Pipeline(steps)
-        self.pipeline.fit(X_seed, y_seed)
-        output = self.pipeline.predict(self.candidate_data)
+        if self.features is not None:
+            X_seed = seed_data[self.features]
+            X_cand = candidate_data[self.features]
+        else:
+            X_seed = seed_data.drop(columns=[self.target], axis=1)
+            X_cand = candidate_data.drop(
+                columns=[self.target], axis=1, errors="ignore"
+            )
+        y_seed = seed_data[self.target]
+        self.model.fit(X_seed, y_seed)
+        output = self.model.predict(X_cand)
         sorted_output = np.argsort(output)[::-1]
         selected = sorted_output[: self.n_query]
         return candidate_data.iloc[selected]
+
+    @classmethod
+    def from_linear(
+        cls,
+        features=None,
+        target="target",
+        candidate_data=None,
+        seed_data=None,
+        n_query: int = None,
+        **kwargs
+    ):
+        """Preset factory method for a Linear Agent"""
+        linear_reg = LinearRegression(**kwargs)
+        return cls(
+            model=linear_reg,
+            features=features,
+            target=target,
+            candidate_data=candidate_data,
+            seed_data=seed_data,
+            n_query=n_query,
+        )
+
+    @classmethod
+    def from_random_forest(
+        cls,
+        features=None,
+        target="target",
+        candidate_data=None,
+        seed_data=None,
+        n_query: int = None,
+        **kwargs
+    ):
+        """Preset factory method for a RandomForestRegressor-based Agent"""
+        rf = RandomForestRegressor(**kwargs)
+        return cls(
+            model=rf,
+            features=features,
+            target=target,
+            candidate_data=candidate_data,
+            seed_data=seed_data,
+            n_query=n_query,
+        )
+
+    @classmethod
+    def from_mlp(
+        cls,
+        features=None,
+        target="target",
+        candidate_data=None,
+        seed_data=None,
+        n_query: int = None,
+        **kwargs
+    ):
+        """Preset factory method for an MLP-based Agent"""
+        mlp = MLPRegressor(**kwargs)
+        return cls(
+            model=mlp,
+            features=features,
+            target=target,
+            candidate_data=candidate_data,
+            seed_data=seed_data,
+            n_query=n_query,
+        )