# AIxChem

### The Dataset class

#### Loading
The ```Dataset()``` class is one of the central objects in the framework. It offers a convenient way to handle X and y data simultaneously.
You can create a ```Dataset()``` instance from various sources.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from aixchem import Dataset

# (1) Two NumPy arrays: one holding the features, one holding the target.
feature_array = np.random.rand(10, 4)
target_array = np.random.rand(10, 1)
dataset = Dataset(feature_array, target_array)

# (2) A pandas DataFrame of features plus a separately created Series as target.
feature_frame = pd.DataFrame(np.random.rand(10, 4))
target_series = pd.Series(np.random.rand(10))
dataset = Dataset(feature_frame, target_series)

# (3) One DataFrame as the common source; the target is given by its column name.
column_names = ["A", "B", "C", "D"]
table = pd.DataFrame(np.random.rand(10, 4), columns=column_names)
dataset = Dataset(table, "D")

# (4) Same DataFrame, several targets given as a list of column names.
dataset = Dataset(table, ["D", "C"])

# (5) A CSV file on disk (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem.joinpath("datasets", "buttar_norrby_dataset.csv")
dataset = Dataset(csv_path, target="exp_activation_energy")

#### Cleaning
The ```Dataset()``` class offers several functions for cleaning the data, dropping columns and rows etc. (see source code for more info). Some simple cleaning operations could look like this:

In [None]:
from aixchem import Dataset

# Load the dataset from a CSV file (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
data = AIxChem / "datasets" / "buttar_norrby_dataset.csv"
dataset = Dataset(data, target="exp_activation_energy")

# Drop rows with missing values
dataset.dropna(axis=0)
# Shuffle the data
dataset = dataset.shuffle(random_state=42)

# Note that these functions return the Dataset() instance itself (see above), which means you can also chain operations e.g. like this:
# dataset.dropna(axis=0).shuffle(random_state=42)

# Drop the unnecessary columns
dataset.drop(columns=["Unnamed: 0"])

# Drop highly correlated features
dataset.correlation(thr=0.8)

#### Exploratory Data Analysis

The ```Dataset()``` class offers a ```summary()``` function that can be helpful for EDA. (NOTE: is_categorical is still under development)

In [None]:
from aixchem import Dataset


# Read the CSV dataset from disk (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem.joinpath("datasets", "buttar_norrby_dataset.csv")
dataset = Dataset(csv_path, target="exp_activation_energy")

# Last expression of the cell, so the notebook displays the summary.
dataset.summary()

You can also check column types easily (use with care):

In [None]:
from aixchem import Dataset

# Read the CSV dataset from disk (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem.joinpath("datasets", "buttar_norrby_dataset.csv")
dataset = Dataset(csv_path, target="exp_activation_energy")

# Run both type checks on the first feature column and print each result.
first_column = dataset.X.columns[0]
categorical_flag = dataset.is_categorical(first_column)
print(categorical_flag)
numeric_flag = dataset.is_numeric(first_column)
print(numeric_flag)

#### Saving

You can directly save the ```Dataset()``` as an excel file (1 worksheet for data, 1 for labels).

In [None]:
from aixchem import Dataset

# Read the CSV dataset from disk (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem.joinpath("datasets", "buttar_norrby_dataset.csv")
dataset = Dataset(csv_path, target="exp_activation_energy")

# Write the dataset to an Excel workbook inside docs/examples.
save_path = AIxChem.joinpath("docs/examples", "test.xlsx")
dataset.save(save_path, index=False)

#### Magic methods

The ```Dataset()``` class offers certain magic methods. E.g. Dataset(1) + Dataset(2) will allow you to concatenate 2 datasets, whereas Dataset(1) - Dataset(2) will remove the entries of Dataset(2) from Dataset(1).



In [None]:
from aixchem import Dataset

# Read the CSV dataset from disk (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem.joinpath("datasets", "buttar_norrby_dataset.csv")
dataset = Dataset(csv_path, target="exp_activation_energy")

# Shapes of the full dataset, for reference.
print(dataset.X.shape, dataset.y.shape)

# Split the dataset into two subsets and print the shapes of each.
d1, d2 = dataset.split(size=.27)
for subset in (d1, d2):
    print(subset.X.shape, subset.y.shape)

# "+" concatenates two datasets.
combined = d1 + d2
print(combined.X.shape, combined.y.shape)

# "-" removes the entries of the right-hand dataset from the left-hand one.
remainder = dataset - d2
print(remainder.X.shape, remainder.y.shape)

#### Other utils

Several other functions exist that can be useful

In [None]:
from aixchem import Dataset

# Read the CSV dataset from disk (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem.joinpath("datasets", "buttar_norrby_dataset.csv")
dataset = Dataset(csv_path, target="exp_activation_energy")

# split() yields two datasets, used here as training and test set.
parts = dataset.split(size=.2)
train, test = parts

# copy() gives a copy of the dataset.
dataset_copy = dataset.copy()

# ...

### The Transformer class

The Transformer class allows you to apply classic transformations to your dataset (such as scaling and one-hot encoding) as well as data augmentation. Currently, we have four augmenters in the pipeline:

1. Additive Gaussian Noise (AGN)
2. Nearest Neighbour SMOTE (Synthetic Minority Oversampling Technique)
3. Tabular Variational AutoEncoder (TVAE)
4. Conditional Tabular GAN (CTGAN; GAN = Generative Adversarial Network)


Here an example with AGN...

In [None]:
from aixchem import Dataset
from aixchem.transform import augment 
from aixchem.transform import preprocess

# Locate the CSV dataset (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem / "datasets" / "buttar_norrby_dataset.csv"

# Clean the data step by step, then split it into train and test sets.
full = Dataset(csv_path, target="exp_activation_energy")
full = full.drop(columns=["Unnamed: 0"])
full = full.shuffle(random_state=42)
full = full.dropna(axis=0)
train, test = full.split(size=.7, random_state=42)

# Fit the scaler on the training data and apply it to both sets.
scaler = preprocess.Scaler().fit(train, columns=None)
train = scaler.transform(train)
test = scaler.transform(test)

# With n=10, ten augmented instances are created for each instance in the dataset,
# so a dataset with N entries grows to N + 10N = 11N entries.
noise = augment.AdditiveGaussianNoise(sigma=0.5, n=10, random_state=42)
aug = noise.fit(train)

# transform() creates the augmented instances and adds them to the dataset,
# returning the augmented dataset (augmented + original).
augmented = aug.transform(train)

print(f"Before augmentation: {train.X.shape}, after augmentation: {augmented.X.shape}")

...and here with CTGAN

In [None]:
from aixchem import Dataset
from aixchem.transform import augment 
from aixchem.transform import preprocess

# Locate the CSV dataset (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem / "datasets" / "buttar_norrby_dataset.csv"

# Clean the data step by step, then split it into train and test sets.
full = Dataset(csv_path, target="exp_activation_energy")
full = full.drop(columns=["Unnamed: 0"])
full = full.shuffle(random_state=42)
full = full.dropna(axis=0)
train, test = full.split(size=.7, random_state=42)

# Fit the scaler on the training data and apply it to both sets.
scaler = preprocess.Scaler().fit(train, columns=None)
train = scaler.transform(train)
test = scaler.transform(test)

# Configure the CTGAN augmenter and fit it on the scaled training data.
ctgan = augment.CTGAN(n=2, epochs=100, random_state=42, verbose=0)
aug = ctgan.fit(train)

# transform() creates the augmented instances and adds them to the dataset,
# returning the augmented dataset (augmented + original).
augmented = aug.transform(train)

print(f"Before augmentation: {train.X.shape}, after augmentation: {augmented.X.shape}")

### The Model class

#### Decomposition Models

There are several decomposition algorithms available that allow you to embed your data into a latent space. After using the run() method, you will find a new Dataset() instance in the decomposition.embedding attribute. The embedding.X data corresponds to your embedded data, while the embedding.raw attribute corresponds to the unembedded data.

##### PCA

For PCA additional class attributes exist that allow you to access a summary, the loadings and the most important features for each PC.

In [None]:
# NOTE: aixchem must be importable from this environment.
from aixchem.models import decomposition
from aixchem.test.data import regression_dataset

# Load some dummy dataset
data = regression_dataset()

# Configure a 4-component PCA, then run it on the data.
pca_model = decomposition.PCA(n_components=4)
pca = pca_model.run(data)

# Your embedded/unembedded data is stored here:
pca.embedding.X, pca.embedding.raw

# Other useful information (last expression, so the notebook displays it):
pca.summary, pca.loadings, pca.feature_ranking

##### UMAP and t-SNE

Both of these don't offer any additional information yet.

In [None]:
from aixchem.models import decomposition
from aixchem.test.data import regression_dataset

# Load some dummy dataset
data = regression_dataset()

# Run a 2-component UMAP first, then a 2-component t-SNE, on the same data.
umap_model = decomposition.UMAP(n_components=2)
umap = umap_model.run(data)
tsne_model = decomposition.tSNE(n_components=2)
tsne = tsne_model.run(data)

# Embedded data of both models (last expression, so the notebook displays it).
umap.embedding.X, tsne.embedding.X

### The Validator class

Validators allow you to easily generate splits of your dataset object. You can use instances of sklearn splitters for this (e.g. `KFold` for regression, `StratifiedKFold` for classification).

In [None]:
from sklearn.model_selection import KFold

from aixchem import Dataset
from aixchem.validation.core import CrossValidator


# Read the CSV dataset from disk (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem.joinpath("datasets", "buttar_norrby_dataset.csv")
dataset = Dataset(csv_path, target="exp_activation_energy")

# Wrap a shuffled 10-fold sklearn splitter in a CrossValidator.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
validator = CrossValidator(splitter=kfold)

# Iterate over the folds and print the feature shapes of each train/test pair.
for fold in validator.split(dataset):
    train, test = fold
    print(train.X.shape, test.X.shape)

Instead of using a for loop to iterate (sequentially) over each fold from split, you can define a function and run it for each fold in the validator in parallel with the specified number of (logical) cpu cores.

In [None]:
from sklearn.model_selection import KFold

from aixchem import Dataset
from aixchem.validation.core import CrossValidator

# Read the CSV dataset from disk (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem.joinpath("datasets", "buttar_norrby_dataset.csv")
dataset = Dataset(csv_path, target="exp_activation_energy")

# Wrap a shuffled 4-fold sklearn splitter in a CrossValidator.
kfold = KFold(n_splits=4, shuffle=True, random_state=42)
validator = CrossValidator(splitter=kfold)


def main(train, test):
    """Return the shapes of the ``X`` attribute of the train and test splits."""
    train_shape = train.X.shape
    test_shape = test.X.shape
    return train_shape, test_shape


# Run `main` for each fold of the validator, using 4 (logical) CPU cores.
n_workers = 4
results = validator.run(dataset, main, n_cpus=n_workers)

# Last expression of the cell, so the notebook displays what run() returned.
results

### Workflow example

In the following you can see a typical regression workflow. We first load, clean and split our dataset into train and test data.
Then we do some transformations (scaling numeric features, encoding categorical features) based on our train data.
Finally, we build our model and evaluate it on both train and test data.

In [None]:
from aixchem import Dataset
from aixchem.transform import preprocess
from aixchem.models import regression

# Load, clean and shuffle the dataset (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
data = AIxChem / "datasets" / "buttar_norrby_dataset.csv"
dataset = Dataset(data, target="exp_activation_energy").dropna(axis=0).shuffle(random_state=42).drop(columns=["Unnamed: 0"])
train, test = dataset.split(size=.75, random_state=42)

# Preprocessing with Scaler and OneHotEncoder.
# Both transformers are fitted on the training data and then applied to train and test.
scaler = preprocess.Scaler().fit(train, columns=None)
train, test = scaler.transform(train), scaler.transform(test)

# Apply the fitted encoder here (not the scaler a second time).
ohe = preprocess.OneHotEncoder().fit(train, columns=None)
train, test = ohe.transform(train), ohe.transform(test)

# Fit the model on the preprocessed training data.
model = regression.RandomForest(n_estimators=1000, max_depth=50, random_state=42).fit(train)

# Evaluate on both train and test data.
print(f"train: {model.evaluate(train)}, test: {model.evaluate(test)}")

Example workflow with cross-validation

In [None]:
from aixchem import Dataset
from aixchem.transform import preprocess
from aixchem.models import regression
from sklearn.model_selection import KFold
from aixchem.validation.core import CrossValidator

# Read the CSV dataset and drop rows with missing values
# (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem.joinpath("datasets", "buttar_norrby_dataset.csv")
dataset = Dataset(csv_path, target="exp_activation_energy", categorical_thr=None).dropna(axis=0)
dataset.drop(columns=["Unnamed: 0"])

print(dataset.X.shape, dataset.y.shape)

# Wrap a shuffled 4-fold sklearn splitter in a CrossValidator.
kfold = KFold(n_splits=4, shuffle=True, random_state=42)
validator = CrossValidator(splitter=kfold)

for fold in validator.split(dataset):
    train, test = fold

    # Fit the scaler on the training fold, then apply it to both folds.
    scaler = preprocess.Scaler().fit(train, columns=None)
    train = scaler.transform(train)
    test = scaler.transform(test)

    print(train.X.shape, test.X.shape)

    # Optional one-hot encoding step (currently disabled):
    #ohe = preprocess.OneHotEncoder().fit(train, columns=None)
    #train, test = ohe.transform(train), ohe.transform(test)

    # Fit a random forest on the training fold.
    forest = regression.RandomForest(n_estimators=10, max_depth=10, random_state=42)
    model = forest.fit(train)

    print(f"train: {model.evaluate(train)}, test: {model.evaluate(test)}")

Example workflow with cross-validation and parallel execution

In [None]:
from aixchem import Dataset
from aixchem.transform import preprocess
from aixchem.models import regression
from sklearn.model_selection import KFold
from aixchem.validation.core import CrossValidator

# Read the CSV dataset and drop rows with missing values
# (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem.joinpath("datasets", "buttar_norrby_dataset.csv")
dataset = Dataset(csv_path, target="exp_activation_energy", categorical_thr=None).dropna(axis=0)
dataset.drop(columns=["Unnamed: 0"])

# Wrap a shuffled 4-fold sklearn splitter in a CrossValidator.
kfold = KFold(n_splits=4, shuffle=True, random_state=42)
validator = CrossValidator(splitter=kfold)


def main(train, test):
    """Scale one fold, fit a random forest on it and report both evaluations.

    Returns a string containing the results of ``model.evaluate`` on the
    scaled train and test data.
    """
    # Fit the scaler on the training fold, then apply it to both folds
    scaler = preprocess.Scaler().fit(train, columns=None)
    train = scaler.transform(train)
    test = scaler.transform(test)

    # Optional one-hot encoding step (currently disabled):
    #ohe = preprocess.OneHotEncoder().fit(train, columns=None)
    #train, test = ohe.transform(train), ohe.transform(test)

    # Fit the model
    forest = regression.RandomForest(n_estimators=1000, max_depth=50, random_state=42)
    model = forest.fit(train)

    # Evaluate the model on train first, then on test
    train_result = model.evaluate(train)
    test_result = model.evaluate(test)
    return f"train: {train_result}, test: {test_result}"


# Run `main` for each fold of the validator, using 5 (logical) CPU cores.
n_workers = 5
results = validator.run(dataset, main, n_cpus=n_workers)

# Last expression of the cell, so the notebook displays what run() returned.
results
    

Example workflow with the pipeline class, on regression problems

A multitude of ML models are available in the model class.

In [None]:
from sklearn.model_selection import KFold
from aixchem import Dataset
from aixchem.validation import CrossValidator
from aixchem.pipeline.pipe import Pipeline
from aixchem.optimization import GridOptimizer

from aixchem.transform import preprocess
from aixchem.models import regression

# Read the CSV dataset and drop rows with missing values
# (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
csv_path = AIxChem.joinpath("datasets", "buttar_norrby_dataset.csv")
dataset = Dataset(csv_path, target="exp_activation_energy", categorical_thr=None).dropna(axis=0)
dataset.drop(columns=["Unnamed: 0"])

# Adjust the path to where you want to save the results
save_path = AIxChem / "docs/examples/regression"

# Transformers handed to the pipeline.
transformers = [
    preprocess.Scaler(),
    #[GridOptimizer(preprocess.OneHotEncoder, params={"random_state": [42]}), None]
]

# Parameter grids for the models wrapped in a GridOptimizer.
rf_grid = {
    "n_estimators": [20, 50, 100],
    "max_depth": [10, 20],
    "random_state": [42],
}
mlp_grid = {
    'hidden_neurons': [[46, 46], [92, 92], [92, 46]],
    'optimizer': ['adam', 'rmsprop'],
    'batch_size': [32, 64],
    'epochs': [500, 1000],
    'random_state': [42],
}

# Models handed to the pipeline, keyed by a short label.
models = {
    "MLR": regression.LinearModel(),
    "RF": GridOptimizer(regression.RandomForest, params=rf_grid),
    "MLP": GridOptimizer(regression.NeuralNetwork, params=mlp_grid),
}

# Shuffled 3-fold cross-validation.
validator = CrossValidator(splitter=KFold(n_splits=3, shuffle=True, random_state=42))

pipeline = Pipeline(
    dataset=dataset,
    transformers=transformers,
    models=models,
    validator=validator,
    path=save_path,
)

pipeline.run(n_cpus=5)


Example workflow using the Pipeline class for classification. In this example, we perform binary classification by splitting the label into reactions with Experimental Activation Energies above or below the 25 kcal/mol threshold

In [None]:
from sklearn.model_selection import KFold
from aixchem import Dataset
from aixchem.validation import CrossValidator
from aixchem.pipeline.pipe import Pipeline
from aixchem.optimization import GridOptimizer

from aixchem.transform import preprocess
from aixchem.models import classification

# Dataset location, label column and threshold used to binarize the label
# (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
DATA = AIxChem.joinpath("datasets", "buttar_norrby_dataset.csv")
LABEL = "exp_activation_energy"
BINARY_THR = 25

# Adjust the path to where you want to save the results
save_path = AIxChem / "docs/examples/classification"

dataset = Dataset(DATA, target=LABEL, categorical_thr=None).dropna(axis=0).shuffle(random_state=42)


def _to_binary(energy):
    """Map a label value to 0 if it is <= BINARY_THR, otherwise to 1."""
    if energy <= BINARY_THR:
        return 0
    return 1


# encode the label as binary
dataset.y[LABEL] = dataset.y[LABEL].apply(_to_binary)

dataset.drop(columns=["Unnamed: 0"])

# Transformers handed to the pipeline.
transformers = [
    preprocess.Scaler(),
    #[GridOptimizer(preprocess.OneHotEncoder, params={"random_state": [42]}), None]
]

# Models handed to the pipeline, keyed by a short label.
rf_grid = {"n_estimators": [20, 50, 100], "max_depth": [10, 20], "random_state": [42]}
models = {
    "LOG": classification.LogModel(),
    "RF": GridOptimizer(classification.RandomForest, params=rf_grid),
}

# Shuffled 3-fold cross-validation.
validator = CrossValidator(splitter=KFold(n_splits=3, shuffle=True, random_state=42))

pipeline = Pipeline(
    dataset=dataset,
    transformers=transformers,
    models=models,
    validator=validator,
    path=save_path,
)

pipeline.run(n_cpus=5)


Example of simultaneous optimization of augmenters and ML models

In [None]:
from sklearn.model_selection import KFold
from aixchem import Dataset
from aixchem.validation import CrossValidator
from aixchem.pipeline.pipe import Pipeline
from aixchem.optimization import GridOptimizer

# `augment` is used in the transformer list below, so it is imported here.
from aixchem.transform import augment
from aixchem.transform import preprocess
from aixchem.models import regression


# Load the dataset (adjust the path to where the dataset is located).
AIxChem = Path.cwd().parents[1]
DATA = AIxChem / "datasets" / "buttar_norrby_dataset.csv"
dataset = Dataset(DATA, target="exp_activation_energy", categorical_thr=None).dropna(axis=0)
dataset.drop(columns=["Unnamed: 0"])

save_path = AIxChem / "docs/examples/regression_augmentation"

pipeline = Pipeline(

    dataset=dataset,

    transformers=[
        preprocess.Scaler(),

        # Augmentation: the two alternatives are no augmentation (None) and a
        # grid over the AdditiveGaussianNoise parameters.
        [
            None,
            GridOptimizer(
                augment.AdditiveGaussianNoise, params={"n":[1, 2, 3], "sigma":[0.1, 0.5, 1.0], "random_state":[42], "transform_y":[True]},
            ),
        ],
    ],

    models={
        "MLR": regression.LinearModel(),
        "RF": GridOptimizer(regression.RandomForest, params={"n_estimators": [20, 50, 100], "max_depth": [10, 20], "random_state": [42]}),
        },

    validator=CrossValidator(splitter=KFold(n_splits=3, shuffle=True, random_state=42)),

    # Adjust the path to where you want to save the results
    path=save_path
)

pipeline.run(n_cpus=5)