In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import pickle
# Restore the column selection that later cells use to pick the feature
# columns out of the descriptor frames.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('outputs/non_constant_columns.pkl', mode='rb') as pkl_file:
    non_constant_columns = pickle.load(pkl_file)

In [None]:
# Source dataset table; its HOMO_DFT / LUMO_DFT columns are attached to the
# source descriptor frame in a later cell.
source_energy_levels_df = pd.read_csv(
    'outputs/processed_source_dataset.csv'
)
# Preview the first rows as a quick sanity check.
source_energy_levels_df.head(n=5)

In [None]:
# Descriptor table for the source molecules.
# NOTE: read_pickle relies on pickle -- only load files you trust.
source_df = pd.read_pickle(
    'outputs/source_descriptors_processed.pkl'
)
# Preview the first rows as a quick sanity check.
source_df.head(n=5)

In [None]:
# Attach the DFT energy levels to the source descriptors, column-wise.
# NOTE(review): a column-wise concat aligns rows on the index, so this assumes
# both frames share the same index in the same order -- TODO confirm.
source_df = pd.concat(
    [source_df, source_energy_levels_df.loc[:, ['HOMO_DFT', 'LUMO_DFT']]],
    axis='columns',
)
# The energy-level frame is not needed after this point; release it.
# Because of this `del`, the cell can only be run once per kernel session.
del source_energy_levels_df
source_df.head(n=5)

In [None]:
# Descriptor table for the target molecules.
# NOTE: read_pickle relies on pickle -- only load files you trust.
target_df = pd.read_pickle(
    'outputs/target_descriptors_calculated_n_processed.pkl'
)
# Preview the first rows as a quick sanity check.
target_df.head(n=5)

In [7]:
# Keep only the target molecules whose HOMO_DFT label is present; rows with a
# missing label cannot be used for fitting or scoring.
target_df = target_df.dropna(subset=['HOMO_DFT'])

In [None]:
# Target set: feature matrix (the saved column selection) and HOMO_DFT labels.
X_target = target_df.loc[:, non_constant_columns]
y_target = target_df.loc[:, 'HOMO_DFT']
# Show both shapes so the row counts can be checked against each other.
(X_target.shape, y_target.shape)

In [None]:
# First source subset: the first 233 rows of the source frame.
# NOTE(review): 233 is a hard-coded positional boundary; presumably it marks
# where one source dataset ends and the next begins -- TODO confirm and
# promote to a named constant.
X_source1 = source_df.iloc[:233][non_constant_columns]
y_source1 = source_df.iloc[:233]['HOMO_DFT']
(X_source1.shape, y_source1.shape)

In [None]:
# Second source subset: every source row from position 233 onwards.
# NOTE(review): 233 is a hard-coded positional boundary; presumably it marks
# where one source dataset ends and the next begins -- TODO confirm and
# promote to a named constant.
X_source2 = source_df.iloc[233:][non_constant_columns]
y_source2 = source_df.iloc[233:]['HOMO_DFT']
(X_source2.shape, y_source2.shape)

In [11]:
# Stack the three datasets row-wise into one training set. The order
# (target, source1, source2) matters: the per-sample weights are built in the
# same order.
X = pd.concat([X_target, X_source1, X_source2], axis='index')
y = pd.concat([y_target, y_source1, y_source2], axis='index')

In [None]:
# Relative importance of each dataset during fitting: target rows count far
# more than source1 rows, which in turn count more than source2 rows.
target_weight = 100000
source1_weight = 1000
source2_weight = 1

# One weight per training row, laid out in the same target -> source1 ->
# source2 order in which X and y are stacked.
sample_weights = np.repeat(
    [target_weight, source1_weight, source2_weight],
    [X_target.shape[0], X_source1.shape[0], X_source2.shape[0]],
)

sample_weights.shape

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor
# Fit one gradient-boosting regressor on the stacked data; the per-row weights
# decide how much each dataset contributes. The seed is fixed so the fit is
# repeatable.
model = HistGradientBoostingRegressor(random_state=0).fit(
    X, y, sample_weight=sample_weights
)
# Display the fitted estimator as the cell output.
model

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error on the target rows.
# NOTE: these rows were part of the data the model was fitted on, so this is
# an in-sample (training) error, not an estimate of held-out performance.
preds = model.predict(X_target)
# Pass the arguments by keyword in the documented (y_true, y_pred) roles; MAE
# happens to be symmetric, but an asymmetric metric would silently be wrong
# with the arguments swapped.
mean_absolute_error(y_true=y_target, y_pred=preds)

In [None]:
# Mean absolute error on the first source subset.
# NOTE: these rows were part of the data the model was fitted on, so this is
# an in-sample (training) error, not an estimate of held-out performance.
preds = model.predict(X_source1)
# Pass the arguments by keyword in the documented (y_true, y_pred) roles; MAE
# happens to be symmetric, but an asymmetric metric would silently be wrong
# with the arguments swapped.
mean_absolute_error(y_true=y_source1, y_pred=preds)

In [None]:
# Mean absolute error on the second source subset.
# NOTE: these rows were part of the data the model was fitted on, so this is
# an in-sample (training) error, not an estimate of held-out performance.
preds = model.predict(X_source2)
# Pass the arguments by keyword in the documented (y_true, y_pred) roles; MAE
# happens to be symmetric, but an asymmetric metric would silently be wrong
# with the arguments swapped.
mean_absolute_error(y_true=y_source2, y_pred=preds)

In [None]:
import os
from joblib import dump, load

# Persist the fitted HOMO_DFT regressor so it can be reloaded without refitting.
model_path = 'outputs/models/homo_dft.joblib'
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(model, model_path)