In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import csv

# Read the first row of the CSV and parse its first two fields as integers:
# the feature count and the trial count used further down in this notebook.
with open('outputs/homo_optimum_features_and_trials.csv', 'r', newline='') as csv_file:
    first_row = next(csv.reader(csv_file))

optimum_number_of_features, optimum_number_of_trials = int(first_row[0]), int(first_row[1])

# Echo the loaded values so they are visible in the cell output
print(f"Number of features: {optimum_number_of_features}, Number of trials: {optimum_number_of_trials}")


## Target dataset prep

In [3]:
import pickle
# Restore the pickled column selection that is later used to pick feature
# columns out of target_df. pickle.load can execute arbitrary code, so this
# file must come from a trusted source.
with open('outputs/non_constant_columns.pkl', 'rb') as pkl_file:
    non_constant_columns = pickle.load(pkl_file)

In [None]:
# Load the pickled target DataFrame and preview its first rows.
target_df = pd.read_pickle('outputs/target_descriptors_calculated_n_processed.pkl')
target_df.head()

In [None]:
# Summarise the frame's columns, dtypes and non-null counts.
target_df.info()

In [6]:
# Drop every row (molecule) whose HOMO_UPS value is missing
target_df = target_df.dropna(subset=['HOMO_UPS'])

In [None]:
from sklearn.utils import shuffle
# Shuffle the rows with a fixed seed so the order is reproducible on a fresh run.
# NOTE(review): this rebinds target_df, so re-running only this cell shuffles the
# already-shuffled frame and will generally give a different row order.
target_df  = shuffle(target_df, random_state=0)
target_df.head()

In [8]:
# Everything except the 'External Validation' rows is used for modelling.
# Compute the row mask once and select features / target with it.
is_training_row = target_df['Type'] != 'External Validation'
X = target_df.loc[is_training_row, non_constant_columns]
y = target_df.loc[is_training_row, 'HOMO_UPS']

In [None]:
# Sanity-check the dimensions of the feature matrix and the target.
X.shape, y.shape

In [10]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

In [11]:
# Recursive feature elimination driven by a seeded random forest, removing one
# feature per iteration.
# NOTE(review): the target count is one less than the loaded optimum, presumably
# to leave room for the DFT_pred column added later; confirm.
rf = RandomForestRegressor(random_state=0)
selector = RFE(
    estimator=rf,
    n_features_to_select=optimum_number_of_features - 1,
    step=1,
).fit(X, y)

In [None]:
# Reduce X to the features kept by the selector and check the resulting shape.
# NOTE(review): X_selected is reassigned from X a couple of cells further down.
X_selected = selector.transform(X)
X_selected.shape

In [None]:
# Column names retained by the selector (its support_ mask indexes X.columns).
selected_features = X.columns[selector.support_]
selected_features

In [None]:
# Rebuild X_selected as a DataFrame holding only the columns the selector kept
selected_columns = X.columns[selector.support_]
X_selected = X[selected_columns]
X_selected

## Combine selected features with DFT prediction

In [15]:
from joblib import dump, load
# Load the previously saved base model from disk with joblib.
base_model = load('outputs/models/homo_dft.joblib')

In [16]:
# Base-model predictions on the full feature matrix X, shaped as a single column (n, 1)
base_model_preds = base_model.predict(X).reshape(-1, 1)

In [17]:
# Prepend the base model's predictions as the first feature column.
# DataFrame.insert mutates in place and raises ValueError when the column is
# already present, so any existing DFT_pred is dropped first (this also yields
# a fresh frame), which keeps the cell safe to re-run.
X_selected = X_selected.drop(columns='DFT_pred', errors='ignore')
# Flatten the (n, 1) prediction array so it maps onto exactly one column.
X_selected.insert(0, 'DFT_pred', base_model_preds.ravel())

In [None]:
# Final feature list, in column order, now including the DFT_pred column
selected_features = X_selected.columns.tolist()
selected_features

In [None]:
# Check the container types of the feature matrix and the target.
type(X_selected), type(y)

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_validate
# Baseline: a default (seeded) random forest scored with repeated 10-fold CV, 5 repeats
cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=0)
rf = RandomForestRegressor(random_state=0)
cv_results = cross_validate(
    rf,
    X_selected,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    return_estimator=True,
)
# The scorer reports negated MAE; negate the mean to show the MAE itself
-np.mean(cv_results['test_score'])

In [None]:
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold

# One seeded repeated 10-fold splitter (5 repeats) shared by every trial, so
# all hyperparameter configurations are scored on the same splits.
cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=0)

def objective(trial):
    """Score one random-forest hyperparameter configuration for Optuna.

    Parameters
    ----------
    trial : optuna trial
        Supplies the sampled hyperparameter values through ``suggest_int``
        and ``suggest_float``.

    Returns
    -------
    float
        Mean absolute error of the forest on ``X_selected`` / ``y``, averaged
        over all folds of the module-level ``cv`` splitter (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 700),
        'max_depth': trial.suggest_int('max_depth', 2, 25),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_float('max_features', 0.3, 1.0)
        }

    rf = RandomForestRegressor(random_state=0, **params)
    # return_estimator is left at its default (False): the fitted forests were
    # never used here, and returning them from the n_jobs=-1 workers only costs
    # memory and transfer time on every trial.
    cv_results = cross_validate(
        rf,
        X_selected, y,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=-1)
    # Scores are negated MAE; flip the sign to get the quantity to minimise.
    return -np.mean(cv_results['test_score'])

In [None]:
# Seeded TPE sampler so the hyperparameter search is repeatable
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='minimize', sampler=tpe_sampler)
# Run as many trials as were loaded from the settings CSV
study.optimize(objective, n_trials=optimum_number_of_trials)

In [None]:
# Hyperparameters of the best trial found by the study.
study.best_params

In [None]:
# Objective value of the best trial, i.e. the mean cross-validated MAE returned by objective().
study.best_value

In [None]:
# Re-score the forest with the best hyperparameters under the same repeated 10-fold scheme
cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=0)
rf = RandomForestRegressor(random_state=0, **study.best_params)
cv_results = cross_validate(
    rf,
    X_selected,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    return_estimator=True,
)
# The scorer reports negated MAE; negate the mean to show the MAE itself
-np.mean(cv_results['test_score'])

In [26]:
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.ensemble import RandomForestRegressor
# Per-molecule absolute errors from out-of-fold predictions, collected over
# five differently seeded shuffled 10-fold splits.
# The tuned estimator is built explicitly here instead of relying on whichever
# `rf` an earlier cell left in the namespace (the name is rebound in several
# cells), so this cell gives the same result regardless of execution order.
rf = RandomForestRegressor(random_state=0, **study.best_params)
errors = []
for random_state in range(5):
    cv = KFold(n_splits=10, shuffle=True, random_state=random_state)
    preds = cross_val_predict(rf, X_selected, y, cv=cv)
    # preds follows the row order of X_selected, which matches y
    errors.append(np.abs(preds - y))

In [27]:
# Stack the per-split error vectors and average across splits (axis 0): one value per molecule
errors_per_molecule = np.asarray(errors).mean(axis=0)

In [None]:
# Build the per-molecule error table for the non-external-validation rows.
# Selecting with a single .loc and taking an explicit copy avoids assigning a
# new column onto a chained slice of target_df (SettingWithCopyWarning).
is_training_row = target_df['Type'] != 'External Validation'
result = target_df.loc[is_training_row, ['Type', 'Molecule', 'HOMO_UPS']].copy()
# errors_per_molecule is positional; it was computed on rows filtered the same way
result['Error'] = errors_per_molecule
result.sort_values(by='Error')

In [None]:
# Mean per-molecule error over the rows whose Type is 'Target Dataset'
is_target_dataset = result['Type'] == 'Target Dataset'
result.loc[is_target_dataset, 'Error'].mean()

In [None]:
# Mean per-molecule error over the rows whose Type is 'Online Dataset'
is_online_dataset = result['Type'] == 'Online Dataset'
result.loc[is_online_dataset, 'Error'].mean()

## Final model

In [None]:
# Fit the final model on all non-external-validation rows, using the best
# hyperparameters found by the study and the same fixed seed as before.
homo_ups_predictor = RandomForestRegressor(random_state=0, **study.best_params)
homo_ups_predictor.fit(X_selected, y)

In [None]:
from joblib import dump, load
# Persist the fitted predictor to disk with joblib.
dump(homo_ups_predictor, 'outputs/models/homo_ups_predictor.joblib')

In [None]:
# Pair each input feature name of the final model with its importance score
importance_columns = {
    "Feature Name": homo_ups_predictor.feature_names_in_,
    "Feature Importance": homo_ups_predictor.feature_importances_,
}
df = pd.DataFrame(importance_columns)

# Most important features first
df_sorted = df.sort_values(by="Feature Importance", ascending=False)

# Show the ranked table as the cell output
df_sorted