In [3]:
# Load in the raw data
import pandas as pd

# Data directory path (not referenced by the cells visible here).
DATASETS_PATH = '../../data/data'

# Read the extended CSV: the first row supplies the column names and the
# listed tokens are parsed as missing values.
df = pd.read_csv(
    '../../data/csvs/hack_data_extended.csv',
    header=0,
    na_values=('NA', 'nan', 'NaN'),
)

# Keep at most the first 12 rows for each distinct PDSC value, renumber the
# rows from zero, and leave out the 'C' column.
filtered_df = (
    df.groupby('PDSC')
    .head(12)
    .reset_index(drop=True)
    .drop(columns=['C'])
)

In [None]:
import os
from qsprpred.data import QSPRDataset
from qsprpred.data.descriptors.fingerprints import MorganFP, RDKitFP
from qsprpred.data.sampling.splits import ScaffoldSplit
from qsprpred.tasks import TargetTasks

# Directory passed to QSPRDataset as its store_dir; created up front if absent.
DATA_PATH_QSAR = "data/data/qsar"
os.makedirs(DATA_PATH_QSAR, exist_ok=True)

# The single modelling target: regression on the PDSC column.
target_props = [{"name": "PDSC", "task": TargetTasks.REGRESSION}]

# Build the data set from a copy of the filtered frame.
dataset = QSPRDataset(
    name="PDSC_pred",
    df=filtered_df.copy(),
    target_props=target_props,
    store_dir=DATA_PATH_QSAR,
)

# Scaffold-based split with a requested test fraction of 0.15.
split = ScaffoldSplit(test_fraction=0.15)

# Two fingerprint calculators, each configured for 2048 bits.
feature_calculators = [
    MorganFP(radius=3, nBits=2048),
    RDKitFP(nBits=2048),
]
dataset.prepareDataset(split=split, feature_calculators=feature_calculators)

# Report the sizes of both parts and the test share relative to the full frame.
n_train = len(dataset.y)
n_test = len(dataset.y_ind)
print(f"Number of samples train set: {n_train}")
print(f"Number of samples test set: {n_test}, {n_test / len(dataset.df) * 100}%")

Number of samples train set: 197
Number of samples test set: 35, 15.086206896551724%


In [None]:
# A small example of how to optimize hyperparameters for a model
from qsprpred.models import CrossValAssessor
from qsprpred.models.scikit_learn import SklearnModel
from qsprpred.models.hyperparam_optimization import GridSearchOptimization
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

# Grid of candidate values handed to the grid search below.
params = dict(
    n_estimators=[50, 100, 200],          # number of trees (better not to exceed 200 on a small data set)
    max_depth=[5, 10, 20, 35],            # tree depth (None, i.e. grow until fully split, is not part of this grid)
    min_samples_split=[2, 5, 10],         # minimum number of samples required to split a node
    min_samples_leaf=[1, 2, 4],           # minimum number of samples in a leaf
    max_features=['sqrt', 'log2', None],  # feature-selection strategy
    bootstrap=[True, False],
    n_jobs=[20],                          # single value, so it is fixed rather than searched
)

# The model name doubles as the last component of its base directory.
model_id = "A2AR_ForestRegressor_hack_data_extended_with_submits"
model = SklearnModel(
    name=model_id,
    base_dir='../../models/' + model_id,
    alg=RandomForestRegressor,
)

# Candidates are assessed by cross-validation scored with negative MSE.
cv_assessor = CrossValAssessor(scoring='neg_mean_squared_error')
gridsearcher = GridSearchOptimization(param_grid=params, model_assessor=cv_assessor)
best_params = gridsearcher.optimize(model, dataset)

# Last expression of the cell: display the selected parameter combination.
best_params

{'bootstrap': False,
 'max_depth': 35,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100,
 'n_jobs': 20}

In [19]:
# A small example of how to assess a model
from qsprpred.models.assessment.methods import CrossValAssessor, TestSetAssessor

# Run the cross-validation assessment first, then the test-set assessment,
# both scored with negative mean squared error; the values the assessors
# return are not kept.
for assessor_cls in (CrossValAssessor, TestSetAssessor):
    assessor_cls(scoring='neg_mean_squared_error')(model, dataset)

# Final fit on the data set; binding the result to `_` keeps the cell from
# echoing it.
_ = model.fitDataset(dataset)