In [4]:
import numpy as np
from genetic_selection import GeneticSelectionCV
from model_training import get_X_Y
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import global_config as cfg

# --- Load the descriptor matrix and the target columns -----------------------
csv_path = cfg.ECCF_csvfilepath
X, blood_y, brain_y, ratio_y, SMILES = get_X_Y(csv_path)

# Target used for this feature-selection run.
y = brain_y

# Configuration lookups.
# NOTE(review): `params` and `param_name` are assigned here but never passed to
# the estimator below, which is built with library defaults. `param_name` says
# 'blood_params' while the target is `brain_y` -- confirm which is intended.
model_type = cfg.model_type
param_name = 'blood_params'
params = cfg.model_params.get(model_type)
model = XGBRegressor()

# --- Genetic-algorithm feature selection -------------------------------------
# All search options are gathered in one mapping so they can be read and
# tweaked in a single place before the selector is constructed.
# NOTE(review): no random seed is set before the search, so repeated runs may
# pick different feature subsets -- confirm whether that is acceptable.
ga_settings = dict(
    cv=5,
    verbose=1,
    scoring="r2",
    max_features=50,
    n_population=200,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=30,
    crossover_independent_proba=0.5,
    mutation_independent_proba=0.05,
    tournament_size=3,
    n_gen_no_change=10,
    caching=True,
    n_jobs=-1,
)
selector = GeneticSelectionCV(model, **ga_settings)

# Run the search on the full data set; the value returned by fit() is kept
# under the same name for the cells that follow.
selector = selector.fit(X, y)



Selecting features with genetic algorithm.
gen	nevals	avg                                  	std                                  	min                                     	max                                     
0  	200   	[-324.823424   27.43      465.540921]	[ 240.771087   14.576869  357.300702]	[-1326.365348     1.           0.304503]	[    0.037119    50.        2199.565428]
1  	130   	[-2458.68573     68.23      2537.738288]	[ 4124.236787    82.272639  4083.889307]	[-10000.            1.            0.304503]	[     0.037119    253.        10000.      ]
2  	130   	[-2702.112631    70.065     2778.504049]	[ 4272.99743     82.203837  4231.976359]	[-10000.            1.            0.231152]	[     0.266112    246.        10000.      ]
3  	112   	[-2031.445326    56.185     2102.140324]	[ 3861.436858    72.874967  3829.50486 ]	[-10000.            1.            0.231152]	[     0.266112    259.        10000.      ]
4  	124   	[-2209.721225    62.34      2273.370467]	[ 4019.233195    78.5206

In [5]:
# Report the outcome of the genetic search: the boolean column mask and the
# number of columns it keeps.
print(selector.support_)
print(selector.n_features_)

# Positions of the selected columns, as a list of plain Python ints.
support_mask = np.asarray(selector.support_)
fea = np.flatnonzero(support_mask).tolist()
print(fea)

# str() on a large NumPy array abbreviates it with "..." (as the mask printed
# by this cell shows), which would store an unrecoverable mask in the file.
# Render every element on one line instead, so the file keeps its two-line
# layout: full mask first, selected indices second.
full_mask_text = np.array2string(
    support_mask,
    threshold=support_mask.size + 1,            # never summarise
    max_line_width=8 * support_mask.size + 16,  # wide enough to avoid wrapping
)

# The context manager closes the file even if one of the writes fails.
with open('./ga_output.txt', 'w') as out_file:
    out_file.write(full_mask_text)
    out_file.write("\n")
    out_file.write(str(fea))

[False False False ... False False False]
42
[98, 205, 342, 356, 562, 572, 679, 693, 727, 926, 1078, 1082, 1190, 1209, 1216, 1222, 1265, 1380, 1400, 1583, 1943, 1986, 2104, 2138, 2209, 2287, 2543, 2667, 2691, 2751, 2995, 3167, 3331, 3403, 3405, 3435, 3472, 3515, 3666, 3671, 3680, 3772]


In [6]:
# Keep only the columns chosen by the genetic search.
X_s = X.iloc[:, selector.support_]

# Hold out 20% of the rows for validation. The seed is fixed so that the
# metrics printed below are repeatable from one run to the next; without it
# every execution evaluates on a different random split.
X_train, X_val, y_train, y_val = train_test_split(
    X_s, y, test_size=0.2, random_state=42
)

# NOTE(review): the selector was fitted on all rows of X, including the rows
# held out here, so this validation score is likely optimistic. Splitting
# before feature selection would give an unbiased estimate.
model.fit(X_train, y_train)
pred = model.predict(X_val)

# Validation metrics for the model trained on the reduced feature set.
rmse = np.sqrt(mean_squared_error(y_val, pred))
r2 = r2_score(y_val, pred)
print("RMSE: ", rmse)
print("R2: ", r2)


RMSE:  6.9786333062967465
R2:  0.10518506135125039
