In [8]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from abess import LinearRegression
from abess.decomposition import SparsePCA
from kennard_stone import train_test_split as ks_train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, RFE, RFECV
from sklearn.linear_model import ElasticNet, ElasticNetCV, Lasso, LassoCV
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
# from genetic_selection import GeneticSelectionCV

In [2]:
def extractDragonDescriptors(family_Y_df, descriptorFileString):
    """Load a tab-separated descriptor table and align its rows with ``family_Y_df``.

    The file's first column is consumed as the index while reading, but that
    index is then discarded: the rows are re-labelled, in file order, with
    ``family_Y_df.index``. The first remaining column is dropped, the literal
    string ``'na'`` is treated as missing, and every column that still contains
    a missing value is removed.

    Parameters
    ----------
    family_Y_df : pandas.DataFrame
        Target table. Only its index is used; it must have exactly as many
        rows as the descriptor file (pandas raises ``ValueError`` otherwise).
    descriptorFileString : str
        Path to the tab-separated descriptor file.

    Returns
    -------
    pandas.DataFrame
        Descriptor columns without missing values, indexed like ``family_Y_df``.
    """
    family_X = pd.read_csv(descriptorFileString, sep='\t', index_col=0, low_memory=False)
    # Skip the first column that remains after the index column was consumed.
    descriptor_cols = family_X.columns[1:]
    # Rebuilding from the raw array drops the file's own index so the rows can
    # take the target table's labels positionally.
    relabelled = pd.DataFrame(
        family_X.to_numpy(), index=family_Y_df.index, columns=family_X.columns
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    ).replace('na', np.nan)
    return relabelled[descriptor_cols].dropna(axis=1, how='any')

In [3]:
# Load the target table; the first CSV column becomes the row index.
global_Y = pd.read_csv('Data/exactPLS/global.csv', index_col=0)

In [4]:
# Read the descriptor file and re-label its rows with global_Y's index.
global_X = extractDragonDescriptors(global_Y, 'Data/exactPLS/desc_global.txt')

In [5]:
# Sanity check: targets and descriptors should have the same number of rows.
global_Y.shape, global_X.shape

((1147, 2), (1147, 1023))

In [7]:
# Ten largest absolute correlations with PCE across descriptors and targets
# (PCE itself is part of the frame, so it tops the list at 1.0).
(
    pd.concat([global_X, global_Y], axis=1)
    .corr()['PCE']
    .abs()
    .sort_values(ascending=False)
    .head(10)
)

PCE         1.000000
mintsC      0.474240
MaxtsC      0.432035
LOGPcons    0.405476
nCs         0.403580
C-002       0.403276
SssCH2      0.402945
nCsp3       0.397295
LOGP99      0.396008
H-046       0.395082
Name: PCE, dtype: float64

In [6]:
def returnNextRow(rowDf, to_drop):
    """Pick the column with the largest value in a one-row frame.

    Parameters
    ----------
    rowDf : pandas.DataFrame
        Frame whose first row holds the values to compare (one value per column).
    to_drop : iterable
        Column labels that must not be selected.

    Returns
    -------
    label or str
        Label of the first column holding the largest non-missing value that is
        greater than -200 and not listed in ``to_drop``; the string ``'None'``
        when no column qualifies.
    """
    best_value = -200  # floor kept from the original implementation
    nextRow = 'None'
    excluded = set(to_drop)
    # .iloc[0] is explicit positional access; ``series[0]`` on a label-indexed
    # Series relied on a deprecated positional fallback.
    for column, value in rowDf.iloc[0].items():
        # ``value != np.NaN`` is always True (and np.NaN no longer exists in
        # NumPy 2), so missing values are tested with pd.isna instead.
        if column in excluded or pd.isna(value):
            continue
        if value > best_value:  # strict '>' keeps the first column on ties
            best_value = value
            nextRow = column
    return nextRow

def getTo_dropOfRow(rowDf, to_drop=None, threshold=0.95):
    """Collect the columns of a one-row frame whose value exceeds ``threshold``.

    Parameters
    ----------
    rowDf : pandas.DataFrame
        Frame whose first row holds the values to test (one value per column).
    to_drop : list, optional
        Labels already marked for removal. When given, the list is extended
        in place and returned; when omitted, a new list is created for this
        call (the former ``[]`` default was shared between calls and leaked
        labels from one call into the next).
    threshold : float, default 0.95
        Columns with a value strictly greater than this are added.

    Returns
    -------
    list
        ``to_drop`` with the newly found labels appended, without duplicates
        of labels it already contained.
    """
    if to_drop is None:
        to_drop = []
    seen = set(to_drop)
    # .iloc[0] is explicit positional access to the single row.
    for column, value in rowDf.iloc[0].items():
        # Missing values never qualify; pd.notna replaces the always-true
        # ``!= np.NaN`` comparison.
        if pd.notna(value) and value > threshold and column not in seen:
            to_drop.append(column)
            seen.add(column)
    return to_drop

def vWSPFeatureSelect(rowName, corr_matrix, to_drop=None, thresh=0.95):
    """Walk the correlation matrix row by row and collect columns to drop.

    Starting at ``rowName``, every column whose value in the current row
    exceeds ``thresh`` is marked for removal; the walk then moves to the row
    named after the best remaining column of the current row, and stops when
    no column qualifies (``returnNextRow`` signals this with the string
    ``'None'``).

    Parameters
    ----------
    rowName : label or str
        Row to start from; the string ``'None'`` returns ``to_drop`` unchanged.
    corr_matrix : pandas.DataFrame
        Square matrix indexed and labelled by feature name.
    to_drop : list, optional
        Labels already marked for removal. A supplied list is extended in
        place; when omitted a fresh list is used for each call.
    thresh : float, default 0.95
        Threshold applied to every visited row.

    Returns
    -------
    list
        Labels of the columns to drop.
    """
    if to_drop is None:
        to_drop = []
    # Iterative rather than recursive: the walk can visit one row per feature,
    # which would otherwise approach Python's default recursion limit on wide
    # descriptor tables.
    while rowName != 'None':
        current_row = corr_matrix.loc[[rowName]]
        # ``thresh`` is forwarded on every step; the recursive version fell
        # back to a hard-coded 0.95 after the first row.
        to_drop = getTo_dropOfRow(current_row, to_drop, threshold=thresh)
        rowName = returnNextRow(current_row, to_drop)
    return to_drop
    
def vWSP(X_train, threshold=0.95, seed='MW'):
    """Remove highly correlated features, walking the matrix from ``seed``.

    The absolute correlation matrix is reduced to its strict upper triangle,
    so each row only holds values for columns positioned after it. As a
    consequence, columns positioned before ``seed`` are never compared and
    are always kept.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature table.
    threshold : float, default 0.95
        Absolute correlation above which a column is dropped.
    seed : label, default 'MW'
        Column at which the walk starts.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X_train`` without the dropped columns.

    Raises
    ------
    KeyError
        If ``seed`` is not a column of the correlation matrix.
    """
    cor_matrix = X_train.corr().abs()
    if seed not in cor_matrix.columns:
        raise KeyError(f"seed column {seed!r} is not in the correlation matrix")
    # k=1 masks the diagonal and everything below it, so each pair of
    # features is inspected only once.
    upper_mask = np.triu(np.ones(cor_matrix.shape), k=1).astype(bool)
    upper_tri = cor_matrix.where(upper_mask)
    to_drop = vWSPFeatureSelect(seed, corr_matrix=upper_tri, to_drop=[], thresh=threshold)
    return X_train.drop(columns=to_drop)

In [10]:
# Hold out 20% of the samples as a test set, using only the PCE column as target.
# NOTE(review): kennard_stone's split presumably picks samples with the
# Kennard-Stone algorithm rather than at random - confirm in the library docs.
global_X_train, global_X_test, global_Y_train, global_Y_test = ks_train_test_split(global_X, global_Y['PCE'], test_size=0.2)

In [None]:
# Recursive feature elimination with 5-fold cross-validation, scored by r2.
# Five features are removed per iteration, never going below 600.
# random_state pins the forest's sampling so a re-run produces the same
# feature ranking.
rfeRF = RandomForestRegressor(random_state=0)

min_features_to_select = 600

rfeCV = RFECV(
    estimator=rfeRF,
    step=5,
    cv=5,
    scoring="r2",
    min_features_to_select=min_features_to_select,
)

rfeCV.fit(global_X_train, global_Y_train)

In [None]:
# Report the selected feature count and plot the mean CV score against the
# number of features kept at each elimination step.
print("Optimal number of features : %d" % rfeCV.n_features_)

# cv_results_ replaces the removed grid_scores_ attribute; the scores are
# ordered from the smallest to the largest feature subset.
cv_results = rfeCV.cv_results_
mean_scores = cv_results["mean_test_score"]

if "n_features" in cv_results:
    # Newer scikit-learn releases report the subset sizes directly.
    n_features_tested = cv_results["n_features"]
else:
    # Rebuild the subset sizes: start from all features, remove ``step`` per
    # iteration and finish exactly at the minimum (assumes an integer step).
    n_features_tested = sorted(
        set(range(rfeCV.n_features_in_, min_features_to_select, -rfeCV.step))
        | {min_features_to_select}
    )

fig, ax = plt.subplots()
ax.set_xlabel("Number of features selected")
ax.set_ylabel("Cross validation score (r2)")
ax.plot(n_features_tested, mean_scores)
plt.show()

In [None]:
# Exhaustive 5-fold grid search over random-forest hyper-parameters.
# random_state pins the forest's sampling so scores are comparable between
# parameter combinations and between re-runs.
gridSearchRF = RandomForestRegressor(random_state=0)

param_grid = {
    "n_estimators": [100, 300, 500, 1000],
    # 1.0 (use every feature) replaces "auto", which meant the same thing for
    # regressors and is no longer accepted by current scikit-learn releases.
    "max_features": [1.0, "sqrt", "log2"],
    "min_samples_leaf": [1, 2, 3, 4],
}

grid = GridSearchCV(gridSearchRF, param_grid, cv=5)

grid.fit(global_X_train, global_Y_train)

grid.best_score_, grid.best_params_