In [87]:
import numpy as np
import pandas as pd
from sklearn import (
    compose,
    linear_model,
    model_selection,
    pipeline,
    preprocessing,
)
from tqdm import tqdm

In [3]:
# Load the logP dataset; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer="logP.csv", index_col=0)
df

Unnamed: 0,smiles,logP,QED_score,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,COc1ccccc1[C@@H]1CC(=O)Nc2cc(C)c(C)cc21,4.17,0.909,12.053455,12.053455,0.045556,0.045556,0.909253,281.355,262.203,...,0,0,0,0,0,0,0,0,0,0
1,COc1ccc2c(c1)O[C@](O)(C(F)(F)F)CC2=O,2.79,0.839,12.616874,12.616874,0.009514,-5.052156,0.839542,262.183,253.111,...,0,0,0,0,0,0,0,0,0,0
2,C=C(c1ccc(N(C)C)cc1)c1ccc(N(C)C)cc1,5.30,0.828,4.217020,4.217020,1.056122,1.056122,0.828333,266.388,244.212,...,0,0,0,0,0,0,0,0,0,0
3,Cc1ccc([C@@H](O)C#CC(O)(c2ccccc2)c2ccccc2)o1,3.53,0.726,11.239605,11.239605,0.364327,-1.508792,0.726080,318.372,300.228,...,0,0,0,0,0,0,0,0,0,0
4,CC(C)(C)c1cc(Cc2ccccc2)n[nH]1,4.35,0.816,4.352931,4.352931,0.140811,0.140811,0.816043,214.312,196.168,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,Cc1cccc(NC(=O)C[C@@H]2S/C(=N\c3c(C)cccc3C)N(C)...,4.75,0.863,12.562081,12.562081,0.093167,-0.455663,0.863227,381.501,358.317,...,1,0,0,0,0,0,0,0,0,0
240,CCOc1ccccc1OCC(=O)Nc1ccc2ccccc2c1,4.73,0.737,12.114099,12.114099,0.072506,-0.211498,0.737878,321.376,302.224,...,0,0,0,0,0,0,0,0,0,0
241,OCC#CC(O)(c1ccc(F)cc1)c1ccc(F)cc1,3.61,0.824,12.974979,12.974979,0.339313,-1.732014,0.824192,274.266,262.170,...,0,0,0,0,0,0,0,0,0,0
242,CC(=O)Nc1ccc(/N=C/c2c([O-])oc3ccccc3c2=O)cc1,3.64,0.750,12.347197,12.347197,0.125336,-0.724707,0.750035,321.312,308.208,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Repeated K-fold splitter: 5 folds, repeated 5 times, with a fixed seed.
cv = model_selection.RepeatedKFold(
    n_splits=5,
    n_repeats=5,
    random_state=42,
)

In [26]:
# Target: the logP column.
y = df["logP"]
# Features: every column from position 3 onward, i.e. the first three
# columns of df are left out of the feature matrix.
X = df.iloc[:, 3:]
all_columns = X.columns

In [38]:
# Show the dtype of every feature column.
X.dtypes

MaxAbsEStateIndex    float64
MaxEStateIndex       float64
MinAbsEStateIndex    float64
MinEStateIndex       float64
qed                  float64
                      ...   
fr_thiazole            int64
fr_thiocyan            int64
fr_thiophene           int64
fr_unbrch_alkane       int64
fr_urea                int64
Length: 208, dtype: object

In [72]:
# Table pairing each feature column name with its dtype.
TYPES_ = pd.DataFrame(
    {
        "name": X.columns.tolist(),
        "type": X.dtypes.tolist(),
    }
)
# Rows of that table whose dtype is float64.
fl64 = TYPES_.loc[TYPES_["type"] == "float64"]


Unnamed: 0,name,type
0,MaxAbsEStateIndex,float64
1,MaxEStateIndex,float64
2,MinAbsEStateIndex,float64
3,MinEStateIndex,float64
4,qed,float64
...,...,...
101,VSA_EState7,float64
102,VSA_EState8,float64
103,VSA_EState9,float64
104,FractionCSP3,float64


In [None]:
# Names of the 50 float64 feature columns with the largest variance.
var_cols = (
    X.select_dtypes(include=np.float64)
    .var()
    .nlargest(50)
    .index
)

In [84]:
def test_model(model, n_col):
    """Return the mean cross-validated score of ``model`` with partial scaling.

    The ``n_col`` float64 columns of ``X`` with the largest variance are
    standardized; every other column is passed through unchanged. The column
    transformer and the model are combined into one pipeline, so the scaler
    is re-fitted on the training part of each CV split only. Fitting the
    scaler on the full ``X`` before cross-validation would leak statistics of
    the held-out rows into training.

    Relies on the notebook-level ``X``, ``y`` and ``cv``.

    Parameters
    ----------
    model : estimator
        Scikit-learn estimator that is handed to ``cross_val_score``.
    n_col : int
        Number of highest-variance float64 columns to standardize.

    Returns
    -------
    float
        Mean of the per-split scores (``scoring`` is left at the
        ``cross_val_score`` default).
    """
    # Select the n_col float64 columns with the largest variance.
    # NOTE: the ranking is computed on all rows of X; it never looks at y.
    var_col = list(X.select_dtypes(np.float64).var().nlargest(n_col).index)

    prep = compose.make_column_transformer(
        (preprocessing.StandardScaler(), var_col), remainder="passthrough"
    )

    # Preprocessing lives inside the estimator so it is fitted per CV split.
    estimator = pipeline.make_pipeline(prep, model)

    scores = model_selection.cross_val_score(estimator, X, y, cv=cv)

    return scores.mean()

In [85]:
# Linear models to compare, each with its default hyper-parameters.
models = [
    linear_model.Ridge(),
    linear_model.RidgeCV(),
    linear_model.Lasso(),
    linear_model.LassoCV(),
]

In [82]:
# Empty results table: one row per (model, number of standardized columns).
df_R = pd.DataFrame(
    columns=[
        "model",
        "n_stand",
        "R^2",
    ]
)
df_R

Unnamed: 0,model,n_stand,R^2


In [None]:
# Score every model with 0..9 standardized columns.
# The results are collected in a list and the table is built once, from
# scratch, so re-running this cell never appends duplicate rows and does not
# depend on an earlier cell having created an empty df_R.
records = []
for model in models:
    for n_stand in tqdm(range(10), desc=type(model).__name__):
        records.append(
            {
                "model": model,
                "n_stand": n_stand,
                "R^2": test_model(model, n_stand),
            }
        )

df_R = pd.DataFrame(records, columns=["model", "n_stand", "R^2"])

In [None]:
# Display the collected scores for every (model, n_stand) combination.
df_R