In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

PaDEL_Descriptors = pd.read_csv(r"/Users/sambelfield/Desktop/PaDEL_Descriptors.csv", nrows=56)
PaDEL_Descriptors.shape

(55, 943)

In [4]:
num_colums = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_columns = list(PaDEL_Descriptors.select_dtypes(include=num_colums).columns)
PaDEL_Descriptors = PaDEL_Descriptors[numerical_columns]

In [5]:
PaDEL_Descriptors.shape

(55, 943)

In [6]:
train_features, test_features, train_labels, test_labels = train_test_split(
    PaDEL_Descriptors.drop(labels=['EC50', 'ID'], axis=1),
    PaDEL_Descriptors['EC50'],
    test_size=0.2,
    random_state=41)

In [7]:
correlated_features = set()
correlation_matrix = PaDEL_Descriptors.corr()

In [8]:
for i in range(len(correlation_matrix .columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.9:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

In [9]:
len(correlated_features)

638

In [10]:
print(correlated_features)

{'ETA_EtaP_F_L', 'SpMax2_Bhp', 'ATS8m', 'maxsOH', 'AATSC2v', 'MPC3', 'ATS3p', 'MATS8c', 'MATS2c', 'SpMin6_Bhs', 'AATS3p', 'MATS7s', 'GATS8p', 'TopoPSA', 'SpMin4_Bhv', 'EE_Dzs', 'CrippenMR', 'MPC8', 'nAtom', 'SP-7', 'minHAvin', 'SpMin6_Bhp', 'fragC', 'ETA_EtaP_F', 'maxssNH', 'SIC4', 'SpMin4_Bhi', 'maxaaN', 'MLFER_BH', 'SpAbs_Dze', 'TIC1', 'SpMax6_Bhe', 'VE2_Dze', 'VR1_Dzv', 'GATS6s', 'GGI1', 'VE3_Dt', 'maxdO', 'VE2_Dt', 'SpMin5_Bhv', 'ETA_dEpsilon_A', 'SpMAD_Dzp', 'SpMax1_Bhv', 'ATSC0e', 'minssssC', 'WTPT-1', 'VR3_Dzm', 'ATSC0s', 'SpMin1_Bhe', 'SHssNH', 'AATSC5s', 'SpMin3_Bhs', 'AATSC0m', 'SM1_Dze', 'BIC3', 'GATS3e', 'nBondsS2', 'SM1_DzZ', 'ATS5e', 'AATS2i', 'SpMax6_Bhp', 'maxdsCH', 'Kier2', 'SpMin2_Bhs', 'CIC5', 'WTPT-4', 'SpMax_DzZ', 'VR1_Dzi', 'AATS2m', 'TPC', 'SpAD_Dzv', 'GATS3s', 'SpMax3_Bhi', 'TpiPC', 'GATS3v', 'EE_Dzi', 'ZMIC1', 'GATS7v', 'minsOH', 'MATS3v', 'minsNH2', 'SPC-5', 'SHaaCH', 'minaaN', 'IC0', 'VR2_D', 'ATS5m', 'ATS1i', 'AATS8p', 'ETA_dBetaP', 'SpAD_Dzp', 'SP-2', 'VE2_

In [11]:
train_features.drop(labels=correlated_features, axis=1, inplace=True)
test_features.drop(labels=correlated_features, axis=1, inplace=True)

In [12]:
train_features.head

<bound method NDFrame.head of      ALogP    ALogp2      AMR       apol  naAromAtom  nN  nO        ATS0m  \
30  0.4403  0.193864  42.5492  33.288309           6   6   0  3745.464715   
27 -0.7882  0.621259  44.3064  36.571067           6   5   1  2698.894672   
41 -1.4122  1.994309  46.4098  49.620688          12   6   6  5776.782618   
39  0.1498  0.022440  35.6038  36.124274           6   2   1  2397.818703   
29  1.4663  2.150036  58.1454  31.517102           0   4   1  3236.933661   
7   0.5519  0.304594  11.9095  18.635965           6   0   1  2527.599668   
31  1.2384  1.533635  49.5812  38.669067           6   5   0  3470.770271   
5   0.4677  0.218743  10.1839  19.159965           6   1   3  1979.029219   
48 -0.7590  0.576081  28.2174  36.634309          11   3   3  3984.449913   
43 -2.0388  4.156705  57.1077  44.760309           6   5   6  5433.274256   
18  0.0695  0.004830  10.7478  19.847551           6   1   0  1357.421465   
47 -1.6864  2.843945  15.3637  33.171930      

In [13]:
test_features.head

<bound method NDFrame.head of      ALogP    ALogp2      AMR       apol  naAromAtom  nN  nO        ATS0m  \
40 -0.4231  0.179014  21.3360  26.843516           6   2   1  1958.929956   
9  -0.6527  0.426017   8.8585  17.924758           6   0   2  1527.881233   
10  0.4496  0.202140  11.7960  20.216344           6   0   1  1418.209481   
54 -1.1990  1.437601  34.9926  43.448274          12   4   3  3590.675045   
37  0.3212  0.103169  26.9516  28.356723           6   2   1  3214.616392   
50 -1.1527  1.328717  21.4659  34.685137          12   4   2  5033.052084   
15  0.1685  0.028392  22.5312  13.609551           0   1   1  1036.332982   
14 -0.6274  0.393631  13.3252   7.422379           0   1   1   743.740484   
36  0.1671  0.027922  28.5707  29.158723           6   2   2  3470.584393   
8   0.6964  0.484973  14.8930  19.505965           6   0   1  7655.546384   
33  0.2570  0.066049  45.2760  35.575481           6   5   0  3324.474022   

        AATS0m      AATS1m  ...      JGI4    

In [14]:
frames = [test_features, train_features]

joined_features = pd.concat(frames)

In [15]:
joined_features.head

<bound method NDFrame.head of      ALogP    ALogp2      AMR       apol  naAromAtom  nN  nO        ATS0m  \
40 -0.4231  0.179014  21.3360  26.843516           6   2   1  1958.929956   
9  -0.6527  0.426017   8.8585  17.924758           6   0   2  1527.881233   
10  0.4496  0.202140  11.7960  20.216344           6   0   1  1418.209481   
54 -1.1990  1.437601  34.9926  43.448274          12   4   3  3590.675045   
37  0.3212  0.103169  26.9516  28.356723           6   2   1  3214.616392   
50 -1.1527  1.328717  21.4659  34.685137          12   4   2  5033.052084   
15  0.1685  0.028392  22.5312  13.609551           0   1   1  1036.332982   
14 -0.6274  0.393631  13.3252   7.422379           0   1   1   743.740484   
36  0.1671  0.027922  28.5707  29.158723           6   2   2  3470.584393   
8   0.6964  0.484973  14.8930  19.505965           6   0   1  7655.546384   
33  0.2570  0.066049  45.2760  35.575481           6   5   0  3324.474022   
30  0.4403  0.193864  42.5492  33.288309      

In [20]:
joined_features.to_csv('/Users/sambelfield/Desktop/PaDEL_Descriptors_Collinear2.csv', header=True, index=True)