# Feature Selection and PCA Cumulative Variance Analysis

In [None]:
import pandas as pd
from sklearn import preprocessing, decomposition
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

## Import Data and Preprocess

In [None]:
# Load the raw table; the path is a placeholder to be replaced by the user.
data = pd.read_csv(r'input_csv_path')

# Take every column from the fifth one onward, compute their pairwise
# correlation matrix, and square each coefficient to obtain R2 values.
dataset = data.iloc[:, 4:].corr() ** 2
dataset

In [None]:
# Rank all rows by their R2 against 'IE' (the first column of `dataset`),
# highest first, and discard the 'apol' row from the ranking.
dataset_sort = (
    dataset.iloc[:, :1]
    .sort_values(by='IE', ascending=False)
    .drop(index='apol')
)

# Keep the first 21 labels of the ranking.
# NOTE(review): presumably 'IE' itself (R2 = 1) plus the 20 best descriptors - confirm.
highcorr_discriptors = list(dataset_sort.index[:21])
print(highcorr_discriptors)

# Square sub-matrix of R2 values restricted to the selected labels.
top_twenty = dataset.loc[highcorr_discriptors, highcorr_discriptors]
top_twenty

## Retain the 20 Descriptors with the Highest R2 and Plot

In [None]:
def plot_heatmap(data, vmin=0):
    """Show an annotated, square-celled heatmap of ``data`` on a new figure.

    A fresh 10x10-inch figure is created, the heatmap is drawn with the
    'OrRd' colormap and two-decimal cell annotations (no colorbar), and the
    figure is displayed immediately.

    Args:
        data: Rectangular dataset handed straight to ``seaborn.heatmap``.
        vmin: Value forwarded as the ``vmin`` argument of ``seaborn.heatmap``.
    """
    heatmap_options = {
        'cbar': False,
        'cmap': 'OrRd',
        'vmin': vmin,
        'square': True,
        'annot': True,
        'fmt': '.2f',
    }
    plt.figure(figsize=(10, 10))
    sns.heatmap(data, **heatmap_options)
    plt.tight_layout()
    plt.show()


# Render the R2 matrix of the selected labels with the colour scale anchored at 0.
plot_heatmap(top_twenty, vmin=0)

## Cumulative Variance Analysis by PCA

In [None]:
# Rank the descriptors by descending R2 against 'IE' and drop the 'apol' row.
dataset_sort = dataset.iloc[:,:1]
dataset_sort = dataset_sort.sort_values('IE', axis=0, ascending=False)
dataset_sort = dataset_sort.drop('apol')
# Skip the top-ranked row.
# NOTE(review): presumably this is 'IE' itself (R2 = 1 with itself) - confirm.
highcorr_discriptors = list(dataset_sort.index[1:])

# Raw descriptor values, in ranked column order, passed through sklearn's
# quantile transform before PCA.
dataset_descriptors = data[highcorr_discriptors]
dataset_descriptors = preprocessing.quantile_transform(dataset_descriptors)

# Fit PCA once with every available component. The running sum of
# explained_variance_ratio_ then gives the cumulative variance for
# 1, 2, ..., k components, so no refit per component count is needed.
# The previous loop fitted PCA(n_components=i) but recorded it as i+1
# components, so every reported count was one too high.
pca = decomposition.PCA()
pca.fit(dataset_descriptors)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# NOTE: the key 'number of descriptors' is kept for compatibility; the value
# stored under it is the number of principal components used.
pca_dict = {'number of descriptors':[],
            'cumulative variance':[]}

# Cumulative-variance levels to report, in increasing order.
thresholds = (0.90, 0.95, 0.99, 0.999)
n = 0  # index of the next threshold that has not been reported yet

for i, cumvar in enumerate(cumulative_variance):
    n_used = i + 1  # component count that `cumvar` really belongs to
    pca_dict['number of descriptors'].append(n_used)
    pca_dict['cumulative variance'].append(cumvar)

    # Always report the variance captured by the first 20 components.
    if n_used == 20:
        print(n_used)
        print(cumvar)

    # Report every threshold first reached at this component count. A while
    # loop is used so that a single step crossing several thresholds, or a
    # crossing that coincides with the 20-component report, is still
    # attributed to the correct count.
    while n < len(thresholds) and cumvar >= thresholds[n]:
        print(n_used)
        print(cumvar)
        n += 1

    # Stop once the last threshold has been reported.
    if n == len(thresholds):
        break

In [None]:
# Combined 16x16-inch figure made of three panels.
plt.figure(figsize=(16, 16))

# Top-left panel: annotated R2 heatmap of the selected labels.
plt.subplot(2, 2, 1)
ax = sns.heatmap(
    top_twenty,
    vmin=0,
    cmap='OrRd',
    cbar=False,
    square=True,
    annot=True,
    fmt='.2f',
)
plt.tight_layout()

# Top-right panel: for each descriptor-set size, count how many of the ranked
# descriptors fall into each family, decided by substrings of the name.
plt.subplot(2, 2, 2)

num_component = [20, 90, 160, 360, 640, 1138]
portion_dict = {}

for set_size in num_component:
    tally = {'mordred': 0, 'maccs': 0, 'pubchem': 0}
    # Ranked labels 1..set_size; the label at position 0 is skipped.
    for name in dataset_sort.index[1:set_size + 1]:
        if 'Pubchem' in name:
            tally['pubchem'] += 1
        elif 'MACCSFP' in name:
            tally['maccs'] += 1
        else:
            # Anything that is neither Pubchem nor MACCSFP is counted as Mordred.
            tally['mordred'] += 1
    portion_dict[str(set_size)] = [tally['mordred'], tally['maccs'], tally['pubchem']]

x = portion_dict.keys()
# Transpose the per-size counts into one list per family.
y1, y2, y3 = (list(column) for column in zip(*portion_dict.values()))

# Bar groups are centred every 17 units; the two outer bars sit 5 units
# either side of the middle one.
width = 5
bins2 = np.arange(0, 102, 17)
print(bins2)
bins1 = bins2 - 5
bins3 = bins2 + 5

plt.bar(bins1, y1, width=width, label='Mordred')
plt.bar(bins2, y2, width=width, tick_label=num_component, label='MACCS')
plt.bar(bins3, y3, width=width, label='PUBCHEM')

# Write each count just above its bar, centred on the bar position.
for centres, heights in ((bins1, y1), (bins2, y2), (bins3, y3)):
    for x_pos, height in zip(centres, heights):
        plt.text(x_pos, height + 2, height, ha='center', va='bottom')

plt.legend()
plt.xlabel('Number of Ascending Ranked Descriptors in the Set')
plt.ylabel('Number of Descriptors')
plt.tight_layout()

# Bottom panel (full width): one line per algorithm across the descriptor-set sizes.
plt.subplot(2, 1, 2)

# Results table; the path is a placeholder to be replaced by the user.
# The first eight rows are plotted: column 'Algorithm' supplies the legend
# label and the remaining columns (from the second onward) supply the y values.
df = pd.read_csv(r'input_csv_path')
x = [20, 90, 160, 360, 640, 1138]

# Dash-dotted vertical guide at every descriptor-set size.
for set_size in x:
    plt.vlines(set_size, 0, 1, colors='black', linewidth=1, linestyles='dashdot')

for row in range(8):
    scores = df.iloc[row, 1:]
    algorithm = df['Algorithm'][row]
    plt.plot(x, scores, label=algorithm, linewidth=2)

plt.xlim([10, 1145])
plt.ylim([0.39, 0.82])
plt.xlabel('Number of Descriptors')
plt.ylabel('RMSE')
plt.xticks(x)
plt.legend()
plt.tight_layout()

# Save before show() so the written file contains the rendered figure.
plt.savefig(r'output_png_path', dpi=328)
plt.show()
