In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the two pre-computed descriptor tables from the local `outputs/` folder.
# Later cells read the 'Type' and 'Molecule' columns of target_df, so that
# table must carry them alongside the descriptor columns.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project's own pipeline.
source_df = pd.read_pickle('outputs/source_descriptors_processed.pkl')
target_df = pd.read_pickle('outputs/target_descriptors_calculated_n_processed.pkl')

In [3]:
# Load the descriptor-column selection that is applied to both tables below.
# (`pickle` is imported in the notebook's first cell with the other imports.)
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project's own pipeline.
with open('outputs/non_constant_columns.pkl', 'rb') as f:
    non_constant_columns = pickle.load(f)

In [4]:
# Apply the same selection to both tables so they can be concatenated and
# projected together in the PCA cell.
# NOTE(review): presumably `non_constant_columns` is a list of column labels
# (df[list] selects columns) — confirm against the notebook that writes the pickle.
source_descriptors_df = source_df[non_constant_columns]
target_descriptors_df = target_df[non_constant_columns]

In [None]:
# Sanity check: print the column/dtype/non-null summary of the source descriptors.
source_descriptors_df.info()

In [None]:
# Sanity check: print the column/dtype/non-null summary of the target descriptors.
target_descriptors_df.info()

In [7]:
# Drop the name of the full source table; only source_descriptors_df is used
# from here on. target_df is kept because later cells still read its 'Type'
# and 'Molecule' columns.
del source_df

## PCA

In [8]:
# --- Configuration -----------------------------------------------------------
# The first `source1_size` rows of the source table form the first source
# subset (labelled 'HOPV' in the plot cell); the remaining rows form the
# second one (labelled 'HCEP').
source1_size = 233
# Subsets with more rows than MAX_ROWS_PER_CSV are randomly down-sampled to
# SAMPLE_ROWS rows before being written, to keep the CSV files manageable.
MAX_ROWS_PER_CSV = 1_000_000
SAMPLE_ROWS = 200_000
# Fixed seed so that the down-sampling (and any randomized SVD solver picked
# by PCA) gives the same result on every Restart & Run All.
RANDOM_SEED = 0

# --- Split the data into the five subsets ------------------------------------
# The row masks are computed once and reused for both the descriptor rows and
# the molecule labels, which keeps the two aligned.
online_mask = target_df['Type'] == 'Online Dataset'
target_mask = target_df['Type'] == 'Target Dataset'
validation_mask = target_df['Type'] == 'External Validation'

df1 = source_descriptors_df.iloc[:source1_size]
df2 = source_descriptors_df.iloc[source1_size:]
df3 = target_descriptors_df[online_mask]
df4 = target_descriptors_df[target_mask]
df5 = target_descriptors_df[validation_mask]

subsets = [df1, df2, df3, df4, df5]
combined_df = pd.concat(subsets)

# --- Joint standardisation + 2-component PCA ---------------------------------
# Scaler and PCA are fitted on all subsets together so that every subset is
# projected into the same 2-D space.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(combined_df)

pca = PCA(n_components=2, random_state=RANDOM_SEED)
principal_components = pca.fit_transform(scaled_data)

# --- Cut the projection back into per-subset blocks --------------------------
# Rows of `principal_components` follow the concatenation order, so walking a
# running offset over the subset lengths recovers each subset's block.
pc_blocks = []
start = 0
for subset in subsets:
    stop = start + len(subset)
    pc_blocks.append(principal_components[start:stop])
    start = stop
pc_df1, pc_df2, pc_df3, pc_df4, pc_df5 = pc_blocks

# --- Export one CSV per subset -----------------------------------------------
# Only the three target subsets carry a 'Molecule' label column.
molecule_lists = [
    None,
    None,
    target_df.loc[online_mask, 'Molecule'].tolist(),
    target_df.loc[target_mask, 'Molecule'].tolist(),
    target_df.loc[validation_mask, 'Molecule'].tolist(),
]
pc_columns = ["Principal Component 1", "Principal Component 2"]

for i, (pc_block, molecules) in enumerate(zip(pc_blocks, molecule_lists), start=1):
    df_instance = pd.DataFrame(pc_block, columns=pc_columns)
    # Attach the labels BEFORE any down-sampling: assigning the full label
    # list to an already-sampled frame would fail with a length mismatch.
    if molecules is not None:
        df_instance["Molecule"] = molecules
    if len(df_instance) > MAX_ROWS_PER_CSV:
        df_instance = df_instance.sample(n=SAMPLE_ROWS, random_state=RANDOM_SEED)
    df_instance.to_csv(f'outputs/EDA_source{i}.csv', sep=';', index=False)


In [None]:
# Plot the 2-D PCA projection of the source subsets and the target dataset.
# A single figure is created at 300 dpi; calling plt.figure() and then
# plt.subplots() would leave an extra, empty figure behind.
figure, axis = plt.subplots(dpi=300)

# zorder controls stacking: the second source subset (HCEP) is drawn first so
# the HOPV and target points stay visible on top of it.
axis.scatter(pc_df2[:, 0], pc_df2[:, 1], label='HCEP', alpha=0.5, edgecolors='none', zorder=0, color='#8f0f06')
axis.scatter(pc_df1[:, 0], pc_df1[:, 1], label='HOPV', alpha=0.7, edgecolors='none', zorder=1, color='#06868f')
# Optional layers (currently disabled):
# axis.scatter(pc_df3[:, 0], pc_df3[:, 1], color='k', label=f'Online n={len(df3)}', alpha=0.7, s=60, edgecolors='none', zorder=2, marker='*')
axis.scatter(pc_df4[:, 0], pc_df4[:, 1], label='Target Dataset', alpha=0.7, edgecolors='none', zorder=3, marker='D', color='#53068f')
# axis.scatter(pc_df5[:, 0], pc_df5[:, 1], label=f'Validation n={len(df5)}', alpha=0.8, edgecolors='none', zorder=4, marker='x', color='#418f06')
axis.legend(loc='upper left', frameon=False)

# Panel label just outside the top-left corner of the axes. Axes-fraction
# coordinates (-0.11, 1.02) are the same spot as xmin - 0.11 * x-range and
# ymax + 0.02 * y-range, but stay put if the data limits change later.
axis.text(-0.11, 1.02, 'a)', transform=axis.transAxes, fontsize=15)

axis.set_xlabel('Principal Component 1', fontsize=18)
axis.set_ylabel('Principal Component 2', fontsize=18)

# Inward-facing ticks on all four sides, with a heavier frame.
axis.minorticks_on()
axis.tick_params('both', which='major', top=True, right=True, direction='in', width=2, length=4, labelsize=15)
axis.tick_params('both', which='minor', top=True, right=True, direction='in', width=1, length=3)
for spine in axis.spines.values():
    spine.set_linewidth(2)

# Save through the figure handle so the file never depends on which figure
# pyplot currently considers "active".
figure.savefig("EDA.png", dpi=300)
plt.show()