**Requirements** 
* Following this [paper](https://arxiv.org/pdf/1904.01561.pdf), features are computed with the [descriptastorus](https://github.com/bp-kelley/descriptastorus) package
* Install via: `pip install git+https://github.com/bp-kelley/descriptastorus`
* Adapted from this notebook: https://github.com/theislab/chemCPA/blob/main/embeddings/rdkit/embedding_rdkit.ipynb

## General imports

In [None]:
import numpy as np
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
import pandas as pd
from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator
import torch
import os

## Load SMILES list

In [None]:
# Directory that holds the preprocessed input table; the output dictionary is written here too.
# NOTE: machine-specific absolute path — adjust to your local checkout before running.
dataset_path = "/Users/niklaskiermeyer/Desktop/Codespace/DruxAI/data/preprocessed/"
# CSV file read from dataset_path; it must provide the "smiles" and "DRUG" columns used below.
dataset_name = "auc_secondary_screen_prediction_targets.csv"

In [None]:
# Build the file path with os.path.join so it is valid whether or not
# dataset_path ends with a path separator (plain `+` concatenation is not).
smiles_df = pd.read_csv(os.path.join(dataset_path, dataset_name), index_col=0)
# Unique SMILES strings (first occurrence kept); one descriptor vector is computed per entry.
smiles_list = smiles_df["smiles"].drop_duplicates().values

In [None]:
# Preview the first rows of the loaded table as a quick sanity check.
smiles_df.head()

In [None]:
# Report how many distinct SMILES strings will be featurised.
n_smiles = len(smiles_list)
print(f"Number of smiles strings: {n_smiles}")

In [None]:
# Descriptor generator configured with the "RDKit2D" descriptor set.
generator = MakeGenerator(("RDKit2D",))
# List every column the generator reports, formatted as name(type).
for column_name, column_type in generator.GetColumns():
    print(f"{column_name}({column_type.__name__})")

In [None]:
# Number of joblib workers used to featurise the SMILES strings.
n_jobs = 16
# One delayed generator.process call per SMILES string; tqdm wraps the
# iterable that joblib consumes, so the bar advances as tasks are handed out.
tasks = (
    delayed(generator.process)(smiles)
    for smiles in tqdm(smiles_list, position=0, leave=True)
)
data = Parallel(n_jobs=n_jobs)(tasks)

In [None]:
# Stack the per-SMILES results into a single array (one row per entry of `data`).
# NOTE(review): assumes every generator.process call returned a vector of the
# same length — confirm what is returned for SMILES that fail to parse.
embedding = np.array(data)
# Display the array dimensions as a sanity check.
embedding.shape

## Check `nans` and `infs`

Check for `nans`

In [None]:
# Locate every NaN entry: row positions go to drug_idx, column positions to feature_idx.
nan_mask = np.isnan(embedding)
drug_idx, feature_idx = np.where(nan_mask)
print(f"drug_idx:\n {drug_idx}")
print(f"feature_idx:\n {feature_idx}")

Check for `infs` and add to idx lists

In [None]:
# Locate every +/-inf entry the same way the NaN entries were located.
inf_mask = np.isinf(embedding)
drug_idx_infs, feature_idx_infs = np.where(inf_mask)

# Append the inf positions so drug_idx / feature_idx cover all invalid entries.
drug_idx = np.concatenate([drug_idx, drug_idx_infs])
feature_idx = np.concatenate([feature_idx, feature_idx_infs])

Features that have these invalid values:

In [None]:
# Show the generator's column entries for each feature that holds an invalid value.
invalid_feature_idx = np.unique(feature_idx)
column_table = np.array(generator.GetColumns())
column_table[invalid_feature_idx]

Set values to `0`

In [None]:
# Display the invalid (NaN/inf) entries before they are overwritten.
embedding[drug_idx, feature_idx]

In [None]:
# Overwrite every entry at the collected (drug_idx, feature_idx) positions with 0, in place.
embedding[drug_idx, feature_idx] = 0

## Save

In [None]:
# One row per SMILES string, one generically named column per descriptor.
df = pd.DataFrame(
    data=embedding,
    index=smiles_list,
    columns=[f"latent_{i}" for i in range(embedding.shape[1])],
)

# Drop first feature from generator (RDKit2D_calculated)
df.drop(columns=["latent_0"], inplace=True)

# Drop (near-)constant columns, i.e. those with standard deviation <= threshold.
# The labels are taken directly from the std Series, so the selection does not
# depend on positional index arithmetic, and the same list is printed and dropped.
threshold = 0.01
column_std = df.std()
columns = column_std[column_std <= threshold].index.tolist()
print(f"Deleting columns with std<={threshold}: {columns}")
df.drop(columns=columns, inplace=True)

Check that the correct columns were deleted (the result should be an empty index array):

In [None]:
# Positions of any remaining columns with std <= threshold; empty if the drop removed them all.
np.where(df.std() <= threshold)

### Normalise dataframe

In [None]:
# Standardise each column: subtract its mean and divide by its standard deviation.
column_mean = df.mean()
column_std = df.std()
normalized_df = (df - column_mean) / column_std

In [None]:
# Move the SMILES strings out of the index into a regular column named "smiles".
normalized_df = normalized_df.reset_index().rename(columns={"index": "smiles"})
normalized_df.head()

In [None]:
# Attach the DRUG label to every feature row by joining on the SMILES string;
# the inner join keeps only rows whose SMILES appears in both tables.
drug_smiles_pairs = smiles_df[["DRUG", "smiles"]].drop_duplicates()
normalized_df = normalized_df.merge(drug_smiles_pairs, on="smiles", how="inner")

In [None]:
# Remove the "smiles" column, leaving the feature columns and the DRUG label.
normalized_df = normalized_df.drop(columns=["smiles"])
normalized_df

In [None]:
# Use the DRUG label as the row index so each row is addressable by drug.
normalized_df = normalized_df.set_index("DRUG")

In [None]:
# Display the final standardised feature table, indexed by DRUG.
normalized_df

Build a dictionary that maps each drug to its fingerprint vector

In [None]:
# Map each DRUG label to its row of features as a NumPy array.
# If a DRUG label occurs more than once, the last matching row wins.
drug_fingerprint_dict = {
    drug: np.array(row) for drug, row in normalized_df.iterrows()
}

In [None]:
# Display the drug -> feature-vector dictionary for a visual check.
drug_fingerprint_dict

In [None]:
# Serialise the dictionary next to the input data.
output_path = os.path.join(dataset_path, "drug_fingerprint_dict.pt")
torch.save(drug_fingerprint_dict, output_path)

In [None]:
# Reload the file that was just written to verify the save/load round trip.
# The dictionary values are NumPy arrays, which the restricted unpickler
# (weights_only=True, the default since PyTorch 2.6) refuses to load, so full
# unpickling is requested explicitly (the keyword requires PyTorch >= 1.13).
# This is acceptable only because the file was produced by this notebook;
# never load untrusted files with weights_only=False.
torch.load(os.path.join(dataset_path, "drug_fingerprint_dict.pt"), weights_only=False)