In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the raw descriptor table (comma-separated, which is the read_csv default)
source_df = pd.read_csv('outputs/source_descriptors.csv')
source_df.head()

In [None]:
source_df.info()

## 1. Find and drop columns with a constant value

In [None]:
# A column counts as constant when it has at most one distinct value.
# nunique() ignores NaN by default, so an all-NaN column yields 0 here.
unique_counts = source_df.nunique()
constant_columns = unique_counts[unique_counts <= 1].index.tolist()
len(constant_columns)

Except for 4 descriptors, the constant columns are all fragment (`fr_`) counts. For example, `fr_NH2` is the "Number of Primary amines".

http://rdkit.org/docs/source/rdkit.Chem.Fragments.html

The 4 descriptors that are not fragment counts are:

```
['NumRadicalElectrons',
 'SMR_VSA8',
 'SlogP_VSA9',
 'NumSaturatedCarbocycles']
 ```

In [None]:
# Remove the constant columns found above; they carry no information
source_df = source_df.drop(constant_columns, axis='columns')
source_df.info()

In [6]:
# Names of the columns that survived the constant-column filter
non_constant_columns = source_df.columns.tolist()

In [7]:
import pickle

# Persist the retained column names so the same selection can be reapplied later
with open('outputs/non_constant_columns.pkl', mode='wb') as pkl_file:
    pickle.dump(non_constant_columns, pkl_file)

## 2. Handling NaN values

In [8]:
# Treat +/-inf as missing so they are handled together with NaN below
source_df = source_df.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [None]:
# How many rows contain at least one NaN
source_df.isna().any(axis='columns').sum()

In [None]:
# How many columns contain at least one NaN
source_df.isna().any(axis='index').sum()

In [None]:
# Names of the columns that hold at least one NaN
has_nan = source_df.isna().any(axis='index')
nan_columns = has_nan[has_nan].index.tolist()
nan_columns

The NaN columns fall into two groups: columns related to `PartialCharge` and the `BCUT2D_x` columns.

For BCUT2D, here is the explanation (from the link below):

> You'll get NaN for any molecule which contains an atom that isn't parameterized for the Gasteiger charges. (e.g. [Na+])

https://github.com/rdkit/rdkit/discussions/5824


For the partial-charge columns, I couldn't find much, but there is a related issue here:

https://github.com/rdkit/rdkit/issues/5674


In [None]:
source_df.dropna(axis='index').hist(column=nan_columns[0:4])

In [None]:
source_df.dropna(axis='index').hist(column=nan_columns[4:8])

In [None]:
source_df.dropna(axis='index').hist(column=nan_columns[8:])

In [15]:
# Replace the remaining NaN values with an arbitrary sentinel.
# -5 was picked because it is smaller than the minimum value in the data.
source_df = source_df.fillna(value=-5)

## 3. Very large and small numbers

In [None]:
# Columns in which at least one value exceeds 1e6
source_df.columns[source_df.gt(1e6).any(axis='index')]

In [None]:
# Columns in which at least one value is positive but below 1e-6
source_df.columns[(source_df.gt(0) & source_df.lt(1e-6)).any(axis='index')]

In [None]:
np.max(source_df['Ipc'])

In [None]:
# Natural log
plt.hist(np.log(source_df['Ipc']))

In [None]:
np.log(np.max(source_df['Ipc']))

In [21]:
# Compress the huge range of Ipc with a natural log.
# NOTE: this overwrites Ipc, so re-running the cell applies the log a second time.
source_df = source_df.assign(Ipc=np.log(source_df['Ipc']))

## 4. Optimizing memory usage

In [None]:
source_df.info()

In [23]:
# Store floating-point columns as float32 instead of float64 to save memory
float64_cols = source_df.select_dtypes(include='float64').columns.tolist()
source_df = source_df.astype({col: 'float32' for col in float64_cols})

In [None]:
source_df.info()

In [25]:
# Names of the columns currently stored as int64
int64_cols = source_df.select_dtypes(include='int64').columns.tolist()

In [None]:
source_df[int64_cols].min().min(), source_df[int64_cols].max().max()

|Data type|Description|
|----|----|
|int8|Byte (-128 to 127)|
|int16|Integer (-32768 to 32767)|
|int32	|Integer (-2147483648 to 2147483647)|
|int64	|Integer (-9223372036854775808 to 9223372036854775807)|
|uint8	|Unsigned integer (0 to 255)|
|uint16	|Unsigned integer (0 to 65535)|
|uint32	|Unsigned integer (0 to 4294967295)|
|uint64	|Unsigned integer (0 to 18446744073709551615)|

In [27]:
# astype('int16') does not raise on overflow, it silently wraps around.
# Turn the manual min/max inspection above into a hard check so that
# Restart & Run All fails loudly if the data ever exceeds the int16 range.
int16_limits = np.iinfo(np.int16)
assert source_df[int64_cols].ge(int16_limits.min).all().all(), 'value below int16 range'
assert source_df[int64_cols].le(int16_limits.max).all().all(), 'value above int16 range'
source_df[int64_cols] = source_df[int64_cols].astype('int16')

In [None]:
source_df.info()

## 5. Putting it together

In [29]:
def descriptor_processor(df, non_constant_columns, fill_value=-5):
    """Apply the cleaning steps of this notebook to a descriptor table.

    Steps, in order:
      1. keep only ``non_constant_columns``;
      2. replace ``Ipc`` (if present) by its natural log;
      3. replace +/-inf and NaN by ``fill_value``;
      4. downcast float64 columns to float32;
      5. downcast int64 columns to int16 when every value fits.

    The log is taken *before* the missing-value handling so that any
    non-finite result (log of 0, of a negative number, or of NaN/inf) is
    replaced by ``fill_value`` instead of leaking NaN/-inf into the output.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw descriptor table. It is not modified.
    non_constant_columns : list
        Column labels to keep, in the desired output order.
    fill_value : scalar, default -5
        Sentinel written in place of NaN and +/-inf.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame with reduced-precision dtypes.
    """
    # Explicit copy: the caller's frame stays untouched and the column
    # assignment below does not trigger SettingWithCopyWarning.
    df = df[non_constant_columns].copy()

    if 'Ipc' in df.columns:
        # log(0) -> -inf and log(<0 or NaN) -> NaN are expected here and are
        # cleaned up by the replace/fillna step, so silence the warnings.
        with np.errstate(divide='ignore', invalid='ignore'):
            df['Ipc'] = np.log(df['Ipc'])

    df = df.replace([np.inf, -np.inf], np.nan).fillna(fill_value)

    float64_cols = list(df.select_dtypes(include='float64'))
    if float64_cols:
        df = df.astype({col: 'float32' for col in float64_cols})

    # astype('int16') wraps silently on overflow, so only downcast columns
    # whose values all lie inside the int16 range; others stay int64.
    int16_limits = np.iinfo(np.int16)
    int64_cols = list(df.select_dtypes(include='int64'))
    fits_int16 = [
        col for col in int64_cols
        if df[col].between(int16_limits.min, int16_limits.max).all()
    ]
    if fits_int16:
        df = df.astype({col: 'int16' for col in fits_int16})

    return df

In [None]:
# Reload the raw table so the full pipeline can be run from scratch
source_df = pd.read_csv('outputs/source_descriptors.csv')
source_df.head()

In [None]:
source_df.info()

In [None]:
source_df = descriptor_processor(source_df, non_constant_columns)
source_df.head()

In [None]:
source_df.info()

In [34]:
source_df.to_pickle('outputs/source_descriptors_processed.pkl')

In [35]:
del source_df