In [2]:
import pandas as pd
from rdkit import Chem
from collections import defaultdict
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


### Node Features Range Distribution

In [None]:
# Load dataset
df = pd.read_csv('PreprocessData/FrequentOdorExtraction/(sat)openpom_Top138.csv', encoding='ISO-8859-1')
smiles_list = df['SMILES'].dropna().tolist()

# Initialize feature distributions
feature_distribution = {
    'atomic_num': defaultdict(int),
    'degree': defaultdict(int),
    'formal_charge': defaultdict(int),
    'num_hs': defaultdict(int),
    'num_radical_electrons': defaultdict(int),
    'valence': defaultdict(int),
    'smallest_ring': defaultdict(int),
}

for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        continue

    for atom in mol.GetAtoms():
        feature_distribution['atomic_num'][atom.GetAtomicNum()] += 1
        feature_distribution['degree'][atom.GetDegree()] += 1
        feature_distribution['formal_charge'][atom.GetFormalCharge()] += 1
        feature_distribution['num_hs'][atom.GetTotalNumHs()] += 1
        feature_distribution['num_radical_electrons'][atom.GetNumRadicalElectrons()] += 1
        feature_distribution['valence'][atom.GetTotalValence()] += 1
        feature_distribution['smallest_ring'][atom.GetOwningMol().GetRingInfo().NumAtomRings(atom.GetIdx())] += 1

# Summary
summary = {}
for key, dist in feature_distribution.items():
    summary[key] = {
        "min": min(dist.keys()) if dist else None,
        "max": max(dist.keys()) if dist else None,
        "most_common": max(dist.items(), key=lambda x: x[1]) if dist else None,
        "total_unique_values": len(dist)
    }

# Print results
import pprint
pprint.pprint(summary)




{'atomic_num': {'max': 35,
                'min': 1,
                'most_common': (6, 37931),
                'total_unique_values': 13},
 'degree': {'max': 4,
            'min': 0,
            'most_common': (2, 23273),
            'total_unique_values': 5},
 'formal_charge': {'max': 2,
                   'min': -2,
                   'most_common': (0, 44974),
                   'total_unique_values': 5},
 'num_hs': {'max': 4,
            'min': 0,
            'most_common': (2, 12566),
            'total_unique_values': 5},
 'num_radical_electrons': {'max': 1,
                           'min': 0,
                           'most_common': (0, 45048),
                           'total_unique_values': 2},
 'smallest_ring': {'max': 4,
                   'min': 0,
                   'most_common': (0, 28903),
                   'total_unique_values': 5},
 'valence': {'max': 6,
             'min': 0,
             'most_common': (4, 37951),
             'total_unique_values': 7}}


In [3]:
# Initialize feature distributions
feature_distribution = {
    'atomic_num': defaultdict(int),
    'degree': defaultdict(int),
    'formal_charge': defaultdict(int),
    'num_hs': defaultdict(int),
    'num_radical_electrons': defaultdict(int),
    'valence': defaultdict(int),
    'smallest_ring': defaultdict(int),
}

for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        continue

    for atom in mol.GetAtoms():
        feature_distribution['atomic_num'][atom.GetAtomicNum()] += 1
        feature_distribution['degree'][atom.GetDegree()] += 1
        feature_distribution['formal_charge'][atom.GetFormalCharge()] += 1
        feature_distribution['num_hs'][atom.GetTotalNumHs()] += 1
        feature_distribution['num_radical_electrons'][atom.GetNumRadicalElectrons()] += 1
        feature_distribution['valence'][atom.GetTotalValence()] += 1
        feature_distribution['smallest_ring'][mol.GetRingInfo().NumAtomRings(atom.GetIdx())] += 1

expanded_data = {}
for feature, dist in feature_distribution.items():
    values = []
    for val, count in dist.items():
        values.extend([val] * count)
    expanded_data[feature] = values

# Align all lists to same length by padding with NaN
max_len = max(len(lst) for lst in expanded_data.values())
for feature in expanded_data:
    if len(expanded_data[feature]) < max_len:
        expanded_data[feature] += [None] * (max_len - len(expanded_data[feature]))

# Create DataFrame
df_expanded = pd.DataFrame(expanded_data)

# Plotting function
def plot_feature_boxplots(dataframe, save_path="feature_distributions.png"):
    num_features = dataframe.shape[1]
    num_cols = 4
    num_rows = int(np.ceil(num_features / num_cols))

    plt.figure(figsize=(4 * num_cols, 4 * num_rows))

    for i, col in enumerate(dataframe.columns):
        ax = plt.subplot(num_rows, num_cols, i + 1)
        sns.countplot(x=dataframe[col], ax=ax, color='skyblue')
        min_val = dataframe[col].min()
        max_val = dataframe[col].max()
        ax.set_title(f"{col}\nmin={min_val}, max={max_val}", fontsize=9)
        ax.set_xlabel("")
        ax.set_ylabel("")

    plt.suptitle("Node Feature Distributions", fontsize=16)
    plt.tight_layout(rect=[0, 0.03, 1, 0.97])
    plt.savefig(save_path, dpi=300)
    plt.close()

# Call the function
plot_feature_boxplots(df_expanded, save_path="atomic_feature_distributions.png")
print("Hist plot saved as 'atomic_feature_distributions.png'")



Hist plot saved as 'atomic_feature_distributions.png'
