
# Descriptor Calculation for Positive and Negative Data

This notebook calculates molecular descriptors for both the **Positive** and **Negative** compound datasets using the PaDEL-Descriptor tool (through the `padelpy` wrapper), cleans the resulting tables, and saves the processed results for further QSAR modeling.

### Steps:
1. Calculate 2D descriptors using the PaDEL-Descriptor tool.
2. Remove columns with a high number of missing values.
3. Remove rows with a high number of missing values.
4. Save the cleaned data.

---
### Inputs and Outputs:
- **Positive Input File**: `PTP1B_positive_compounds_BindingDB(200).sdf`
- **Negative Input File**: `PTP1B_negative_compounds_BindingDB(218).sdf`
- **Raw Outputs**:
  - `posi_des_raw.csv` (raw Positive descriptors)
  - `nega_des_raw.csv` (raw Negative descriptors)
- **Processed Outputs**:
  - `posi_descriptor.csv` (cleaned Positive descriptors)
  - `nega_descriptor.csv` (cleaned Negative descriptors)


In [None]:

# Step 1: Import Libraries
from padelpy import padeldescriptor
import pandas as pd


In [None]:

# Step 2: Define File Paths
# Positive dataset: SDF input, raw descriptor CSV, cleaned descriptor CSV.
positive_input_sdf = "PTP1B_positive_compounds_BindingDB(200).sdf"
positive_descriptor_csv = "posi_des_raw.csv"
positive_processed_csv = "posi_descriptor.csv"

# Negative dataset: the same three files.
negative_input_sdf = "PTP1B_negative_compounds_BindingDB(218).sdf"
negative_descriptor_csv = "nega_des_raw.csv"
negative_processed_csv = "nega_descriptor.csv"


In [None]:

# Step 3: Calculate Descriptors for Positive Data
# Runs PaDEL-Descriptor on the positive SDF file and writes the raw
# descriptor table to `positive_descriptor_csv`.
padeldescriptor(
    # --- input / output ---
    mol_dir=positive_input_sdf,
    d_file=positive_descriptor_csv,
    # --- descriptor families: 2D enabled, 3D disabled ---
    d_3d=False,
    d_2d=True,
    # --- structure pre-processing flags ---
    standardizenitro=True,
    removesalt=True,
    detectaromaticity=True,
    # --- molecule naming and row order in the output ---
    retainorder=True,
    usefilenameasmolname=True,
    # --- execution settings ---
    headless=True,
    maxruntime=10000,  # per padelpy docs: per-molecule limit, in milliseconds
    threads=-1,  # per padelpy docs: -1 uses as many threads as CPUs
)
print("Positive dataset descriptor calculation completed!")


In [None]:

# Step 4: Calculate Descriptors for Negative Data
# Runs PaDEL-Descriptor on the negative SDF file and writes the raw
# descriptor table to `negative_descriptor_csv`.
padeldescriptor(
    # --- input / output ---
    mol_dir=negative_input_sdf,
    d_file=negative_descriptor_csv,
    # --- descriptor families: 2D enabled, 3D disabled ---
    d_3d=False,
    d_2d=True,
    # --- structure pre-processing flags ---
    standardizenitro=True,
    removesalt=True,
    detectaromaticity=True,
    # --- molecule naming and row order in the output ---
    retainorder=True,
    usefilenameasmolname=True,
    # --- execution settings ---
    headless=True,
    maxruntime=10000,  # per padelpy docs: per-molecule limit, in milliseconds
    threads=-1,  # per padelpy docs: -1 uses as many threads as CPUs
)
print("Negative dataset descriptor calculation completed!")


In [None]:

# Step 5: Load Positive Data and Check Missing Values
# Read the raw descriptor table back in and report its dimensions.
positive_data = pd.read_csv(filepath_or_buffer=positive_descriptor_csv)
print("Positive Data Shape:", positive_data.shape)
# Per-column NaN counts are kept in a variable; only their grand total is printed.
positive_nan_count = positive_data.isnull().sum(axis=0)
print("Positive Data Total Missing Values:", positive_nan_count.sum())


In [None]:

# Step 6: Remove Columns with Many Missing Values for Positive Data
# A column is discarded once it holds 10 or more NaN entries.
positive_cols_with_nan = positive_data.columns[positive_data.isnull().sum(axis=0).ge(10)]
positive_cleaned_cols = positive_data.drop(labels=positive_cols_with_nan, axis=1)
print("Shape After Column Cleaning for Positive Data:", positive_cleaned_cols.shape)


In [None]:

# Step 7: Remove Rows with Many Missing Values for Positive Data
# Rows with 700 or more NaN values are collected first, then removed by index label.
positive_rows_with_nan = positive_cleaned_cols.loc[positive_cleaned_cols.isnull().sum(axis=1).ge(700)]
positive_cleaned_data = positive_cleaned_cols.drop(index=positive_rows_with_nan.index)
print("Shape After Row Cleaning for Positive Data:", positive_cleaned_data.shape)


In [None]:

# Step 8: Save Cleaned Positive Data
# Drop the 'Name' column when it exists (a missing column is tolerated),
# then write the table without the DataFrame index.
positive_final_data = positive_cleaned_data.drop(columns=['Name'], errors='ignore')
positive_final_data.to_csv(path_or_buf=positive_processed_csv, index=False)
print(f"Processed Positive data saved to {positive_processed_csv}")


In [None]:

# Step 5-8: Process Negative Data
# Load the raw negative descriptor table and report its size and NaN total.
negative_data = pd.read_csv(filepath_or_buffer=negative_descriptor_csv)
print("Negative Data Shape:", negative_data.shape)
negative_nan_count = negative_data.isnull().sum(axis=0)
print("Negative Data Total Missing Values:", negative_nan_count.sum())

# Column filter: discard every column holding 40 or more NaN entries.
# NOTE(review): the positive dataset is filtered at >= 10; confirm the
# different cut-off is intentional, since the two cleaned files may end up
# with different column sets.
negative_cols_with_nan = negative_data.columns[negative_nan_count.ge(40)]
negative_cleaned_cols = negative_data.drop(labels=negative_cols_with_nan, axis=1)
print("Shape After Column Cleaning for Negative Data:", negative_cleaned_cols.shape)

# Row filter: discard every row with 700 or more NaN values.
negative_rows_with_nan = negative_cleaned_cols.loc[negative_cleaned_cols.isnull().sum(axis=1).ge(700)]
negative_cleaned_data = negative_cleaned_cols.drop(index=negative_rows_with_nan.index)
print("Shape After Row Cleaning for Negative Data:", negative_cleaned_data.shape)

# Save: drop the 'Name' column when it exists, then write without the index.
negative_final_data = negative_cleaned_data.drop(columns=['Name'], errors='ignore')
negative_final_data.to_csv(path_or_buf=negative_processed_csv, index=False)
print(f"Processed Negative data saved to {negative_processed_csv}")
