
# Descriptor Preprocessing

This notebook performs preprocessing on calculated molecular descriptors, including:
1. Handling missing values by imputing with the mean.
2. Removing low-variance features.
3. Removing highly correlated features.

The final preprocessed data is saved for further use in QSAR modeling.

---
### Inputs and Outputs:
- **Inputs**:
  - `posi_descriptor.csv` (Positive descriptors with label `1`)
  - `Nega_descriptor.csv` (Negative descriptors with label `0`; the name is case-sensitive on some file systems)
- **Output**:
  - `Descriptor_preprocessing_results.csv` (Preprocessed descriptors plus the `y_label` column)


In [None]:

# Step 1: Import Libraries and Load Data
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Read the descriptor tables for both classes.
# The file names are relative, so they are resolved against the working directory.
po_data, ne_data = (
    pd.read_csv(csv_name)
    for csv_name in ('posi_descriptor.csv', 'Nega_descriptor.csv')
)

# Report the dimensions (rows, columns) of each table
print(f"Positive data shape: {po_data.shape}")
print(f"Negative data shape: {ne_data.shape}")


In [None]:

# Step 2: Impute Missing Values and Add Labels
imp = SimpleImputer(missing_values=np.nan, strategy='mean')


def _impute_and_label(frame, label):
    """Mean-impute one class's descriptor table and attach its class label.

    Parameters
    ----------
    frame : pandas.DataFrame
        Descriptor table of a single class (must be numeric for the 'mean'
        strategy).
    label : int
        Value written to the new 'y_label' column.

    Returns
    -------
    pandas.DataFrame
        Imputed descriptors plus a 'y_label' column, on a fresh 0..n-1 index.
    """
    values = imp.fit_transform(frame)
    # SimpleImputer discards columns whose mean is undefined (every value
    # missing), so label the result with the surviving columns only; using
    # frame.columns directly would raise a shape mismatch in that case.
    kept_columns = frame.columns[~np.isnan(imp.statistics_)]
    imputed = pd.DataFrame(values, columns=kept_columns)
    imputed['y_label'] = label
    return imputed


# NOTE(review): each class is imputed with its own column means, so the filled
# values depend on the label. Confirm this is intended; imputing on the
# combined table would avoid leaking the label into the features.
df_posi = _impute_and_label(po_data, 1)
df_nega = _impute_and_label(ne_data, 0)

# Concatenate positive and negative datasets.
# join='inner' keeps only descriptors present in both tables, so a column that
# was discarded for one class cannot reintroduce NaNs; ignore_index=True gives
# every row a unique label instead of repeating 0..n-1 per class.
df_concat = pd.concat([df_posi, df_nega], join='inner', ignore_index=True)

# Separate features and labels
x_data = df_concat.drop(columns=['y_label'])
y_data = df_concat['y_label']

print(f"Combined data shape: {x_data.shape}")


In [None]:

# Step 3: Remove Low-Variance Features
from sklearn.feature_selection import VarianceThreshold

# Drop descriptors whose variance does not exceed 0.01.
# NOTE(review): x_data is passed in without any scaling in this notebook, so
# the threshold is relative to each descriptor's own units.
sel = VarianceThreshold(threshold=0.01)
featuredDataSet = sel.fit_transform(x_data)

# Create DataFrame for selected features.
# get_support() returns a boolean mask over the input columns; indexing the
# column Index with it yields the retained names in their original order.
X_selected_df = pd.DataFrame(featuredDataSet, columns=x_data.columns[sel.get_support()])

print("============================================")
print("After Removing Low-Variance Features")
print("============================================")
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
X_selected_df.info()


In [None]:

# Step 4: Remove Highly Correlated Features
# Absolute pairwise correlation between the remaining descriptors
corr_table = X_selected_df.corr().abs()

# Select upper triangle of correlation matrix (k=1 excludes the diagonal), so
# each pair is examined once and a column is only compared with the columns
# that precede it. The builtin bool is used because the np.bool alias was
# removed from NumPy.
upper = corr_table.where(np.triu(np.ones(corr_table.shape, dtype=bool), k=1))

# Find columns with correlation > 0.95 to any earlier column.
# Masked (NaN) cells compare as False, so they never trigger a drop.
to_drop = upper.columns[(upper > 0.95).any()].tolist()

# Drop highly correlated features
X_corr_df = X_selected_df.drop(columns=to_drop)

print("============================================")
print("After Removing Highly Correlated Features")
print("============================================")
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
X_corr_df.info()


In [None]:

# Step 5: Save Final Preprocessed Data
# Put labels and features on the same fresh 0..n-1 index so that the
# column-wise concatenation below pairs them row by row.
y_data = pd.Series(y_data, name='y_label').reset_index(drop=True)
X_corr_df = X_corr_df.reset_index(drop=True)

# Append the 'y_label' column to the right of the retained descriptors
final_data_with_label = pd.concat((X_corr_df, y_data), axis='columns')

# Write the labelled table to disk without the row index
final_data_with_label.to_csv('Descriptor_preprocessing_results.csv', index=False)
print("Preprocessed data with labels saved to Descriptor_preprocessing_results.csv")
