
# Feature Selection

This notebook performs feature selection on the preprocessed molecular descriptors using Logistic Regression with L1 regularization.

### Steps:
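1. Load preprocessed descriptor data from `Descriptor_preprocessing_results.csv` and split it into features and the `y_label` target.
2. Fit an L1-regularized logistic regression 100 times, recording each time which features receive a non-zero coefficient.
3. Summarize how often each feature was selected.
4. Identify the features selected in all 100 iterations and save them, together with the corresponding data columns.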

---
### Inputs and Outputs:
- **Input**:
  - `Descriptor_preprocessing_results.csv` (Preprocessed descriptors)
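- **Outputs**:
  - `00_feature_selection_df.csv` (Frequency of feature selection for all features)
  - `Final_selected_descriptor_list.txt` (Features selected in all 100 iterations, one name per line)
  - `Final_feature_selection_data_with_ylabel.csv` (Data restricted to those features, plus the `y_label` column)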


In [None]:

# Step 1: Import Libraries and Load Data
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

# Read the preprocessed descriptor table produced by the previous notebook
pre_data_df = pd.read_csv('Descriptor_preprocessing_results.csv')

# Split the table: every column except 'y_label' is a feature, 'y_label' is the target
x_data = pre_data_df.drop('y_label', axis=1)
y_data = pre_data_df['y_label']

# Report the size of the full table and the number of target values
print(
    f"Loaded data shape: {pre_data_df.shape}",
    f"Target variable shape: {len(y_data)}",
    sep="\n",
)


In [None]:

# Step 2: Perform Feature Selection
# Repeatedly fit an L1-penalised logistic regression and count, for every feature,
# how many of the fits gave it a non-zero coefficient.

# Number of model fits. Step 4 filters on a literal 100, so keep the two in sync
# if this value is ever changed.
N_ITERATIONS = 100

# Base seed for the solver. The liblinear solver uses a random number generator
# (controlled by `random_state`); without a seed the counts below can differ
# between runs of the notebook. Each iteration gets its own seed so the fits are
# still allowed to differ from one another, but the overall result is reproducible.
BASE_SEED = 42

# NOTE(review): every iteration fits the same x_data / y_data, so the solver's
# random state is the only source of variation between fits. If stability
# selection over resampled data was intended, resample inside the loop.

# Dictionary to store the count of how many times each feature is selected
feature_selection_counts = {feature: 0 for feature in x_data.columns}

for iteration in range(N_ITERATIONS):
    # Create a new logistic regression model with L1 penalty and a per-iteration seed
    logreg = LogisticRegression(
        penalty='l1',
        solver='liblinear',
        random_state=BASE_SEED + iteration,
    )

    # Train the model on the full data set
    logreg.fit(x_data, y_data)

    # A feature counts as "selected" when its coefficient in the first (only, for a
    # binary target) coefficient row is non-zero
    selected_features = x_data.columns[logreg.coef_[0] != 0]
    for feature in selected_features:
        feature_selection_counts[feature] += 1

print(f"Feature selection completed over {N_ITERATIONS} iterations.")


In [None]:

# Step 3: Analyze Feature Selection Results
# Turn the per-feature counts into a one-column DataFrame indexed by feature name,
# ordered from the most to the least frequently selected feature.
feature_selection_df = pd.DataFrame.from_dict(
    feature_selection_counts, orient='index', columns=['Selection Count']
).sort_values(by='Selection Count', ascending=False)

# Persist the per-feature selection frequencies. The notebook's "Inputs and Outputs"
# section lists this file as an output, but it was never written before.
feature_selection_df.to_csv('00_feature_selection_df.csv', index_label='Feature')
print("Feature selection counts saved to 00_feature_selection_df.csv")

# Summarize the frequency of feature selection: for each distinct selection count,
# how many descriptors were selected exactly that many times.
summary_df = feature_selection_df['Selection Count'].value_counts().reset_index()
# Column names are set explicitly so they do not depend on what reset_index() produces
summary_df.columns = ['Selection Count', 'Number of Descriptors']
summary_df = summary_df.sort_values(by='Selection Count', ascending=True)

print("Feature selection summary:")
print(summary_df)


In [None]:

# Step 4: Extract Most Frequently Selected Features
# Keep only the rows whose selection count equals 100, i.e. features that were
# selected in every iteration, and collect their names (the index) as a list.
Final_feature_selection_df = feature_selection_df.loc[
    feature_selection_df['Selection Count'].eq(100)
]
Final_features = Final_feature_selection_df.index.to_list()

# Report how many features were retained, followed by their names
print(
    f"Number of features selected in all 100 iterations: {len(Final_features)}",
    "Most frequently selected features:",
    Final_features,
    sep="\n",
)


In [None]:

# Step 5: Save Final Features and Extract Relevant Data

# Write the retained feature names to a plain-text file, one name per line
with open('Final_selected_descriptor_list.txt', 'w') as f:
    f.writelines(f"{feature}\n" for feature in Final_features)
print("Final features saved to Final_selected_descriptor_list.txt")

# Build a new table holding only the retained feature columns, with the target
# appended as the 'y_label' column
Final_feature_selection_data = pre_data_df[Final_features].assign(y_label=y_data)

# Export the reduced table without the row index
Final_feature_selection_data.to_csv('Final_feature_selection_data_with_ylabel.csv', index=False)
print("Final feature selection data saved to Final_feature_selection_data_with_ylabel.csv")
