In [None]:
#(1)To process a dataset of chemical compounds, convert SMILES strings to molecular representations, generate an SDF file, and handle invalid SMILES
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import SDWriter
from tqdm import tqdm

# Define input and output folder paths
# Both point at 'data': the CSV is read from, and every output of this cell is
# written to, the same folder.
input_folder = 'data'
output_folder = 'data'

# Create the output folder if it does not exist
os.makedirs(output_folder, exist_ok=True)

# Input CSV file path
input_file = os.path.join(input_folder, 'datasets.csv')

# Import data, keeping only the SMILES and ID columns
data = pd.read_csv(input_file, usecols=['SMILES', 'ID'])

# List to store invalid SMILES with their corresponding IDs and error messages.
# smiles_to_mol() appends (ID, SMILES, reason) tuples to it as a side effect.
invalid_smiles = []

# Function to convert SMILES to molecular objects and handle invalid cases
def smiles_to_mol(smiles, mol_id):
    """
    Parse one SMILES string into an RDKit molecule.

    Every rejected input is reported on stdout and appended to the
    module-level ``invalid_smiles`` list as an ``(ID, SMILES, reason)`` tuple.

    Parameters:
        smiles: SMILES text for the compound (may be missing/NaN).
        mol_id: Identifier of the compound, used only for reporting.

    Returns:
        The molecule returned by ``Chem.MolFromSmiles``, or None when the
        value is missing, cannot be parsed, or raises during parsing.
    """
    try:
        if not pd.isna(smiles):
            parsed = Chem.MolFromSmiles(smiles)
            if parsed is None:
                # MolFromSmiles signals a parse failure by returning None
                print(f"Invalid SMILES: {smiles} for ID: {mol_id}")
                invalid_smiles.append((mol_id, smiles, "Invalid SMILES"))
            return parsed

        # Missing value in the SMILES column
        print(f"Empty SMILES for ID: {mol_id}")
        invalid_smiles.append((mol_id, smiles, "Empty SMILES"))
        return None
    except Exception as err:
        # Anything unexpected is recorded as a failure for this row only, so
        # one bad row does not abort the whole conversion
        print(f"Error processing SMILES {smiles} for ID: {mol_id}: {err}")
        invalid_smiles.append((mol_id, smiles, f"Error: {err}"))
        return None

# Convert SMILES to molecular objects and record invalid SMILES.
# A plain comprehension is used instead of DataFrame.apply(axis=1) so the
# assignment also works when the table has no rows.
data['ROMol'] = [
    smiles_to_mol(smiles, mol_id)
    for smiles, mol_id in zip(data['SMILES'], data['ID'])
]

# Remove rows with invalid or empty molecules
data = data[data['ROMol'].notnull()]

# Define the output SDF file path
output_sdf = os.path.join(output_folder, 'datasets.sdf')
writer = SDWriter(output_sdf)

# List to store mapping between SDF IDs and original IDs
relationship_data = []

# Write molecules to the SDF file and record ID relationships
try:
    for mol, original_id in tqdm(zip(data['ROMol'], data['ID']),
                                 total=len(data), desc="Writing SDF file"):
        if mol is None:
            continue

        mol_id = str(original_id)
        # RDKit treats properties whose name starts with '_' as private and
        # SDWriter does not write them, so "_ID" alone never reaches the file.
        # "_Name" is the special property used for the record's title line and
        # "ID" is written as a regular SD data field.
        mol.SetProp("_Name", mol_id)
        mol.SetProp("ID", mol_id)
        # Kept so in-memory code that reads "_ID" from data['ROMol'] still works
        mol.SetProp("_ID", mol_id)
        writer.write(mol)

        # Add the SDF ID and original ID to the relationship table
        relationship_data.append({'SDF_ID': mol_id, 'Original_ID': original_id})
finally:
    # Close (and flush) the SDF file writer even if a write fails part-way
    writer.close()

# Save the ID relationship table to a CSV file.
# It lists exactly the molecules written to the SDF, in SDF order.
relationship_df = pd.DataFrame(relationship_data, columns=['SDF_ID', 'Original_ID'])
relationship_file = os.path.join(output_folder, 'ID_Relationship.csv')
relationship_df.to_csv(relationship_file, index=False)

# Save invalid SMILES information to a CSV file
invalid_smiles_df = pd.DataFrame(invalid_smiles, columns=['ID', 'SMILES', 'Error'])
invalid_smiles_file = os.path.join(output_folder, 'Invalid_SMILES.csv')
invalid_smiles_df.to_csv(invalid_smiles_file, index=False)

# Print completion messages
print(f"SDF file generated at {output_sdf}")
print(f"ID relationship saved to {relationship_file}")
print(f"Invalid SMILES details saved to {invalid_smiles_file}")


In [None]:
#(2)Processing an SDF file to generate molecular descriptors and fingerprints using PaDEL-Descriptor, while handling large datasets efficiently by splitting them into smaller chunks.
import os
import pandas as pd
from padelpy import padeldescriptor
from concurrent.futures import ThreadPoolExecutor

# Define output folder paths
output_folder = 'data'
input_folder = 'data'
input_sdf = os.path.join(input_folder, 'datasets.sdf')  # SDF file to be split and featurized
temp_folder = os.path.join(output_folder, 'temp')  # Holds the per-chunk CSVs until they are merged
chunk_size = 10000  # Number of compounds per chunk

# Function to split a large SDF file into smaller chunks
def split_sdf(input_sdf, output_folder, chunk_size):
    """
    Split an SDF file into files of at most ``chunk_size`` records each.

    Chunks are written to ``output_folder`` as ``chunk_0.sdf``,
    ``chunk_1.sdf``, ... in input order. Every record in a chunk is
    terminated by a ``$$$$`` line.

    Parameters:
        input_sdf (str): Path of the SDF file to split.
        output_folder (str): Folder that receives the chunk files; created
            if it does not exist.
        chunk_size (int): Maximum number of records per chunk (>= 1).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    os.makedirs(output_folder, exist_ok=True)

    with open(input_sdf, 'r') as f:
        raw_records = f.read().split('$$$$\n')

    # A file that ends with '$$$$\n' leaves an empty string after the last
    # delimiter. Keeping it would append a bogus empty record to the final
    # chunk (or create a chunk holding nothing but '$$$$').
    records = [record for record in raw_records if record.strip()]

    for start in range(0, len(records), chunk_size):
        chunk_file = os.path.join(output_folder, f'chunk_{start // chunk_size}.sdf')
        with open(chunk_file, 'w') as chunk_f:
            for record in records[start:start + chunk_size]:
                # An unterminated last record may lack its final newline
                if not record.endswith('\n'):
                    record += '\n'
                chunk_f.write(record + '$$$$\n')

# Function to generate descriptors and fingerprints for a single chunk
def generate_descriptor_and_fingerprint(chunk_file, chunk_folder, temp_folder):
    """
    Run PaDEL-Descriptor twice on one SDF chunk.

    The first run writes 2D descriptors to ``temp_desc_<chunk_file>.csv`` and
    the second writes fingerprints to ``temp_fp_<chunk_file>.csv``, both
    inside ``temp_folder``.

    Parameters:
        chunk_file (str): File name of the chunk inside ``chunk_folder``.
        chunk_folder (str): Folder containing the chunk.
        temp_folder (str): Folder receiving the two CSV files.
    """
    # Options shared by both runs: same input, keep molecule order, one thread
    shared_options = {
        'mol_dir': os.path.join(chunk_folder, chunk_file),
        'retainorder': True,
        'threads': 1,
    }
    descriptor_csv = os.path.join(temp_folder, f'temp_desc_{chunk_file}.csv')
    fingerprint_csv = os.path.join(temp_folder, f'temp_fp_{chunk_file}.csv')

    # Pass 1: 2D descriptors only (3D explicitly switched off)
    padeldescriptor(d_file=descriptor_csv, d_2d=True, d_3d=False, **shared_options)
    # Pass 2: fingerprints
    padeldescriptor(d_file=fingerprint_csv, fingerprints=True, **shared_options)

# Function to process all chunks and generate descriptors and fingerprints
def generate_descriptors_and_fingerprints(chunk_folder, temp_folder, max_workers=4):
    """
    Generate descriptors and fingerprints for every ``*.sdf`` chunk in a folder.

    Each chunk is handed to ``generate_descriptor_and_fingerprint`` on a
    thread pool.

    Parameters:
        chunk_folder (str): Folder containing the SDF chunks.
        temp_folder (str): Folder receiving the per-chunk CSV files; created
            if it does not exist.
        max_workers (int): Number of chunks processed concurrently.

    Raises:
        Exception: Whatever a chunk's processing raised. Failures are
            re-raised here instead of being silently discarded.
    """
    os.makedirs(temp_folder, exist_ok=True)

    # Sorted so runs are reproducible; os.listdir order is arbitrary
    chunk_files = sorted(f for f in os.listdir(chunk_folder) if f.endswith('.sdf'))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(generate_descriptor_and_fingerprint,
                            chunk_file, chunk_folder, temp_folder)
            for chunk_file in chunk_files
        ]
        # result() re-raises an exception from the worker. Without consuming
        # the results a failed chunk would go unnoticed and its rows would
        # simply be missing from the merged output.
        for future in futures:
            future.result()

# Function to merge all temporary descriptor and fingerprint files into final outputs
def merge_temp_files(temp_folder, output_descriptors, output_fingerprints):
    """
    Concatenate the per-chunk CSV files into the two final feature tables.

    Files named ``temp_desc_*`` are merged into ``output_descriptors`` and
    files named ``temp_fp_*`` into ``output_fingerprints``. Chunks are merged
    in numeric chunk order (chunk_2 before chunk_10) so the rows keep the
    order of the original SDF file.

    Parameters:
        temp_folder (str): Folder containing the per-chunk CSV files.
        output_descriptors (str): Path of the merged descriptor CSV.
        output_fingerprints (str): Path of the merged fingerprint CSV.

    Raises:
        ValueError: If no descriptor or no fingerprint files are found.
    """
    import re

    def _natural_key(path):
        # Split the name into text and number runs so numbers compare as ints
        name = os.path.basename(path)
        return [int(part) if part.isdigit() else part for part in re.split(r'(\d+)', name)]

    def _collect(prefix):
        # os.listdir returns names in arbitrary order, so sort explicitly
        matches = [os.path.join(temp_folder, f) for f in os.listdir(temp_folder) if f.startswith(prefix)]
        if not matches:
            raise ValueError(f"No files starting with '{prefix}' found in {temp_folder}")
        return sorted(matches, key=_natural_key)

    temp_descriptor_files = _collect('temp_desc_')
    temp_fingerprint_files = _collect('temp_fp_')

    all_descriptors = pd.concat([pd.read_csv(f) for f in temp_descriptor_files], ignore_index=True)
    all_fingerprints = pd.concat([pd.read_csv(f) for f in temp_fingerprint_files], ignore_index=True)

    all_descriptors.to_csv(output_descriptors, index=False)
    all_fingerprints.to_csv(output_fingerprints, index=False)

# Function to replace the first column of a CSV file with custom values
def replace_first_column_in_chunks(input_file, replacement, chunk_size=10000):
    """
    Overwrite the values of a CSV file's first column, reading it in chunks.

    Row ``i`` of the file receives ``replacement[i]``; the column header and
    all other columns are left untouched. The file is rewritten in place only
    after every chunk has been processed successfully.

    Parameters:
        input_file (str): Path of the CSV file to modify.
        replacement: Sliceable sequence with one value per data row. Values
            beyond the number of rows in the file are ignored.
        chunk_size (int): Number of rows read per chunk.

    Raises:
        ValueError: If ``replacement`` has fewer values than the file has rows.
    """
    pieces = []
    offset = 0
    # The context manager closes the file handle even if a chunk fails
    with pd.read_csv(input_file, chunksize=chunk_size) as reader:
        for chunk in reader:
            stop = offset + len(chunk)
            values = list(replacement[offset:stop])
            if len(values) != len(chunk):
                raise ValueError(
                    f"replacement has too few values for {input_file}: "
                    f"rows {offset}-{stop - 1} need {len(chunk)} values, got {len(values)}"
                )
            # Replace the whole column rather than writing through iloc:
            # iloc[:, 0] = ... sets in place and depends on the new values
            # fitting the column's existing dtype (e.g. text into an int column).
            chunk[chunk.columns[0]] = values
            pieces.append(chunk)
            offset = stop

    if not pieces:
        # Nothing was read, so there is nothing to replace
        return

    modified_df = pd.concat(pieces)
    modified_df.to_csv(input_file, index=False)

# Main script
# Step 1: Split the input SDF file into smaller chunks
# The chunks get their own sub-folder. Writing them into output_folder itself
# puts them next to 'datasets.sdf', and Step 2 processes every *.sdf file in
# the chunk folder, so the full dataset would be featurized a second time.
chunk_folder = os.path.join(output_folder, 'chunks')
split_sdf(input_sdf, chunk_folder, chunk_size)

# Define output file paths
output_descriptors = os.path.join(output_folder, 'DP_Molecule_Features.csv')
output_fingerprints = os.path.join(output_folder, 'FP_Molecule_Features.csv')

# Step 2: Generate descriptors and fingerprints for all chunks
generate_descriptors_and_fingerprints(chunk_folder, temp_folder)

# Step 3: Merge temporary files into final output files
merge_temp_files(temp_folder, output_descriptors, output_fingerprints)

# Step 4: Replace the first column of descriptors and fingerprints with custom IDs
# The IDs come from 'ID_Relationship.csv' (written by step (1) together with the
# SDF file): it lists exactly the molecules that were written to the SDF, in the
# same order. The original CSV still contains the rows whose SMILES were
# rejected, so using it would shift every ID after the first rejected row.
# NOTE: the previous code selected an 'RI' column when present, but it loaded
# only 'SMILES' and 'ID', so that branch could never run; IDs were always used.
input_file = os.path.join(output_folder, 'ID_Relationship.csv')
data = pd.read_csv(input_file, usecols=['Original_ID'])
rt_values = data['Original_ID'].tolist()

replace_first_column_in_chunks(output_descriptors, rt_values)
replace_first_column_in_chunks(output_fingerprints, rt_values)


In [None]:
#(3)To filter molecular features by removing sparse and highly correlated columns
import pandas as pd
import numpy as np

def remove_sparse_features(df, threshold=0.8):
    """
    Remove features that are identical for 80% or more of the compounds.
    Retain columns 'ID' and 'RI'.

    A column is removed when its single most frequent value (missing values
    count as a value) covers at least ``threshold`` of the rows.

    Parameters:
        df (pd.DataFrame): Input dataframe with molecular features.
        threshold (float): Proportion of identical values at or above which the column will be removed.

    Returns:
        pd.DataFrame: Dataframe with sparse features removed.
    """
    # With no rows there is nothing to measure; keep every column
    if df.shape[0] == 0:
        return df.copy()

    # Start from the protected columns, then decide the rest one by one.
    # Columns are addressed by position so repeated names cannot confuse this.
    keep = df.columns.isin(['ID', 'RI']).copy()
    for position in range(df.shape[1]):
        if keep[position]:
            continue
        # Share of rows taken by the most common value. The number of distinct
        # values is not a substitute: a constant column has one distinct value
        # and must be dropped, while a fully varied column must be kept.
        dominant_share = df.iloc[:, position].value_counts(normalize=True, dropna=False).max()
        keep[position] = dominant_share < threshold

    return df.loc[:, keep]

def remove_highly_correlated_features(df, threshold=0.95):
    """
    Remove highly correlated features, retaining one feature from each group of correlated features.
    Retain columns 'ID' and 'RI'.

    Numeric columns are visited from left to right. A column is dropped when
    its absolute correlation with an already retained column to its left
    exceeds ``threshold``; otherwise it is retained. Non-numeric columns and
    columns whose correlations are all NaN are never dropped.

    Parameters:
        df (pd.DataFrame): Input dataframe with molecular features.
        threshold (float): Correlation threshold above which features are considered redundant.

    Returns:
        pd.DataFrame: Dataframe with highly correlated features removed.
    """
    # Only numeric feature columns take part; 'ID' and 'RI' are excluded
    features = df.drop(columns=['ID', 'RI'], errors='ignore').select_dtypes(include=[np.number])
    abs_corr = features.corr().abs()
    names = list(abs_corr.columns)

    # True where |corr| > threshold, restricted to the lower triangle including
    # the diagonal (row >= column). NaN compares as False.
    with np.errstate(invalid='ignore'):
        exceeds = np.tril(abs_corr.to_numpy() > threshold)

    # A column is a candidate when any entry in its mask column is set. The
    # diagonal is part of the mask, so a column's self-correlation counts too.
    candidates = np.flatnonzero(exceeds.any(axis=0))

    retained = []   # positions of candidates that are kept
    redundant = []  # names of candidates that are dropped
    for position in candidates:
        if any(exceeds[position, earlier] for earlier in retained):
            redundant.append(names[position])
        else:
            retained.append(position)

    # Dropping by name leaves 'ID', 'RI' and all non-candidates in place
    return df.drop(columns=redundant, errors='ignore')

def filter_features(input_file, output_file, sparse_threshold=0.8, corr_threshold=0.95):
    """
    Apply feature filtering to the molecular feature dataset.

    The file is loaded, sparse features are removed, then highly correlated
    features are removed, and the result is written to ``output_file``.

    Parameters:
        input_file (str): Path to the input CSV file with molecular features.
        output_file (str): Path to save the filtered dataset.
        sparse_threshold (float): Threshold for removing sparse features.
        corr_threshold (float): Threshold for removing highly correlated features.

    Returns:
        None
    """
    # Load everything as text first so 'ID' and 'RI' keep their exact spelling
    df = pd.read_csv(input_file, dtype=str, low_memory=False)

    # Convert feature columns to numbers. The correlation filter only looks at
    # numeric columns, so leaving every column as text turns it into a no-op.
    for name in df.columns:
        if name in ('ID', 'RI'):
            continue
        try:
            df[name] = pd.to_numeric(df[name])
        except (ValueError, TypeError):
            # Deliberate: a column with non-numeric text stays as text and is
            # simply not considered by the correlation filter
            pass

    # Remove sparse features
    filtered_df = remove_sparse_features(df, threshold=sparse_threshold)

    # Remove highly correlated features
    cleaned_df = remove_highly_correlated_features(filtered_df, threshold=corr_threshold)

    # Save the cleaned dataframe to the output file
    cleaned_df.to_csv(output_file, index=False)
    print(f"Filtered data saved to {output_file}")

# Define file paths
# Both paths are relative to the current working directory.
# NOTE(review): step (2) writes 'DP_Molecule_Features.csv' and
# 'FP_Molecule_Features.csv' under 'data'; confirm where
# 'Molecule_Features.csv' is produced before running this cell.
input_file = 'Molecule_Features.csv'  # Input file with molecular features
output_file = 'Molecule_Features_filtered.csv'  # Output file for filtered features

# Execute feature filtering
filter_features(input_file, output_file)
