<a href="https://colab.research.google.com/github/Vanitha-Jain/capstone/blob/Molecular-descriptors_-Unknown/Morgan_UK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
!pip install rdkit-pypi
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [5]:
def calculate_morgan_fingerprint(smiles, radius=2, n_bits=2048):
    """
    Compute the Morgan fingerprint of a single molecule given as SMILES.

    Parameters:
    - smiles (str): The SMILES string of the molecule.
    - radius (int): Radius passed to the Morgan fingerprint routine.
    - n_bits (int): Length of the fingerprint bit vector.

    Returns:
    - list or None: The fingerprint bits as a Python list, or None when RDKit
      cannot build a molecule from the SMILES or when any step raises (the
      error is printed together with the offending SMILES).
    """
    bits = None
    try:
        molecule = Chem.MolFromSmiles(smiles)
        # MolFromSmiles returning None is treated as "no fingerprint", not an error.
        if molecule is not None:
            bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=n_bits)
            bits = list(bit_vector)
    except Exception as e:
        # Best effort: report the failure and fall through to returning None.
        print(f"Error processing SMILES {smiles}: {e}")
        bits = None
    return bits

In [8]:
def process_excel(input_excel, output_csv, smiles_column='canonical_smiles', radius=2, n_bits=2048):
    """
    Read an Excel file of molecules, append Morgan fingerprint bits, and save as CSV.

    Every value in ``smiles_column`` is passed to ``calculate_morgan_fingerprint``.
    The resulting bits are stored in columns ``FP_0`` ... ``FP_{n_bits - 1}``,
    which are appended to the original columns. Rows for which no fingerprint
    could be computed (the helper returned ``None``) are kept, with their
    fingerprint columns left empty (NaN).

    Parameters:
    - input_excel (str): Path to the input Excel file containing SMILES strings.
    - output_csv (str): Path to the output CSV file to save fingerprints.
    - smiles_column (str): Column name in the input that contains SMILES strings.
    - radius (int): Radius of the Morgan fingerprint.
    - n_bits (int): Number of bits for the fingerprint.

    Returns:
    - None. If ``smiles_column`` is not present, a message is printed and no
      file is written.
    """
    # Load the workbook named by the caller.
    df = pd.read_excel(input_excel)

    if smiles_column not in df.columns:
        print(f"Column '{smiles_column}' not found in the input CSV.")
        return

    # Calculate one fingerprint (or None on failure) per row.
    fingerprints = [
        calculate_morgan_fingerprint(smiles, radius, n_bits)
        for smiles in df[smiles_column]
    ]

    # A None entry cannot be expanded into n_bits columns, so substitute a row
    # of NaN; this keeps every input row and its position intact.
    empty_row = [float("nan")] * n_bits
    failed = sum(fp is None for fp in fingerprints)
    rows = [empty_row if fp is None else fp for fp in fingerprints]
    if failed:
        print(f"Warning: no fingerprint for {failed} row(s); their FP columns are left empty.")

    # Reuse df's index so the column-wise concat lines up row for row.
    fingerprint_df = pd.DataFrame(
        rows,
        columns=[f'FP_{i}' for i in range(n_bits)],
        index=df.index,
    )

    # Concatenate original DataFrame with fingerprint DataFrame
    result_df = pd.concat([df, fingerprint_df], axis=1)

    # Save the result to a new CSV
    result_df.to_csv(output_csv, index=False)
    print(f"Fingerprints saved to {output_csv}")

# Example usage: run the fingerprint pipeline on one workbook.
if __name__ == "__main__":
    smiles_column = "smiles"  # Column of the workbook that holds the SMILES strings
    output_csv = "molecules_with_fingerprints.csv"  # CSV file the fingerprints are written to
    input_excel = "/content/Cluster 1 Rank.xlsx"  # Excel workbook to read
    process_excel(
        input_excel=input_excel,
        output_csv=output_csv,
        smiles_column=smiles_column,
    )

Fingerprints saved to molecules_with_fingerprints.csv
