In [None]:
import pandas as pd

# Load the raw input table; later cells read and filter its 'SMILES' column.
df = pd.read_csv('name.csv')

In [None]:
import numpy as np
from rdkit import Chem

def canonicalize_smiles(smiles_str):
    """Return the canonical isomeric SMILES for ``smiles_str``, or ``None``.

    ``None`` is returned when the input is not a string (for example a NaN
    coming from an empty CSV cell), when RDKit cannot parse the string, or
    when RDKit raises while parsing or serialising it.
    """
    # Missing values reach this function as float NaN / None and can never be
    # parsed, so reject them up front instead of relying on an exception.
    if not isinstance(smiles_str, str):
        return None
    try:
        mol = Chem.MolFromSmiles(smiles_str, sanitize=True)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
    except Exception:
        # Still best-effort, but no longer a bare ``except``: KeyboardInterrupt
        # and SystemExit must be able to stop a long-running ``apply``.
        return None

In [None]:
# Canonicalise every SMILES; entries that cannot be canonicalised come back
# as None and their rows are removed.
df['SMILES'] = df['SMILES'].apply(canonicalize_smiles)
df = df.dropna(subset=['SMILES'])

In [None]:
# Rows sharing the same SMILES string are duplicates; keep the first occurrence of each.
df = df.drop_duplicates(subset="SMILES", keep="first")

In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen

# Elements a molecule is allowed to contain
allowed_atoms = {'C', 'H', 'O', 'N', 'P', 'S'}

def is_valid_molecule(smiles):
    """Return True when the molecule parsed from ``smiles`` passes every filter.

    A SMILES that RDKit cannot parse is rejected. The remaining checks are
    evaluated lazily, in order, and the first failing one rejects the molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False

    return (
        # 1. Neutrality: the formal charge reported for the molecule is zero
        Chem.GetFormalCharge(mol) == 0
        # 2. No radicals (no atom carries unpaired electrons)
        and all(atom.GetNumRadicalElectrons() == 0 for atom in mol.GetAtoms())
        # 3. Molecular weight of at most 1000
        and not Descriptors.MolWt(mol) > 1000
        # 4. Only allowed elements
        and {atom.GetSymbol() for atom in mol.GetAtoms()} <= allowed_atoms
        # 5. logP strictly greater than 1
        and not Crippen.MolLogP(mol) <= 1
    )

# Evaluate the molecule filter once per row, then keep the rows that pass.
valid_mask = df['SMILES'].apply(is_valid_molecule)
df_filtered = df[valid_mask]

# Report how many molecules survived the filter.
print(f"Прошло фильтрацию: {len(df_filtered)} молекул из {len(df)}")

In [None]:
# Continue with an independent copy of the filtered rows so that adding
# columns later does not touch df_filtered.
df = df_filtered.copy()

In [None]:
from rdkit.Chem import Fragments

def has_phenol_or_aromatic_amine(smiles):
    """Return True when RDKit's ``fr_phenol`` or ``fr_aniline`` count is positive.

    A SMILES that cannot be parsed yields False.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False
    # Checked lazily in this order: phenol first, then aniline.
    fragment_counters = (Fragments.fr_phenol, Fragments.fr_aniline)
    return any(counter(mol) > 0 for counter in fragment_counters)

# Flag every molecule for which the fragment check succeeds.
df['has_target_group'] = df['SMILES'].apply(has_phenol_or_aromatic_amine)

# Keep only the flagged rows.
df_fragments = df.loc[df['has_target_group'].eq(True)]

In [None]:
# The helper flag column was only needed for the selection; drop it again.
df = df_fragments.drop(columns=['has_target_group'])

In [None]:
import sascorer

def calculate_sa_score(smiles):
    """Return the score computed by ``sascorer.calculateScore`` for ``smiles``.

    Returns ``float('nan')`` when ``smiles`` is not a string or cannot be
    parsed by RDKit.
    """
    if not isinstance(smiles, str):
        return float("nan")
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # NaN rather than False: False compares equal to 0, so it would
        # satisfy a ``SA_Score < 6`` filter and let an invalid row through,
        # whereas every comparison with NaN is False.
        return float("nan")
    return sascorer.calculateScore(mol)

# Score every molecule.
df['SA_Score'] = df['SMILES'].apply(calculate_sa_score)

# Keep the rows whose score is below 6 and renumber them from zero.
df_filtered = df.loc[df['SA_Score'].lt(6)].reset_index(drop=True)

In [None]:
# The score was only needed for filtering; remove it from the table.
df = df_filtered.drop(columns=['SA_Score'])

In [None]:
# Checkpoint: persist the filtered table without the index column.
df.to_csv('name_1.csv', index=False)

In [None]:
# Reload the checkpoint written above. read_csv assigns a fresh default
# 0..n-1 index, which the index-aligned concat(axis=1) calls below rely on.
df = pd.read_csv('name_1.csv')

In [None]:
import numpy as np
from rdkit import Chem

def mol_from_smiles(smiles):
    """Parse ``smiles`` with RDKit; return ``np.nan`` when parsing fails."""
    parsed = Chem.MolFromSmiles(smiles)
    return np.nan if parsed is None else parsed

In [None]:
df['mol'] = df['SMILES'].apply(mol_from_smiles)
# mol_from_smiles marks unparsable SMILES with NaN, and Chem.AddHs cannot
# accept that placeholder. Drop such rows first and renumber the index so the
# feature tables concatenated later (built with a 0..n-1 index) still line up.
df = df[df['mol'].notna()].reset_index(drop=True)
df['Hmol'] = df['mol'].apply(lambda mol: Chem.AddHs(mol))

In [None]:
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

# First element of every entry in RDKit's descriptor registry, used as the descriptor name.
# NOTE(review): _descList is a private attribute — presumably (name, function)
# pairs; confirm it is stable across the RDKit versions in use.
descriptor_names = [desc[0] for desc in Descriptors._descList]

# Calculator configured with the names collected above.
Desc_list_func = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

def compute_descriptors(mol):
    """Run the shared calculator ``Desc_list_func`` on ``mol``.

    Returns whatever ``CalcDescriptors`` returns — presumably one value per
    entry of ``descriptor_names`` in the same order, since the caller builds a
    DataFrame with those column names; verify against the RDKit docs.
    """
    return Desc_list_func.CalcDescriptors(mol)

# One collection of descriptor values per molecule.
df_desc_values = df['Hmol'].apply(compute_descriptors)

df_desc = pd.DataFrame(
    df_desc_values.tolist(),
    columns=descriptor_names,
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows whenever df is not default-indexed.
    index=df.index,
)

# Append the descriptor columns to the molecule table.
df = pd.concat([df, df_desc], axis=1)

In [None]:
from mordred import Calculator, descriptors

# Mordred calculator built from the imported descriptors module, with
# ignore_3D=True.
calc = Calculator(descriptors, ignore_3D=True)

# Descriptor table computed from the molecules in df['Hmol'].
df_mordred_desc = calc.pandas(df['Hmol'])

# NOTE(review): concat(axis=1) aligns on index labels — assumes
# df_mordred_desc carries the same index as df; confirm. Also check for column
# names that clash with the RDKit descriptor columns already present in df.
df = pd.concat([df, df_mordred_desc], axis=1)

In [None]:
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from rdkit.DataStructs import ConvertToNumpyArray

# Morgan fingerprint settings: radius and fingerprint length (number of columns).
radius = 3
nBits = 2048

# Shared fingerprint generator configured with the settings above.
generator = GetMorganGenerator(radius=radius, fpSize=nBits, countSimulation=True)

def mol_to_morgan_fp(mol):
    """Return the fingerprint of ``mol`` as an integer array of length ``nBits``.

    A ``None`` molecule yields an all-zero array.
    """
    bits = np.zeros((nBits,), dtype=int)
    if mol is None:
        return bits
    # ConvertToNumpyArray writes the fingerprint into the preallocated array.
    ConvertToNumpyArray(generator.GetFingerprint(mol), bits)
    return bits

# One fingerprint row per molecule, stacked into a 2-D array.
df_fingerprints_value = np.array([mol_to_morgan_fp(mol) for mol in df['Hmol']])

df_fingerprints = pd.DataFrame(
    df_fingerprints_value,
    columns=[f'fp_{i}' for i in range(nBits)],
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows whenever df is not default-indexed.
    index=df.index,
)

df = pd.concat([df, df_fingerprints], axis=1)

In [None]:
# The molecule-object columns were only needed to compute features; drop them.
df = df.drop(columns = ['mol', 'Hmol'])

In [None]:
# Independent snapshot of the full table (SMILES plus all computed features).
df1 = df.copy()

In [None]:
# Set the SMILES column aside in Y and keep every other column as the feature table X.
Y = df1[['SMILES']]
X = df1.drop(columns=['SMILES'])

In [None]:
df = X

# Coerce every feature column to numeric; values that cannot be parsed become
# NaN. ``apply`` returns a new frame, so no defensive copy of df is needed
# beforehand (the previous ``df.copy()`` was overwritten immediately).
df_clean = df.apply(pd.to_numeric, errors='coerce')

In [None]:
# Fraction of NaN values per column, printed from the sparsest column down.
nan_ratio = df_clean.isna().mean()
print(nan_ratio.sort_values(ascending=False))

In [None]:
# Keep only the columns in which fewer than 80% of the values are NaN.
df_filtered = df_clean.loc[:, nan_ratio < 0.8]

In [None]:
# Continue with the column-filtered table (same object, no copy is made).
df = df_filtered

In [None]:
# Per-column missing-value statistics for the current table.
missing_count = df.isna().sum()
missing_ratio = df.isna().mean()

# Report only the columns that still contain NaNs, highest ratio first.
summary = pd.DataFrame({'NaN_count': missing_count, 'NaN_ratio': missing_ratio})
summary = summary[summary['NaN_count'] > 0].sort_values("NaN_ratio", ascending=False)

print(summary)

In [None]:
# Backup copy of the table taken before missing-value processing.
# NOTE(review): df2 is not referenced again in the visible cells — confirm it is still needed.
df2 = df.copy()

In [None]:
import pandas as pd
import numpy as np

def process_missing_values(df, nan_threshold=0.3, fill_strategy='median'):
    """
    Drop columns with too many missing values and fill the remaining gaps.

    Parameters:
    - df: input DataFrame (left untouched; a copy is processed)
    - nan_threshold: maximum allowed NaN fraction per column; columns whose
      fraction is strictly greater are dropped
    - fill_strategy: 'median', 'mean', 'zero', 'ffill' or 'bfill'

    Returns:
    - the cleaned DataFrame
    - a per-column table (NaN_count, NaN_ratio) describing the NaNs before
      processing, sorted by NaN_ratio in descending order

    Raises:
    - ValueError: if fill_strategy is not one of the supported names

    Note: 'median' and 'mean' only fill numeric columns, and 'ffill'/'bfill'
    cannot fill gaps at the start/end of a column, so some NaNs may remain;
    the residual count is printed.
    """
    df = df.copy()

    # 1. NaN analysis
    missing = df.isna()
    summary = pd.DataFrame({
        'NaN_count': missing.sum(),
        'NaN_ratio': missing.mean()
    }).sort_values('NaN_ratio', ascending=False)

    print("Обнаружены пропущенные значения в следующих столбцах:")
    print(summary[summary['NaN_count'] > 0])

    # 2. Drop the columns whose NaN fraction exceeds the threshold
    cols_to_drop = summary.index[summary['NaN_ratio'] > nan_threshold].tolist()
    df = df.drop(columns=cols_to_drop)
    print(f"\n Удалено {len(cols_to_drop)} колонок с NaN > {nan_threshold * 100:.0f}%")

    # 3. Fill the remaining NaNs
    if fill_strategy == 'median':
        df = df.fillna(df.median(numeric_only=True))
    elif fill_strategy == 'mean':
        df = df.fillna(df.mean(numeric_only=True))
    elif fill_strategy == 'zero':
        df = df.fillna(0)
    elif fill_strategy == 'ffill':
        # fillna(method=...) is deprecated in pandas; ffill()/bfill() are the
        # supported equivalents.
        df = df.ffill()
    elif fill_strategy == 'bfill':
        df = df.bfill()
    else:
        raise ValueError(f"Неизвестная стратегия заполнения: {fill_strategy}")

    print("Остаточные NaN после заполнения:", df.isna().sum().sum())

    return df, summary


In [None]:
# Drop columns with more than 30% NaN and fill the remaining gaps with the column median.
df_cleaned, nan_summary = process_missing_values(df, nan_threshold=0.3, fill_strategy='median')

In [None]:
# Work on an independent copy of the cleaned table.
df = df_cleaned.copy()

# Recompute the per-column missing-value statistics after cleaning.
missing_count = df.isna().sum()
missing_ratio = df.isna().mean()

# List the columns that still contain NaNs, highest ratio first.
summary = (
    pd.DataFrame({'NaN_count': missing_count, 'NaN_ratio': missing_ratio})
    .loc[lambda stats: stats['NaN_count'] > 0]
    .sort_values("NaN_ratio", ascending=False)
)

print(summary)

In [None]:
# The cleaned table becomes the feature matrix (same object, no copy is made).
X = df

In [None]:
# Absolute pairwise correlation between the feature columns.
corr_matrix = X.corr().abs()

# Keep only the strict upper triangle, so every pair is inspected once and a
# column is compared only with the columns that precede it.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Columns correlated above 0.9 with at least one earlier column. The mask is
# computed in one vectorised pass by position, which replaces the per-column
# Python loop and avoids calling the builtin any() on a pandas object (that
# misfires when a duplicated label makes upper[column] a DataFrame).
high_corr_mask = (upper > 0.9).any(axis=0).to_numpy()
to_drop = upper.columns[high_corr_mask].tolist()

X_filtered = X.drop(columns=to_drop)

In [None]:
# Continue with the decorrelated feature matrix.
X = X_filtered

In [None]:
# Display the remaining feature matrix for a visual check.
X

In [None]:
# Re-attach the SMILES column to the selected features.
# NOTE(review): concat(axis=1) aligns on index labels — assumes X still carries the same index as Y; confirm.
df = pd.concat([Y, X], axis=1)

In [None]:
# Write the final table (SMILES plus selected features) without the index column.
df.to_csv('name_end.csv', index=False)