In [549]:
import os, sys, random, itertools, math
import pandas as pd
import numpy as np
from sklearn.metrics import mutual_info_score
from rdkit import Chem
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import ast

In [550]:
#___________________Prep____________________#

In [551]:
MOLECULE_CSV = 'molecule.csv'
MOSS_IDENTIFIERS = f"newdata/moss_identifiers.csv"
MOSS_SUBSTRUCTURES = f"newdata/moss_substructures.csv"

In [552]:
molecule_df = pd.read_csv(MOLECULE_CSV, sep=";")

# Excluded Test Molecule 
test_mol = molecule_df.iloc[14]
molecule_df = molecule_df.drop([14])

with open("molecule_moss.smile", "w") as f:
    for i, row in molecule_df.iterrows():
        mol_id = f"{i+1}"
        smiles = row["SMILES"]
        f.write(f"{mol_id},0,{smiles}\n")

In [553]:
#_____________________A_____________________#

In [554]:
molecule_df = pd.read_csv(MOLECULE_CSV, sep=";")
# Excluded Molecule 
test_mol = molecule_df.iloc[14]
molecule_df = molecule_df.drop([14])

moss_df = pd.read_csv(MOSS_SUBSTRUCTURES, sep=",")
moss_df["mol_ids"] = pd.read_csv(MOSS_IDENTIFIERS, sep=";")['list'].apply(lambda x: [int(i) for i in ast.literal_eval(x)])


cleaned_moss_df = moss_df.drop_duplicates(subset=['description']).copy().reset_index()
cleaned_moss_df.rename(columns={'description': 'substructure'}, inplace=True)
cleaned_moss_df["frq"] = (cleaned_moss_df["mol_ids"].apply(len) / molecule_df.shape[0])

print(f"Original size: {moss_df.shape[0]}")
print(f"Cleaned size: {cleaned_moss_df.shape[0]}")
print(f"Removed {len(moss_df) - len(cleaned_moss_df)} duplicates")   #NOTE! Idk why there are duplicates. Use the cleaned set. (type 3 is the worst, others don't lose much)

Original size: 102
Cleaned size: 98
Removed 4 duplicates


In [555]:
sorted_df = cleaned_moss_df.sort_values("frq", ascending=False).head(5).drop(columns=["nodes","edges","s_abs","s_rel","c_abs","c_rel","id"])
top5 = sorted_df.head(5)
top5_mols = []
test = [0,0,0,0,0]
for sml in top5["substructure"]:
    mol = Chem.MolFromSmiles(sml, sanitize=False)
    top5_mols.append(mol)
    
for sml in molecule_df["SMILES"]:
    mol = Chem.MolFromSmiles(sml)
    for i,q_mol in enumerate(top5_mols):
        if mol.HasSubstructMatch(q_mol):
            test[i] += 1
            
img = Draw.MolsToGridImage(top5_mols, molsPerRow=5, subImgSize=(250,250), returnPNG=False)
img.save(f"results/top5_substructures.png")
print(test)

[46, 46, 46, 44, 31]


In [556]:
#_____________________B_____________________#

In [557]:
molecule_df["id"] = list(molecule_df.index.values +1)
mis = []

def mutual_information_binary(X, C):
    X = np.asarray(X)
    C = np.asarray(C)

    P11 = np.mean((X == 1) & (C == 1))
    P10 = np.mean((X == 1) & (C == 0))
    P01 = np.mean((X == 0) & (C == 1))
    P00 = np.mean((X == 0) & (C == 0))

    PX1 = P11 + P10
    PX0 = P01 + P00
    PC1 = P11 + P01
    PC0 = P10 + P00

    def calc(Pxy, Px, Pc):
        if Pxy == 0 or Px == 0 or Pc == 0:
            return 0
        return Pxy * np.log2(Pxy / (Px * Pc))

    MI = (calc(P11, PX1, PC1) +calc(P10, PX1, PC0) +calc(P01, PX0, PC1) +calc(P00, PX0, PC0))

    return MI


In [558]:
n = 1 # Cancer Type
threshold = 0.5

molecule_df["id"] = list(molecule_df.index.values +1)
mis = []
MI_df = pd.DataFrame([])
MI_ids = []
supports = []
P_C_given_X = []

for idx1, row1 in cleaned_moss_df.iterrows():
    for idx2, row2 in cleaned_moss_df.iterrows():
        if idx1 > idx2:
            continue 
        if idx1 == idx2:
            pair_id = f"{row1['id']}"
        else:
            pair_id = f"{row1['id']}&{row2['id']}"
            
            
        X1 = molecule_df["id"].isin(row1['mol_ids']).values
        X2 = molecule_df["id"].isin(row2['mol_ids']).values
        X = X1 & X2
        C = molecule_df[f'anti_cancer_{n}'].values
        
        if np.sum(X) == 0:
            continue

        
        
        P_C = C.mean()
        P_C_X1 = C[X1].mean()
        P_C_X2 = C[X2].mean()
        P_C_X = C[X].mean()
        
        if idx1 != idx2:
            MIC = P_C_X - max(P_C_X1,P_C_X2)
            if MIC < threshold:
                continue
                
        if P_C_X <= P_C:
            continue
            
        supports.append(X.mean())
        P_C_given_X.append(P_C_X)
        MI_ids.append(pair_id)
        MI = mutual_information_binary(X,C)
        mis.append(MI)
        

MI_df["id"] = MI_ids
MI_df[f'MI_anti_cancer_{n}'] = mis
MI_df["support"] = supports
MI_df["P_C_given_X"] = P_C_given_X

MI_df.head(3)

Unnamed: 0,id,MI_anti_cancer_1,support,P_C_given_X
0,1&76,0.068953,0.043478,1.0
1,1&83,0.068953,0.043478,1.0
2,1&84,0.068953,0.043478,1.0


In [559]:
print(len(MI_df))

54


In [560]:
mol = Chem.MolFromSmiles(test_mol["SMILES"])
test_substructures = []
for i,q_mol_smile in enumerate(cleaned_moss_df["substructure"]):
    q_mol =  Chem.MolFromSmiles(q_mol_smile, sanitize=False)
    if mol.HasSubstructMatch(q_mol):
           test_substructures.append(i +1)

In [561]:
print(test_substructures)

[26, 34, 42, 43, 44, 50, 52, 53, 89, 91, 94, 96, 98]


In [562]:
selected_rules = []

for idx, row in MI_df.iterrows():
    rule_id = row['id']
    sub_ids = list(map(int, rule_id.split('&')))
    if all(s in test_substructures for s in sub_ids):
        selected_rules.append(row)

selected_rules_df = pd.DataFrame(selected_rules)
print(selected_rules_df)


    id  MI_anti_cancer_1   support  P_C_given_X
20  26          0.439291  0.282609     0.923077
26  34          0.265627  0.260870     0.833333
30  42          0.212630  0.391304     0.666667
46  89          0.439291  0.282609     0.923077
48  91          0.373905  0.304348     0.857143


In [563]:
C = molecule_df['anti_cancer_2'].values
P_C_global = np.mean(C)

selected_rules_df['MI_conf'] = selected_rules_df[f'MI_anti_cancer_{n}'] * selected_rules_df['P_C_given_X']
selected_rules_df['lift'] = selected_rules_df['P_C_given_X'] / P_C_global
selected_rules_df['leverage'] = selected_rules_df['P_C_given_X'] - P_C_global

stats = {
    'avg_confidence': selected_rules_df['P_C_given_X'].mean(),
    'avg_MI': selected_rules_df[f'MI_anti_cancer_{n}'].mean(),
    'avg_MI_conf': selected_rules_df['MI_conf'].mean(),
    'avg_lift': selected_rules_df['lift'].mean(),
    'avg_leverage': selected_rules_df['leverage'].mean()
}

print(stats)


{'avg_confidence': 0.8406593406593406, 'avg_MI': 0.3461490304392965, 'avg_MI_conf': 0.2989198068617565, 'avg_lift': 2.274725274725275, 'avg_leverage': 0.4710941232680364}


In [None]:
predicted_activity = 1 if stats['avg_confidence'] > 0.85 else 0
true_activity = test_mol[f'anti_cancer_{n}']
print(f"Predicted: {predicted_activity}, True: {true_activity}")

Predicted: 1, True: 0
