# In [28]:
import torch
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, GINConv, NNConv, global_add_pool, global_mean_pool, aggr
from pairing.data import PairData, to_torch, Dataset
import pandas as pd
from itertools import combinations, product
import os
import numpy as np
import ast
import re

class GCN(torch.nn.Module):
    """Graph encoder that maps a batch of graphs to one embedding per graph.

    Node features are zero-padded to ``embedding_size``, passed ``num_convs``
    times through a single (weight-shared) message-passing layer, pooled to
    one vector per graph, and finally projected by an MLP to
    ``embedding_size`` outputs.

    Args:
        num_convs: Number of times the convolution layer is applied.
        num_linear: Number of linear blocks in the MLPs built by
            ``make_sequential``.
        embedding_size: Width of node embeddings and of the returned graph
            embedding.
        aggr_steps: Processing steps passed to the ``Set2Set`` readout. When
            ``0`` the readout is bypassed and sum/mean pooling is used.
        architecture: One of ``"GCN"``, ``"GIN"`` or ``"NNConv"``.

    Raises:
        KeyError: If ``architecture`` is not one of the supported names.
    """

    def __init__(self, num_convs, num_linear, embedding_size, aggr_steps, architecture):
        super().__init__()

        # Not read anywhere in this class; kept so the instance attributes
        # stay the same as before.
        self.layers = []
        self.task = "graph"

        # Pad only the right side of the feature dimension so raw node
        # features reach the width expected by the convolution.
        self.pad = torch.nn.ZeroPad2d(
            (0, embedding_size - Dataset.num_node_features(), 0, 0))

        # NOTE: `device` is a module-level global of this file; it has to be
        # defined before a GCN is constructed.
        self.gcn = self.make_conv(architecture, num_linear, embedding_size)
        self.gcn.to(device)
        self.num_convs = num_convs

        self.architecture = architecture

        self.aggr_steps = aggr_steps
        self.readout = aggr.Set2Set(embedding_size, aggr_steps)
        self.readout.to(device)

        # Both pooling paths in forward() are expected to hand over
        # 2 * embedding_size features per graph.
        self.post_mp = make_sequential(num_linear,
                                       2 * embedding_size,
                                       embedding_size,
                                       is_last=True)
        self.post_mp.to(device)

    def make_conv(self, architecture, num_linear, embedding_size):
        """Build the message-passing layer for the requested architecture.

        Raises:
            KeyError: If ``architecture`` is not ``"GCN"``, ``"GIN"`` or
                ``"NNConv"``.
        """
        if architecture == "GCN":
            return GCNConv(embedding_size, embedding_size)
        if architecture == "GIN":
            return GINConv(make_sequential(num_linear, embedding_size, embedding_size))
        if architecture == "NNConv":
            # The edge network emits embedding_size**2 values per edge, i.e.
            # one (embedding_size x embedding_size) weight matrix.
            mpfn = make_sequential(1, Dataset.num_edge_features(), embedding_size**2)
            return NNConv(embedding_size, embedding_size, mpfn)
        raise KeyError(f"Received invalid architecture = {architecture}.")

    def forward(self, x, edge_index, edge_attr, batch_index):
        """Return one embedding per graph in the batch.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity.
            edge_attr: Edge features; only used by the ``"NNConv"``
                architecture.
            batch_index: Graph assignment of every node.
        """
        x = self.pad(x)
        uses_edge_attr = self.architecture == "NNConv"
        for _ in range(self.num_convs):
            if uses_edge_attr:
                x = self.gcn(x, edge_index, edge_attr)
            else:
                x = self.gcn(x, edge_index)

        # Only compute the pooling that is actually used: previously the
        # sum/mean pooling was always evaluated and then discarded whenever
        # the Set2Set readout was active.
        if self.aggr_steps > 0:
            pooled = self.readout(x, index=batch_index)
        else:
            pooled = torch.cat(
                [global_add_pool(x, batch_index), global_mean_pool(x, batch_index)],
                dim=1)
        return self.post_mp(pooled)


class MixturePredictor(torch.nn.Module):
    """Predict class logits for a pair of molecular graphs.

    Both graphs of a pair are encoded by the same ``GCN`` instance; the two
    graph embeddings are concatenated and mapped by an MLP to 33 logits.
    """

    def __init__(self, num_convs, num_linear, embedding_size, aggr_steps, architecture):
        super().__init__()

        # One encoder shared by the "s" and "t" graphs of every pair.
        self.gcn = GCN(num_convs, num_linear, embedding_size, aggr_steps, architecture)

        # The head consumes the two concatenated embeddings. The output width
        # is hard-coded to 33 (originally noted as Dataset.num_classes()).
        head_input_dim = 2 * embedding_size
        self.out = make_sequential(num_linear, head_input_dim, 33, is_last=True)

    def forward(self, x_s, edge_index_s, edge_attr_s, x_s_batch, x_t, edge_index_t,
                edge_attr_t, x_t_batch, y, *args, **kwargs):
        """Return the logits for each pair in the batch.

        ``y`` and any extra positional/keyword arguments are accepted but not
        used, so a whole batch dictionary can be splatted into the call.
        """
        source_embedding = self.gcn(x_s, edge_index_s, edge_attr_s, x_s_batch)
        target_embedding = self.gcn(x_t, edge_index_t, edge_attr_t, x_t_batch)

        return self.out(torch.cat([source_embedding, target_embedding], dim=1))

def make_sequential(num_layers, input_dim, output_dim, is_last=False):
    """Build an MLP as a ``Sequential`` of ``Linear -> ReLU -> Dropout`` blocks.

    Args:
        num_layers: Requested number of blocks. At least one block is always
            built, so values below 1 behave like 1.
        input_dim: Input width of the first block.
        output_dim: Output width of every block.
        is_last: When true, the final block is a bare ``Linear`` (no ReLU and
            no Dropout) so the network can emit unconstrained values.

    Returns:
        A ``torch.nn.Sequential`` whose children are the per-block
        ``Sequential`` modules.
    """

    def hidden_block(in_features):
        return torch.nn.Sequential(torch.nn.Linear(in_features, output_dim),
                                   torch.nn.ReLU(), torch.nn.Dropout(p=0.5))

    layers = [hidden_block(input_dim)]
    while len(layers) < num_layers:
        layers.append(hidden_block(output_dim))

    if is_last:
        # Choose the input width from the number of blocks actually built,
        # not from `num_layers`: with num_layers <= 0 there is still exactly
        # one block, and it must accept `input_dim` features.
        # The final block is swapped in after the loop (rather than skipped
        # during construction) so layers are created in the same order as
        # before and seeded parameter initialisation is unchanged.
        final_in = input_dim if len(layers) == 1 else output_dim
        layers[-1] = torch.nn.Sequential(torch.nn.Linear(final_in, output_dim))

    return torch.nn.Sequential(*layers)


# Load the trained model onto the GPU when one is available, otherwise the CPU.
# `device` and `model` are module-level globals used by the classes and
# functions in this file.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_path = "model_33.pt"  # Path to your trained model
# NOTE(review): this loads a whole pickled model object, so the file must come
# from a trusted source. Recent PyTorch releases default `torch.load` to
# `weights_only=True`, which may reject such a checkpoint -- confirm against
# the installed version.
model = torch.load(model_path, map_location=device)
# Switch to inference mode (e.g. disables the Dropout layers).
model.eval()

# Build the CID -> SMILES lookup from the 'CID' and 'Canonical SMILES' columns.
cids_df = pd.read_csv('CID_SMILES.csv')
cid_to_smiles = dict(zip(cids_df['CID'], cids_df['Canonical SMILES']))

# Load the mixture definitions (one row per mixture).
# NOTE(review): 'Mixure' looks like a misspelling of 'Mixture'; it is left
# untouched because it has to match the file name on disk.
mixtures_df = pd.read_csv('Mixure_Definitions_augmented_dataset.csv')

# Function to create PairData instances from SMILES pairs without notes
def create_pair_data_without_notes(smiles1, smiles2, num_classes):
    """Wrap two SMILES strings into a ``PairData`` sample with a dummy target.

    Args:
        smiles1: SMILES of the first ("s") molecule.
        smiles2: SMILES of the second ("t") molecule.
        num_classes: Length of the all-zero placeholder target vector.

    Returns:
        A ``PairData`` holding both molecular graphs, their SMILES strings
        and a zero-filled ``y``.
    """
    source_graph = to_torch(smiles1)
    target_graph = to_torch(smiles2)

    # No labels are available here, so the target is only a placeholder of
    # the right length.
    placeholder_target = torch.zeros(num_classes)

    fields = {
        'x_s': source_graph['node_feat'].float(),
        'edge_attr_s': source_graph['edge_feat'].float(),
        'edge_index_s': source_graph['edge_index'],
        'smiles_s': smiles1,
        'x_t': target_graph['node_feat'].float(),
        'edge_attr_t': target_graph['edge_feat'].float(),
        'edge_index_t': target_graph['edge_index'],
        'smiles_t': smiles2,
        'y': placeholder_target.float(),
    }
    return PairData(**fields)



        
def predict_for_mixture(cid_list, num_classes):
    """Average the model's pairwise predictions over a mixture's components.

    CIDs without an entry in ``cid_to_smiles`` are ignored. A mixture with a
    single known component is paired with itself; otherwise every unordered
    pair of distinct positions in the component list is scored.

    Args:
        cid_list: CIDs of the mixture's components (anything ``int()``
            accepts).
        num_classes: Length of the placeholder target attached to each pair.

    Returns:
        The element-wise mean of the per-pair sigmoid outputs as a NumPy
        array, or ``None`` when no pair could be formed.
    """
    known_smiles = []
    for cid in cid_list:
        key = int(cid)
        if key in cid_to_smiles:
            known_smiles.append(cid_to_smiles[key])

    if len(known_smiles) == 1:
        pairs = product(known_smiles, repeat=2)
    else:
        pairs = combinations(known_smiles, 2)

    to_probability = torch.nn.Sigmoid()
    per_pair_probs = []

    for first, second in pairs:
        sample = create_pair_data_without_notes(first, second, num_classes)
        # follow_batch produces the x_s_batch / x_t_batch vectors the model's
        # forward() expects.
        single_loader = DataLoader([sample], batch_size=1, follow_batch=['x_s', 'x_t'])

        for batch in single_loader:
            batch.to(device)
            with torch.no_grad():
                scores = model(**batch.to_dict())
                per_pair_probs.append(to_probability(scores).cpu().numpy())

    if not per_pair_probs:
        return None
    return np.mean(np.vstack(per_pair_probs), axis=0)

def main():
    """Predict every mixture in ``mixtures_df`` and write the results to CSV.

    For each row the non-missing ``CID_1`` .. ``CID_10`` values are collected
    and passed to ``predict_for_mixture``. Mixtures for which no prediction
    could be made are left out. The output file (``inetrmediate_outpt``) has
    the columns 'Dataset', 'Mixture Label' and 'Average Prediction'; the
    prediction array ends up in the CSV in its string form.
    """
    num_classes = 33  # this must match the number that model was trained with
    output_columns = ['Dataset', 'Mixture Label', 'Average Prediction']
    cid_columns = [f'CID_{i+1}' for i in range(10)]
    results = []

    for _, row in mixtures_df.iterrows():
        dataset = row['Dataset']
        mixture_label = row['Mixture Label']
        cid_list = [row[col] for col in cid_columns if not pd.isna(row[col])]

        avg_prediction = predict_for_mixture(cid_list, num_classes)
        if avg_prediction is None:
            continue

        results.append({
            'Dataset': dataset,
            'Mixture Label': mixture_label,
            'Average Prediction': avg_prediction
        })

    # Pass the columns explicitly so a header row is written even when there
    # are no results; a completely empty CSV cannot be read back with
    # pd.read_csv.
    results_df = pd.DataFrame(results, columns=output_columns)
    results_df.to_csv(inetrmediate_outpt, index=False)

# Path of the intermediate CSV that main() writes and that is read back below.
# NOTE(review): the name looks like a misspelling of "intermediate_output";
# it is kept because main() refers to it by this exact name.
inetrmediate_outpt = 'mixture_predictions_33.csv'

# Run the prediction pass only when executed as a script.
if __name__ == "__main__":
    main()
    
# Read the intermediate predictions back. This line sits outside the guard
# above, so it also runs on import and requires the CSV to exist already.
predictions_df = pd.read_csv(inetrmediate_outpt)

# Function to safely evaluate the string representation of arrays
def safe_eval(array_str):
    """Parse the text form of an array (e.g. ``"[0.1 0.2\\n 0.3]"``) into NumPy.

    Elements may be separated by whitespace, by commas, or by both, and
    padding just inside the brackets is tolerated. Nested brackets are
    parsed as nested lists.

    Args:
        array_str: The string to parse.

    Returns:
        The parsed ``numpy.ndarray``. If the value cannot be parsed (including
        non-string input such as NaN from an empty CSV cell) the error is
        printed and an empty array is returned.
    """
    try:
        # re.sub raises TypeError for non-string input, which is handled below.
        text = re.sub(r'\[\s+', '[', array_str)      # "[ 1." -> "[1."
        text = re.sub(r'\s+\]', ']', text)           # "2. ]" -> "2.]"
        text = re.sub(r'\s*,\s*', ',', text.strip())  # tighten existing commas
        text = re.sub(r'\s+', ',', text)             # remaining gaps separate elements
        return np.array(ast.literal_eval(text))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print(f"Error parsing array: {e}")
        return np.array([])

# Parse every stored prediction string back into a NumPy array.
average_predictions = predictions_df['Average Prediction'].apply(safe_eval).values

# safe_eval returns an empty array for strings it cannot parse; mark the rows
# that produced usable predictions.
valid_rows = np.array([arr.size > 0 for arr in average_predictions], dtype=bool)

if not valid_rows.any():
    print("No valid predictions found in the 'Average Prediction' column.")
else:
    if not valid_rows.all():
        # Stacking empty and non-empty arrays together would fail, so the
        # unparseable rows are left out instead.
        print(f"Skipping {int((~valid_rows).sum())} row(s) whose predictions could not be parsed.")

    average_predictions = np.vstack(average_predictions[valid_rows])

    num_predictions = average_predictions.shape[1]
    prediction_columns = [f'Prediction_{i+1}' for i in range(num_predictions)]

    predictions_array_df = pd.DataFrame(average_predictions, columns=prediction_columns)

    # Combine the original labels with the new prediction columns. The index
    # is reset so the labels line up with the stacked predictions row by row.
    labels_df = predictions_df.loc[valid_rows, ['Dataset', 'Mixture Label']].reset_index(drop=True)
    combined_df = pd.concat([labels_df, predictions_array_df], axis=1)

    # Save the updated dataframe to a new CSV file
    output_file2 = 'predictions_separated_mean_33_Augmentation_Dataset.csv'
    combined_df.to_csv(output_file2, index=False)

    # Report the file that was actually written (the name `output_file` was
    # never defined and raised a NameError here).
    print(f"Predictions separated into columns saved to {output_file2}")
    print(combined_df.head())
