# Read Dataset

In [1]:
import os
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import random 
import numpy as np
def pre_process(path, tasks):
    '''
    Read the per-task CSV files and merge them into one multi-task label table.

    For every task name in ``tasks`` the file ``<path>/refined_merged_<task>.csv``
    is read and its ``SMILES`` and ``Label`` columns are collected. Every distinct
    SMILES string then gets one label per task; when a SMILES string does not
    appear in a task's file, its label for that task is ``np.nan``.

    If a SMILES string occurs more than once inside a single task file, the label
    of its first occurrence is used.

    Input
        path: folder that contains one CSV file per task
        tasks: list of task names
    Output
        all_smiles: list of distinct SMILES strings, in order of first appearance
                    (deterministic across runs)
        list_labels: list with one entry per SMILES string; each entry is the list
                     of that string's labels, one per task, in the order of ``tasks``
    '''
    # One {smiles: label} lookup per task, so membership tests are O(1)
    # instead of scanning a list for every (smiles, task) pair.
    task_lookups = []
    # A dict used as an ordered set: keeps first-seen order, unlike set(), whose
    # iteration order for strings changes between interpreter sessions.
    first_seen = {}
    for task in tasks:
        path_task = path + "/refined_merged_{}.csv".format(task)
        data      = pd.read_csv(path_task)
        lookup    = {}
        for smiles, label in zip(data['SMILES'].tolist(), data['Label'].tolist()):
            # setdefault keeps the first label seen for a duplicated SMILES string
            lookup.setdefault(smiles, label)
            first_seen.setdefault(smiles, None)
        task_lookups.append(lookup)

    # Label every distinct SMILES string for every task
    all_smiles  = list(first_seen)
    list_labels = [
        # np.nan marks "this SMILES string is not labeled in this task"
        [lookup.get(smiles, np.nan) for lookup in task_lookups]
        for smiles in all_smiles
    ]
    return all_smiles, list_labels

# Task names; each one selects the file ./raw_data/refined_merged_<task>.csv
tasks = [
    'BRE', 'CNS', 'COL',
    'LEU', 'LNS', 'MEL',
    'OVA', 'PRO', 'REN',
]
# Distinct SMILES strings and, for each of them, one label per task
all_smiles, list_labels = pre_process(path='./raw_data', tasks=tasks)

## Feature MP 

In [2]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
# Load the pre-computed feature table (must contain a 'SMILES' column)
data = pd.read_csv('./df_mf.csv')

vector = np.array(data)
# Columns from index 2 onward are used as the feature matrix
# (presumably the first two columns are identifiers such as SMILES - confirm)
feature   = vector[::,2:]
# Normalizing data
scaler = StandardScaler()
data_normal = scaler.fit_transform(feature)

# Mapping smiles and labels
smiles = data['SMILES'].tolist()

# Build a SMILES -> position lookup once instead of calling list.index() for
# every row (which rescans the whole list each time). Iterating with enumerate
# also works when all_smiles is already a numpy array, so this cell can be
# re-run without failing on the missing ndarray.index method.
smiles_to_index = {}
for position, known_smiles in enumerate(all_smiles):
    # setdefault keeps the first position, matching list.index() semantics
    smiles_to_index.setdefault(known_smiles, position)

missing = [smile for smile in smiles if smile not in smiles_to_index]
if missing:
    raise ValueError(
        "{} SMILES from ./df_mf.csv are not present in all_smiles, e.g. {!r}".format(
            len(missing), missing[0]))

# Reorder smiles and labels so that row k matches row k of data_normal
list_index = [smiles_to_index[smile] for smile in smiles]
all_smiles = np.array(all_smiles)[list_index]
list_labels = np.array(list_labels)[list_index]

# Cluster


In [3]:
from sklearn_som.som import SOM
# Size of the SOM grid; the map has m*n nodes, i.e. m*n clusters
m, n = 3, 3

# Train the self-organising map on the normalised features
som = SOM(m=m, n=n, dim=data_normal.shape[1], random_state=24)
som.fit(data_normal)

# Cluster id predicted for every row of data_normal
predictions = som.predict(data_normal)

# list_index[c]      -> row positions of the samples assigned to cluster c
# list_len_sample[c] -> number of samples assigned to cluster c
list_index = [np.where(predictions == cluster_id)[0] for cluster_id in range(m*n)]
list_len_sample = [len(members) for members in list_index]
for cluster_size in list_len_sample:
    print(cluster_size)

5978
7160
3222
6958
6578
3199
4867
4896
3853


## Manually check the number of samples in each cluster

In [4]:

# data_cluster[c] holds the label rows of every sample assigned to cluster c
data_cluster = [list_labels[member_rows] for member_rows in list_index]

def get_data_task(sample, list_index=None):
    """Count, for every task, how many samples carry a usable label.

    A label counts as usable when it equals 0 or 1; anything else
    (for example ``np.nan`` for "not labeled") is ignored.

    Args:
        sample: 2-D array-like of labels with one row per sample and one
            column per task.
        list_index: unused. Kept only so existing calls of the form
            ``get_data_task(sample, list_index)`` keep working. Earlier the
            number of counted columns was taken from ``len(list_index)``
            (the number of clusters), which is only correct when the number
            of clusters happens to equal the number of tasks.

    Returns:
        A list of ints with one entry per column of ``sample``: the number
        of rows whose label in that column is 0 or 1.
    """
    labels = np.asarray(sample)
    is_labeled = (labels == 0) | (labels == 1)
    # Summing the boolean mask down the rows gives one count per task column
    return is_labeled.sum(axis=0).tolist()

# For each cluster, print its size and the number of labeled samples per task
for idx in range(len(data_cluster)):
    sample = data_cluster[idx]
    num_sample_task = get_data_task(sample, list_index)
    print(f"Cluster {idx}:      num_sample: {len(sample)}       : num_sample for each task: {num_sample_task}")

Cluster 0:      num_sample: 5978       : num_sample for each task: [5657, 5712, 5750, 5335, 5779, 5771, 5790, 5457, 5779]
Cluster 1:      num_sample: 7160       : num_sample for each task: [6715, 6794, 6760, 6267, 6871, 6902, 6935, 6453, 6864]
Cluster 2:      num_sample: 3222       : num_sample for each task: [2728, 2833, 2900, 2406, 2919, 2936, 3012, 2573, 2935]
Cluster 3:      num_sample: 6958       : num_sample for each task: [6538, 6602, 6681, 6072, 6675, 6709, 6758, 6294, 6734]
Cluster 4:      num_sample: 6578       : num_sample for each task: [5849, 5983, 6091, 5316, 6100, 6189, 6213, 5662, 6125]
Cluster 5:      num_sample: 3199       : num_sample for each task: [2601, 2694, 2799, 2260, 2777, 2888, 2919, 2399, 2833]
Cluster 6:      num_sample: 4867       : num_sample for each task: [4398, 4490, 4577, 4068, 4560, 4600, 4629, 4196, 4580]
Cluster 7:      num_sample: 4896       : num_sample for each task: [4258, 4399, 4513, 3817, 4461, 4565, 4606, 4054, 4530]
Cluster 8:      num_samp

## Split data

In [7]:
from sklearn.model_selection import train_test_split

# The cluster with this id is held out as the test set
test_cluster_idx = 0
# get index test dataset
idx_test = list_index[test_cluster_idx]

# Every sample outside the test cluster goes to train/validation.
# A set gives constant-time membership tests; testing `i not in idx_test`
# against the array directly rescans the whole array for every i.
test_members = set(np.asarray(idx_test).tolist())
idx_train = [i for i in range(len(all_smiles)) if i not in test_members]

# Split train_val with 90/10 (fixed random_state keeps the split reproducible)
idx_train, idx_val = train_test_split(idx_train, test_size=0.10, random_state=2)

print(len(idx_train))
print(len(idx_val))
print(len(idx_test))

print(list_labels[idx_test].shape)

# Save index split data
# NOTE(review): these lines previously referenced idx_train_1 / idx_val_1, which
# are not defined in the visible cells; they now use the names produced above.
# np.save('./cluster_split/index_train_{}.npy'.format(test_cluster_idx),idx_train)
# np.save('./cluster_split/index_val_{}.npy'.format(test_cluster_idx),idx_val)
# np.save('./cluster_split/index_test_{}.npy'.format(test_cluster_idx),idx_test)




36659
4074
5978
(5978, 9)
