In [2]:
import pandas as pd
import os
import numpy as np

# PATH = f"{os.getcwd()}/Documents/class/TCC"

def _read_signature_csv(csv_path):
    """Read one raw signature CSV and rename its `cell_id` column to `cell_line`."""
    raw_frame = pd.read_csv(csv_path)
    return raw_frame.rename(columns={'cell_id': 'cell_line'})


# Molecule signature splits as shipped in `raw/`.
df_fame_train = _read_signature_csv('raw/signature_mol_train.csv')
df_fame_test = _read_signature_csv('raw/signature_mol_test.csv')
df_fame_val = _read_signature_csv('raw/signature_mol_val.csv')

# The matching `_cl` splits.
df_fame_train_cl = _read_signature_csv('raw/signature_mol_train_cl.csv')
df_fame_test_cl = _read_signature_csv('raw/signature_mol_test_cl.csv')
df_fame_val_cl = _read_signature_csv('raw/signature_mol_val_cl.csv')

# Stack the shipped splits back into one frame each (train, then test, then
# val); the original row labels of every part are kept as-is.
df_fame = pd.concat([df_fame_train, df_fame_test, df_fame_val])
df_fame_cl = pd.concat([df_fame_train_cl, df_fame_test_cl, df_fame_val_cl])

In [19]:
import re
import selfies as sf

def pre_processing_dataframe(dataset):
    """
    Deduplicate a signature frame and add the `selfies` and `gene_e` columns.

    Steps:
    1. Keep only rows whose (`cell_line`, `smiles`) pair occurs exactly once;
       every row of a repeated pair is dropped.
    2. Add `selfies`: the list of bracketed tokens found in the SELFIES
       encoding of each `smiles` value.
    3. Add `gene_e`: the values of `gene_expression` re-rendered with six
       decimals and joined with `//`.

    The input frame is not modified; a new frame is returned.
    """
    def _selfies_tokens(smiles):
        # Each SELFIES symbol is written as `[...]`; collect them in order.
        return re.findall(r'\[.*?\]', sf.encoder(smiles))

    def _format_expression(raw):
        # Strips the first and last character and splits on ', ' --
        # presumably `raw` is a stringified list like "[0.1, 0.2]"; verify
        # against the raw CSVs.
        pieces = raw[1:-1].split(', ')
        return '//'.join(f"{float(piece):.6f}" for piece in pieces)

    # Count rows per (cell_line, smiles) pair and keep the pairs seen once.
    pair_sizes = dataset.groupby(['cell_line', 'smiles']).size().reset_index(name='count')
    seen_once = pair_sizes[pair_sizes['count'] == 1].drop(columns='count')

    # The inner merge filters the rows and yields a fresh frame.
    dataset = dataset.merge(seen_once, on=['smiles', 'cell_line'], how='inner')

    dataset['selfies'] = dataset.smiles.apply(_selfies_tokens)
    dataset['gene_e'] = dataset.gene_expression.apply(_format_expression)

    return dataset

In [20]:
# Run the same preprocessing over the molecule frame and its `_cl` counterpart.
df_fame, df_fame_cl = (
    pre_processing_dataframe(frame) for frame in (df_fame, df_fame_cl)
)

### Split into train, validation, and test sets

In [21]:
import pandas as pd
import numpy as np

def split_data_in_train_and_validation(df_fame_new, df_fame_test, target_proportion=0.10, random_state=None):
    """
    Split the non-test rows into training and validation sets by `pert_iname`.

    Rows of `df_fame_new` whose `smiles` also appears in `df_fame_test` are
    discarded first. The distinct `pert_iname` values of the remaining rows
    are shuffled, and the first `int(n_distinct * target_proportion)` of them
    form the validation group. Each row then goes to the side its
    `pert_iname` belongs to, so no `pert_iname` is shared between the two
    returned frames.

    The split is made over `pert_iname` values, not over rows, and no
    per-`cell_line` stratification is applied. The validation share of rows
    (overall and per `cell_line`) therefore only approximates
    `target_proportion`.

    Parameters:
    - df_fame_new: DataFrame with at least `smiles` and `pert_iname` columns.
    - df_fame_test: DataFrame with a `smiles` column; its molecules are
      excluded from both outputs.
    - target_proportion: Fraction of distinct `pert_iname` values assigned to
      validation (default 0.10).
    - random_state: Optional integer seed for the shuffle. When None, NumPy's
      global random state is used, so `np.random.seed(...)` set by the caller
      still controls the result.

    Returns:
    - df_train: Rows whose `pert_iname` was not selected for validation.
    - df_val: Rows whose `pert_iname` was selected for validation.
    """
    # Drop every molecule that is reserved for the test set.
    data = df_fame_new[~df_fame_new['smiles'].isin(df_fame_test['smiles'])]

    # Copy into a plain array so the in-place shuffle never touches pandas data.
    pert_inames = np.array(data['pert_iname'].unique())
    if random_state is None:
        np.random.shuffle(pert_inames)
    else:
        np.random.RandomState(random_state).shuffle(pert_inames)

    # The first slice of the shuffled names becomes the validation group.
    val_name_count = int(len(pert_inames) * target_proportion)
    val_pert_inames = pert_inames[:val_name_count]

    in_validation = data['pert_iname'].isin(val_pert_inames)
    df_train = data[~in_validation]
    df_val = data[in_validation]

    return df_train, df_val




In [23]:
# Hold out every signature of these two compounds as the test set.
TEST_PERT_INAMES = ['dexamethasone', 'testosterone']
df_fame_test = df_fame[df_fame['pert_iname'].isin(TEST_PERT_INAMES)]

# The split shuffles `pert_iname` values with NumPy's global generator; seed it
# so the train/validation CSVs written at the end of the notebook are the same
# on every Restart & Run All.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
df_fame_train, df_fame_val = split_data_in_train_and_validation(df_fame, df_fame_test, target_proportion=0.10)

# Validation-to-train row ratio per cell line (sanity check of the split).
df_fame_val.cell_line.value_counts() / df_fame_train.cell_line.value_counts()



VCAP    0.113279
MCF7    0.110645
Name: cell_line, dtype: float64

In [24]:
# Pick, for each split, the `_cl` rows carrying the same row labels.
# NOTE(review): this assumes `df_fame` and `df_fame_cl` share row labels for
# the same samples after preprocessing -- confirm against the raw files.
df_fame_test_cl, df_fame_train_cl, df_fame_val_cl = (
    df_fame_cl.loc[split_frame.index.tolist()]
    for split_frame in (df_fame_test, df_fame_train, df_fame_val)
)

## Save results

In [25]:
# Destination file for every split, in the order the files are written.
_split_outputs = {
    'process/df_fame_test.csv': df_fame_test,
    'process/df_fame_train.csv': df_fame_train,
    'process/df_fame_val.csv': df_fame_val,
    'process/df_fame_test_cl.csv': df_fame_test_cl,
    'process/df_fame_train_cl.csv': df_fame_train_cl,
    'process/df_fame_val_cl.csv': df_fame_val_cl,
}

# Row labels are not written to the CSVs.
for _csv_path, _split_frame in _split_outputs.items():
    _split_frame.to_csv(_csv_path, index=False)