In [15]:
import numpy as np
import pandas as pd 
import pickle
from pathlib import Path
import polars as pl

In [16]:
# Column name -> compact NumPy integer dtype.
# NOTE(review): `dtypes` is not referenced by any cell visible here; presumably
# it is meant for casting the encoded columns when loading data — confirm or remove.
dtypes = dict(
    buildingblock1_smiles=np.int16,
    buildingblock2_smiles=np.int16,
    buildingblock3_smiles=np.int16,
    binds_BRD4=np.byte,
    binds_HSA=np.byte,
    binds_sEH=np.byte,
)

# Root folder that every parquet / pickle path below is resolved against.
directory = Path("../data/shrunken/")

## What is in the train data?

In [None]:
# Read the training parquet with polars, then hand it to pandas using
# pyarrow-backed extension arrays for the columns.
train_data = (
    pl.read_parquet(directory / "train.parquet")
    .to_pandas(use_pyarrow_extension_array=True)
)

In [None]:
# Load building blocks
# Each pickle is opened inside a `with` block so its file handle is closed as
# soon as the load finishes, rather than being left open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files that were produced by a trusted pipeline.
with open(directory / 'train_dicts/BBs_dict_reverse_1.p', 'br') as bb1_file:
    BBs_dict_reverse_1 = pickle.load(bb1_file)
with open(directory / 'train_dicts/BBs_dict_reverse_2.p', 'br') as bb2_file:
    BBs_dict_reverse_2 = pickle.load(bb2_file)
with open(directory / 'train_dicts/BBs_dict_reverse_3.p', 'br') as bb3_file:
    BBs_dict_reverse_3 = pickle.load(bb3_file)

In [19]:
# Preview the first ten rows of the training frame.
train_data.head(n=10)

### Counts and unique values

In [23]:
# Row count of the training frame, followed by the number of entries in each
# of the three building-block lookup dictionaries.
print(f"Length of train data: {len(train_data)}")
for position, lookup in enumerate(
    (BBs_dict_reverse_1, BBs_dict_reverse_2, BBs_dict_reverse_3), start=1
):
    print(f"Number of unique building blocks {position}: {len(lookup)}")

In [25]:
# Frequency of each value in the three binding-label columns.
for label_column in ('binds_BRD4', 'binds_HSA', 'binds_sEH'):
    print(f"Value counts of {label_column}: {train_data[label_column].value_counts()}")

In [28]:
# How often each distinct value occurs in the first building-block column.
train_bb1_counts = train_data['buildingblock1_smiles'].value_counts()
print(f"Value counts of buildingblock1_smiles: {train_bb1_counts}")

In [29]:
# How often each distinct value occurs in the second building-block column.
train_bb2_counts = train_data['buildingblock2_smiles'].value_counts()
print(f"Value counts of buildingblock2_smiles: {train_bb2_counts}")

In [30]:
# How often each distinct value occurs in the third building-block column.
train_bb3_counts = train_data['buildingblock3_smiles'].value_counts()
print(f"Value counts of buildingblock3_smiles: {train_bb3_counts}")

In [46]:
# Collect the values of each lookup dictionary into a set, one per position.
train_BB1, train_BB2, train_BB3 = (
    set(lookup.values())
    for lookup in (BBs_dict_reverse_1, BBs_dict_reverse_2, BBs_dict_reverse_3)
)

# Pairwise intersection sizes: how many values two positions share.
print(f"Overlap between building blocks 1 and 2: {len(train_BB1 & train_BB2)}")
print(f"Overlap between building blocks 1 and 3: {len(train_BB1 & train_BB3)}")
print(f"Overlap between building blocks 2 and 3: {len(train_BB2 & train_BB3)}")

# One-directional set differences: values in the first set that are absent
# from the second (not symmetric).
print(f"Difference between building blocks 1 and 2: {len(train_BB1 - train_BB2)}")
print(f"Difference between building blocks 1 and 3: {len(train_BB1 - train_BB3)}")
print(f"Difference between building blocks 2 and 3: {len(train_BB2 - train_BB3)}")

In [47]:
# Display the full contents of the first train building-block set.
# NOTE(review): this renders every element; consider showing only a sample if the set is large.
train_BB1

## What is in the test data?

In [31]:
# Read the test parquet with polars, then hand it to pandas using
# pyarrow-backed extension arrays for the columns.
test_data = (
    pl.read_parquet(directory / "test.parquet")
    .to_pandas(use_pyarrow_extension_array=True)
)

In [33]:
# Load the test-set building-block lookup dictionaries.
# Each pickle is opened inside a `with` block so its file handle is closed as
# soon as the load finishes, rather than being left open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files that were produced by a trusted pipeline.
with open(directory / 'test_dicts/BBs_dict_reverse_1_test.p', 'br') as bb1_file:
    test_BBS_dict_reverse_1 = pickle.load(bb1_file)
with open(directory / 'test_dicts/BBs_dict_reverse_2_test.p', 'br') as bb2_file:
    test_BBS_dict_reverse_2 = pickle.load(bb2_file)
with open(directory / 'test_dicts/BBs_dict_reverse_3_test.p', 'br') as bb3_file:
    test_BBS_dict_reverse_3 = pickle.load(bb3_file)

In [34]:
# Preview the first ten rows of the test frame.
test_data.head(n=10)

### Counts and unique values

In [35]:
# Row count of the test frame, followed by the number of entries in each
# of the three test building-block lookup dictionaries.
print(f"Length of test data: {len(test_data)}")
for position, lookup in enumerate(
    (test_BBS_dict_reverse_1, test_BBS_dict_reverse_2, test_BBS_dict_reverse_3), start=1
):
    print(f"Number of unique building blocks {position}: {len(lookup)}")

In [36]:
# Frequency of each value in the `is_BRD4` column of the test frame.
is_brd4_counts = test_data['is_BRD4'].value_counts()
print(f"Value counts of is_BRD4: {is_brd4_counts}")

In [37]:
# Frequency of each value in the `is_HSA` column of the test frame.
is_hsa_counts = test_data['is_HSA'].value_counts()
print(f"Value counts of is_HSA: {is_hsa_counts}")

In [38]:
# Frequency of each value in the `is_sEH` column of the test frame.
is_seh_counts = test_data['is_sEH'].value_counts()
print(f"Value counts of is_sEH: {is_seh_counts}")

In [39]:
# How often each distinct value occurs in the first building-block column of the test frame.
test_bb1_counts = test_data['buildingblock1_smiles'].value_counts()
print(f"Value counts of buildingblock1_smiles: {test_bb1_counts}")

In [40]:
# How often each distinct value occurs in the second building-block column of the test frame.
test_bb2_counts = test_data['buildingblock2_smiles'].value_counts()
print(f"Value counts of buildingblock2_smiles: {test_bb2_counts}")

In [41]:
# How often each distinct value occurs in the third building-block column of the test frame.
test_bb3_counts = test_data['buildingblock3_smiles'].value_counts()
print(f"Value counts of buildingblock3_smiles: {test_bb3_counts}")

In [52]:
# Collect the values of each test lookup dictionary into a set, one per position.
test_BB1, test_BB2, test_BB3 = (
    set(lookup.values())
    for lookup in (test_BBS_dict_reverse_1, test_BBS_dict_reverse_2, test_BBS_dict_reverse_3)
)

# Pairwise intersection sizes: how many values two positions share.
print(f"Overlap between building blocks 1 and 2: {len(test_BB1 & test_BB2)}")
print(f"Overlap between building blocks 1 and 3: {len(test_BB1 & test_BB3)}")
print(f"Overlap between building blocks 2 and 3: {len(test_BB2 & test_BB3)}")

## Overlap between train set and test set

In [53]:
# Check for any overlap between the training and test set
# Every (train position, test position) pair is compared; the outer loop walks
# the train sets and the inner loop the test sets, so lines print in the order
# 1-1, 1-2, 1-3, 2-1, ... 3-3.
train_sets_by_position = ((1, train_BB1), (2, train_BB2), (3, train_BB3))
test_sets_by_position = ((1, test_BB1), (2, test_BB2), (3, test_BB3))
for train_pos, train_set in train_sets_by_position:
    for test_pos, test_set in test_sets_by_position:
        shared = train_set.intersection(test_set)
        print(f"Overlap between train BB{train_pos} and test BB{test_pos}: {len(shared)}")

In [56]:
# Overlap in molecule_smiles
# Number of distinct `molecule_smiles` values present in both frames. The two
# sets are kept as temporaries so they can be freed once the count is taken.
len(set(train_data['molecule_smiles']) & set(test_data['molecule_smiles']))