In [1]:
import polars as pl
from polars import col

In [2]:
# Location of the input Parquet file, relative to the notebook's working directory.
RAW_PATH = "libs/datasets/chembl_selected_ds.parquet"

In [3]:
# Lazily scan the Parquet file: this only builds a query plan (LazyFrame);
# no rows are read until a later cell calls .collect().
df = pl.scan_parquet(source=RAW_PATH)

In [4]:
# Preview the first five rows of the lazy frame.
# LazyFrame.fetch() is deprecated in Polars >= 1.0; head(n).collect() is the
# supported way to materialise a small sample.
df.head(5).collect()

  df.fetch(5)


activity_id,molregno,canonical_smiles,mw_freebase,alogp,hba,hbd,psa,rtb,aromatic_rings,qed_weighted,standard_value,standard_units,standard_type,pchembl_value,target_chembl_id,target_name
u32,u32,str,f64,f32,u8,u8,f32,u8,u8,f32,f32,str,str,f32,str,str
579288,49931,"""CN(Cc1cnc2nc(N)nc(N)c2n1)c1ccc…",454.45,0.27,10,5,210.539993,9,3,0.29,8.8,"""nM""","""IC50""",8.06,"""CHEMBL614508""","""143B"""
865583,3133,"""O=C1OC(=O)C2C3CCC(O3)C12""",168.15,-0.14,4,0,52.599998,0,0,0.37,43000.0,"""nM""","""IC50""",4.37,"""CHEMBL614508""","""143B"""
881843,69448,"""C[C@@]12C(=O)OC(=O)[C@]1(C)[C@…",196.2,0.64,4,0,52.599998,0,0,0.42,10000.0,"""nM""","""IC50""",5.0,"""CHEMBL614508""","""143B"""
602041,144846,"""CCCC(=O)Oc1ccc(C[C@@H](C)[C@@H…",582.73,7.57,8,0,105.199997,17,2,0.14,300.0,"""nM""","""AC50""",6.52,"""CHEMBL2903""","""Polyunsaturated fatty acid lip…"
602045,144716,"""CC(C)CC(=O)Oc1ccc(C[C@@H](C)[C…",638.84,8.55,8,0,105.199997,17,2,0.13,200.0,"""nM""","""AC50""",6.7,"""CHEMBL2903""","""Polyunsaturated fatty acid lip…"


In [None]:
# Filter the data: keep only IC50 measurements reported in nM that carry a
# usable numeric value.
#
# Downstream, pIC50 is derived as 9 - log10(IC50 [nM]), so the value must be
# finite and strictly positive; zero or negative entries would otherwise turn
# into +inf / NaN pIC50 values after aggregation.
df_filtered = df.filter(
    (col("standard_type") == "IC50")
    & (col("standard_units") == "nM")
    & col("standard_value").is_not_null()
    & col("standard_value").is_finite()
    & (col("standard_value") > 0)
)

In [8]:
# Narrow the lazy frame to the molecule identifier and the two activity columns.
df_limited = df_filtered.select("molregno", "standard_value", "pchembl_value")

In [9]:
# Compute pIC50 from IC50 (nM) using the definition:
# pIC50 = -log10(IC50 [M]) = -log10(IC50 [nM] / 10^9) = 9 - log10(IC50 [nM])
#
# Each molecule (molregno) is collapsed to one row: the median IC50 over its
# measurements, the number of measurements, and the pIC50 of that median.
# NOTE(review): grouping is by molregno only, so measurements taken against
# different targets are pooled together — confirm this is intended.
df_aggregated = (
    df_limited
    .group_by("molregno")
    .agg(
        col("standard_value").median().alias("IC50_nM_median"),
        col("molregno").count().alias("n_measurements"),
    )
    .with_columns(
        (9 - col("IC50_nM_median").log10()).alias("pIC50"),
    )
)

In [11]:
# Execute the lazy query plan and materialise the per-molecule table.
df_fin = df_aggregated.collect()

# Check the result: show a few rows and the number of rows (one per molecule).
print("\nFinal Activity Data:")
print(df_fin.head())
print(f"Liczba unikalnych molekuł (rekordów): {df_fin.height}")


Final Activity Data:
shape: (5, 4)
┌──────────┬────────────────┬────────────────┬──────────┐
│ molregno ┆ IC50_nM_median ┆ n_measurements ┆ pIC50    │
│ ---      ┆ ---            ┆ ---            ┆ ---      │
│ i64      ┆ f64            ┆ u32            ┆ f64      │
╞══════════╪════════════════╪════════════════╪══════════╡
│ 581744   ┆ 200.0          ┆ 1              ┆ 6.69897  │
│ 166740   ┆ 25000.0        ┆ 2              ┆ 4.60206  │
│ 133448   ┆ 8.0            ┆ 1              ┆ 8.09691  │
│ 408320   ┆ 13.513         ┆ 4              ┆ 7.869248 │
│ 423409   ┆ 1510.0         ┆ 4              ┆ 5.821023 │
└──────────┴────────────────┴────────────────┴──────────┘
Liczba unikalnych molekuł (rekordów): 198710
