In [None]:
import numpy as np
import polars as pl
from polars import col
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from libs.misc import *
from libs.misc import load_and_clean_data

In [None]:
# Parquet dataset file that the cells below scan and load.
RAW_PATH = "libs/datasets/chembl_selected_ds.parquet"

In [None]:
# Open the Parquet file lazily: this only builds a query plan, and rows are
# read when a later cell materialises it (e.g. with `.collect()`).
df = pl.scan_parquet(RAW_PATH)

In [None]:
# Preview the first five rows of the lazy frame.
# `LazyFrame.fetch` is deprecated in recent Polars releases; limiting the
# query with `head(n)` and then collecting is the documented replacement.
df.head(5).collect()

In [None]:
# Keep only the first 100,000 rows for exploration.
# The limit is applied to the lazy query *before* collecting, so Polars can
# stop reading early instead of loading the entire file into memory and
# slicing afterwards. The resulting rows are the same as collect().slice(0, 100000).
df_limited = df.head(100_000).collect()

In [None]:
# Summary statistics of the capped sample (displayed as the cell output).
df_limited.describe()

In [None]:
# Run the unit-imputation helper on the value/unit column pair.
# NOTE(review): `impute_units` is not defined in this notebook — presumably it
# comes from `from libs.misc import *`; confirm its exact behaviour there.
df_unit = impute_units(
    df_limited,
    value_col="standard_value",
    units_col="standard_units",
)

In [None]:
# Apply the `compute_pIC50` helper (defined outside this notebook; presumably
# it adds the `pIC50` column used by the following cells — verify) and preview
# the first rows of the result.
df_calc = compute_pIC50(df_unit)
df_calc.head()

In [None]:
# Round pIC50 to two decimal places.
# Step 1: `.round(2)` removes floating-point precision noise.
# Step 2: casting to a Decimal with scale 2 pins the column to two decimals.
df_calc = df_calc.with_columns(
    col("pIC50").round(2).cast(pl.Decimal(precision=None, scale=2))
)

In [None]:
# Preview the first rows of `df_calc` again to inspect the `pIC50` column.
df_calc.head()

In [None]:
# Compare the reference `pchembl_value` column with the computed `pIC50`,
# both as 2-decimal Decimals. Evaluates to False because the values do not
# round the same way.
(
    df_calc["pchembl_value"]
    .cast(pl.Decimal(scale=2))
    .equals(df_calc["pIC50"].cast(pl.Decimal(scale=2)))
)

In [None]:
# Experiment configuration.
RANDOM_SEED = 42      # seed used for NumPy and PyTorch below, and for data loading
N_VALUE = 100_000     # passed to load_and_clean_data in the data-preparation cell
EPOCH_NUM = 50        # number of epochs for each training loop

# Seed the global NumPy and PyTorch random number generators.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
# --- DATA PREPARATION ---
RAW_PATH = 'libs/datasets/chembl_selected_ds.parquet'  # path to the dataset file
# NOTE: this rebinds `df`, which earlier cells use for the lazy Parquet scan.
# Re-run the scan cell before re-running any of those earlier cells.
df = load_and_clean_data(RAW_PATH, N_VALUE, RANDOM_SEED)

smiles = df["canonical_smiles"].to_list()
targets = df["pIC50"].to_numpy()

# Random split for simplicity (a scaffold split is usually preferable in
# chemistry). The split is seeded with RANDOM_SEED so it follows the
# notebook-wide seed instead of a second hard-coded literal.
X_smiles_train, X_smiles_test, y_train, y_test = train_test_split(
    smiles, targets, test_size=0.2, random_state=RANDOM_SEED
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Używane urządzenie: {device}")

# === PATH 1: MLP TRAINING (on fingerprints) ===
print("\n=== Rozpoczynanie ścieżki MLP ===")

# Hyperparameters of the MLP path.
MLP_BATCH_SIZE = 64
MLP_LEARNING_RATE = 0.001

# Feature generation. `generate_fingerprints` also returns an index array that
# is used to select the matching targets (presumably the positions of the
# SMILES it could featurise — verify in libs.misc).
X_fp_train, idx_train = generate_fingerprints(X_smiles_train)
y_train_mlp = y_train[idx_train]

X_fp_test, idx_test = generate_fingerprints(X_smiles_test)
y_test_mlp = y_test[idx_test]

# Datasets and DataLoaders (only the training loader is shuffled).
train_dataset_mlp = MoleculeDatasetMLP(X_fp_train, y_train_mlp)
test_dataset_mlp = MoleculeDatasetMLP(X_fp_test, y_test_mlp)
train_loader_mlp = DataLoader(train_dataset_mlp, batch_size=MLP_BATCH_SIZE, shuffle=True)
test_loader_mlp = DataLoader(test_dataset_mlp, batch_size=MLP_BATCH_SIZE)

# Model, optimiser and loss.
mlp_model = BioActivityMLP().to(device)
optimizer_mlp = torch.optim.Adam(mlp_model.parameters(), lr=MLP_LEARNING_RATE)
criterion = nn.MSELoss()

# MLP training loop.
print("Trenowanie MLP...")
mlp_model.train()
for epoch in range(EPOCH_NUM):  # increase EPOCH_NUM for better results
    total_loss = 0
    for X_batch, y_batch in train_loader_mlp:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer_mlp.zero_grad()
        output = mlp_model(X_batch)
        # Align the target's shape with the prediction's. A (B,) target
        # against a (B, 1) output would otherwise be silently broadcast to
        # (B, B) by MSELoss. This is a no-op when the shapes already match.
        loss = criterion(output, y_batch.reshape(output.shape))
        loss.backward()
        optimizer_mlp.step()
        total_loss += loss.item()
    # Mean batch loss over the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader_mlp):.4f}")

In [None]:
# === PATH 2: GNN TRAINING (on graphs) ===
print("\n=== Rozpoczynanie ścieżki GNN ===")

# Hyperparameters of the GNN path.
# NOTE(review): NODE_FEATURES_DIM presumably has to match the per-node feature
# width produced by `smile_to_graph` — confirm in libs.misc.
NODE_FEATURES_DIM = 13
GNN_BATCH_SIZE = 64
GNN_LEARNING_RATE = 0.001


def _smiles_to_graphs(smiles_list, target_values):
    """Convert paired SMILES strings and targets into a list of graph objects.

    Each (SMILES, target) pair is passed to `smile_to_graph`; falsy results
    (presumably `None` for molecules that could not be converted) are dropped.

    Args:
        smiles_list: iterable of SMILES strings.
        target_values: iterable of targets, aligned with `smiles_list`.

    Returns:
        List of the truthy objects returned by `smile_to_graph`, in input order.
    """
    candidates = (smile_to_graph(s, t) for s, t in zip(smiles_list, target_values))
    return [graph for graph in candidates if graph]


# Conversion to graph Data objects.
train_graphs = _smiles_to_graphs(X_smiles_train, y_train)
test_graphs = _smiles_to_graphs(X_smiles_test, y_test)

# GeoDataLoader handles batching of graphs of different sizes.
train_loader_gnn = GeoDataLoader(train_graphs, batch_size=GNN_BATCH_SIZE, shuffle=True)
test_loader_gnn = GeoDataLoader(test_graphs, batch_size=GNN_BATCH_SIZE)

# Model and optimiser.
gnn_model = BioActivityGNN(node_features_dim=NODE_FEATURES_DIM).to(device)
optimizer_gnn = torch.optim.Adam(gnn_model.parameters(), lr=GNN_LEARNING_RATE)

# GNN training loop. `criterion` is the MSE loss created in the MLP cell.
print("Trenowanie GNN...")
gnn_model.train()
for epoch in range(EPOCH_NUM):
    total_loss = 0
    for batch in train_loader_gnn:
        batch = batch.to(device)
        optimizer_gnn.zero_grad()
        output = gnn_model(batch)
        # Targets are reshaped to a column so they line up with the prediction.
        loss = criterion(output, batch.y.view(-1, 1))
        loss.backward()
        optimizer_gnn.step()
        total_loss += loss.item()
    # Mean batch loss over the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader_gnn):.4f}")

# === EVALUATION (example for the GNN) ===
# Switch to inference mode and collect predictions and targets batch by batch
# over the test loader, without tracking gradients.
gnn_model.eval()
preds = []
actuals = []
with torch.no_grad():
    for batch in test_loader_gnn:
        batch = batch.to(device)
        out = gnn_model(batch)
        preds.extend(out.cpu().numpy().flatten())
        actuals.extend(batch.y.cpu().numpy().flatten())

# Regression metrics on the test set (sklearn.metrics is imported in the
# notebook's top import cell).
rmse = np.sqrt(mean_squared_error(actuals, preds))
r2 = r2_score(actuals, preds)

print("\nWyniki GNN na zbiorze testowym:")
print(f"RMSE: {rmse:.4f}")
print(f"R2 Score: {r2:.4f}")