In [1]:
from google.colab import drive

# Directory under which Google Drive is mounted inside the Colab VM; the CSV
# paths read later in this notebook are located below it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [2]:
pip install tensorflow




In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# === Load data ===
# The three splits sit in the same folder and receive identical treatment, so
# the folder, the label column and the accepted classes are named once here.
DATA_DIR = '/content/drive/MyDrive/heckaton_bio_ai'
LABEL_COLUMN = 'CURVE_CLASS2'
KEPT_CLASSES = [1, 2, 3, 4]


def load_split(split_name):
    """Read one CSV split and keep only the rows labelled with classes 1-4.

    Args:
        split_name: File stem of the split inside ``DATA_DIR`` (``'train'``,
            ``'val'`` or ``'test'``).

    Returns:
        An independent copy of the filtered DataFrame.

    Raises:
        ValueError: If no row survives the class filter. Failing here gives a
            clear message instead of an IndexError or shape error further down.
    """
    frame = pd.read_csv(f'{DATA_DIR}/{split_name}.csv')
    kept = frame[frame[LABEL_COLUMN].isin(KEPT_CLASSES)].copy()
    if kept.empty:
        raise ValueError(
            f"Split '{split_name}' has no rows with {LABEL_COLUMN} in {KEPT_CLASSES}"
        )
    return kept


# === Keep classes 1-4 only (done inside load_split)
train = load_split('train')
val = load_split('val')
test = load_split('test')

# === Helper turning a fingerprint into a list of bits
def parse_fingerprint(fp):
    """Return the 0/1 digits found in ``fp`` as a list of ints.

    ``fp`` is converted with ``str()`` first, so non-string values are accepted.
    Every character other than '0' or '1' is skipped, which means a value
    containing no binary digits (for example NaN or None) yields an empty list.
    """
    bits = []
    for ch in str(fp).strip():
        if ch == '0':
            bits.append(0)
        elif ch == '1':
            bits.append(1)
    return bits

# === Convert the fingerprint column of every split into lists of bits
X_train = train['PubChemFingerprint'].apply(parse_fingerprint).tolist()
X_val = val['PubChemFingerprint'].apply(parse_fingerprint).tolist()
X_test = test['PubChemFingerprint'].apply(parse_fingerprint).tolist()

# === Check the fingerprint length
# The first training fingerprint defines the expected length. Every split is
# checked against it, not only the training set: a single malformed row in val
# or test would otherwise surface later as an obscure np.array/reshape error.
assert len(X_train) > 0, "Zbiór treningowy jest pusty"
fp_length = len(X_train[0])
for split_name, split_bits in (('train', X_train), ('val', X_val), ('test', X_test)):
    assert all(len(x) == fp_length for x in split_bits), \
        f"Niespójna długość fingerprintów ({split_name})"

# === Convert to NumPy with shape (samples, fp_length, 1)
# float32 instead of NumPy's default integer dtype keeps the arrays smaller;
# the model is expected to cast its input to float anyway.
X_train = np.array(X_train, dtype=np.float32).reshape(-1, fp_length, 1)
X_val = np.array(X_val, dtype=np.float32).reshape(-1, fp_length, 1)
X_test = np.array(X_test, dtype=np.float32).reshape(-1, fp_length, 1)

# === Encode labels
# The encoder is fitted on the fixed label set 1-4 rather than on the data, so
# the label -> index mapping does not depend on which classes a split contains.
encoder = LabelEncoder().fit([1, 2, 3, 4])

# Integer-encoded targets for each split
y_train, y_val, y_test = (
    encoder.transform(frame['CURVE_CLASS2']) for frame in (train, val, test)
)

# One-hot targets for the categorical cross-entropy loss
y_train_cat, y_val_cat, y_test_cat = (
    to_categorical(encoded, num_classes=4) for encoded in (y_train, y_val, y_test)
)

# === Class weights - boost class 3 (which is encoded as index 2)
class_weights = dict.fromkeys(range(4), 1.0)
class_weights[2] = 10.0

# === Build the CNN model
# Seed the random number generators before any layer is created so that weight
# initialisation, dropout masks and batch shuffling start from the same state
# on every run. NOTE(review): some GPU kernels may still be non-deterministic.
from tensorflow.keras.utils import set_random_seed

SEED = 42
set_random_seed(SEED)

# A single 1-D convolution over the bit sequence, followed by a dense head with
# a 4-way softmax (one output per encoded class).
model = Sequential([
    Input(shape=(fp_length, 1)),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(4, activation='softmax'),
])

# categorical_crossentropy matches the one-hot encoded targets passed to fit()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# === Training
# Stop once val_loss has not improved for 5 consecutive epochs and roll the
# model back to the weights of the best epoch.
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

fit_options = {
    'epochs': 20,
    'batch_size': 16,
    'validation_data': (X_val, y_val_cat),
    'class_weight': class_weights,
    'callbacks': [es],
    'verbose': 1,
}
model.fit(X_train, y_train_cat, **fit_options)

# === Evaluation
# Predicted class index = position of the highest softmax probability
y_pred_prob = model.predict(X_test)
y_pred = y_pred_prob.argmax(axis=1)

print("\n=== Ewaluacja na test.csv ===")
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Report only the encoded classes that occur in the ground truth or in the
# predictions, and show them under their original (decoded) label values.
present_labels = np.union1d(y_test, y_pred)
present_label_names = encoder.inverse_transform(present_labels).astype(str)

report_text = classification_report(
    y_test,
    y_pred,
    labels=present_labels,
    target_names=present_label_names,
    zero_division=0,
)
print("\nClassification Report:\n", report_text)

conf_matrix = confusion_matrix(y_test, y_pred, labels=present_labels)
print("\nConfusion Matrix:\n", conf_matrix)


Epoch 1/20
[1m3892/3892[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 75ms/step - accuracy: 0.9899 - loss: 0.3256 - val_accuracy: 0.9963 - val_loss: 0.0966
Epoch 2/20
[1m 314/3892[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m4:25[0m 74ms/step - accuracy: 0.9975 - loss: 0.1309