In [2]:
import pandas as pd

# Parquet file to inspect
path = "../data/fold.parquet"

try:
    # Read the entire file into a DataFrame through the pyarrow engine
    df = pd.read_parquet(path, engine='pyarrow')

    # Section 1: every column name as a plain Python list
    print("--- 1. FULL COLUMN LIST ---")
    print(df.columns.to_list())

    # Section 2: rich (HTML) rendering of the leading rows
    print("\n--- 2. DATA PREVIEW (First 3 rows) ---")
    preview = df.head(3)
    display(preview)

    # Section 3: overall dimensions
    print("\n--- 3. DATA SHAPE ---")
    n_rows, n_cols = df.shape
    print(f"Total Rows: {n_rows}")
    print(f"Total Columns: {n_cols}")

except Exception as err:
    # Best-effort inspection: report the failure text instead of a traceback
    print(f"Error: {err}")

--- 1. FULL COLUMN LIST ---
['index', 'sharefold', 'nonsharefold']

--- 2. DATA PREVIEW (First 3 rows) ---


Unnamed: 0,index,sharefold,nonsharefold
0,0,2.0,8.0
1,1,2.0,8.0
2,2,0.0,5.0



--- 3. DATA SHAPE ---
Total Rows: 98415610
Total Columns: 3


In [3]:
import pandas as pd
import pyarrow.parquet as pq

path_train = "../data/train_enc.parquet"
SAMPLE_SIZE = 100000  # number of rows written to the sample file

print("Streaming a portion of the dataset to avoid Memory Error...")

# 1. Use PyArrow to open the file without loading it into RAM
parquet_file = pq.ParquetFile(path_train)

# 2. Select as many leading row groups as are needed to cover SAMPLE_SIZE rows.
#    Row-group sizes are fixed by whoever wrote the file, so group 0 alone is
#    not guaranteed to contain enough rows to sample from.
row_groups_to_read = []
rows_available = 0
for group_idx in range(parquet_file.num_row_groups):
    row_groups_to_read.append(group_idx)
    rows_available += parquet_file.metadata.row_group(group_idx).num_rows
    if rows_available >= SAMPLE_SIZE:
        break

if rows_available < SAMPLE_SIZE:
    # Fail before reading any data, with a message that names the real cause
    raise ValueError(
        f"{path_train} holds only {rows_available} rows; "
        f"cannot draw {SAMPLE_SIZE} samples."
    )

subset_table = parquet_file.read_row_groups(row_groups_to_read)
df_subset = subset_table.to_pandas()

# 3. Draw the sample from that subset.
#    NOTE: rows come only from the leading row group(s), so this is not a
#    uniform sample of the whole file.
df_sample = df_subset.sample(n=SAMPLE_SIZE, random_state=42)

# 4. Save the small version
df_sample.to_parquet("../data/train_sample_100k.parquet")

print("Success! 'train_sample_100k.parquet' is created.")
print("New file size is much smaller and ready for coding!")

Streaming a portion of the dataset to avoid Memory Error...
Success! 'train_sample_100k.parquet' is created.
New file size is much smaller and ready for coding!


In [4]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# 1. Read the down-sampled training file
df = pd.read_parquet("../data/train_sample_100k.parquet")

# 2. Column names: 142 encoded-input columns and three binary target columns
X_cols = ["enc" + str(position) for position in range(142)]
y_cols = ['bind1', 'bind2', 'bind3']

# Plain NumPy arrays for the model
X = df[X_cols].to_numpy()
y = df[y_cols].to_numpy()

# 3. Hold out 20% of the rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for split_name, features, labels in (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
):
    print(f"{split_name} shapes: X={features.shape}, y={labels.shape}")

Training shapes: X=(80000, 142), y=(80000, 3)
Validation shapes: X=(20000, 142), y=(20000, 3)


In [5]:
# Share of positive labels per target column, expressed in percent
positive_counts = y.sum(axis=0)
positive_rates = (positive_counts / len(y)) * 100
for protein_number, rate in enumerate(positive_rates, start=1):
    print(f"Protein {protein_number} Binding Rate: {rate:.4f}%")

Protein 1 Binding Rate: 0.1480%
Protein 2 Binding Rate: 0.1900%
Protein 3 Binding Rate: 0.1280%


In [6]:
import tensorflow as tf
from tensorflow.keras import layers, models

# 1. Define the model architecture
def build_cnn(input_shape, vocab_size=38, embedding_dim=64, num_targets=3):
    """Build and compile a small 1D-CNN multi-label classifier over token-id sequences.

    Args:
        input_shape: Number of integer tokens in each input sequence.
        vocab_size: Size of the embedding table; every token id fed to the
            model must lie in ``[0, vocab_size)``.
        embedding_dim: Width of the dense vector learned for each token id.
        num_targets: Number of independent sigmoid outputs (one per label).

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer,
        binary cross-entropy loss and an ``'AUC'`` metric.
    """
    model = models.Sequential([
        # Explicit input spec. This replaces the deprecated `input_length`
        # argument of Embedding and lets the model be built immediately.
        layers.Input(shape=(input_shape,), dtype="int32"),

        # The Embedding layer turns integers into dense vectors of fixed size
        layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),

        # 1D Conv layer to find local patterns across neighbouring tokens
        layers.Conv1D(filters=32, kernel_size=3, activation='relu'),
        layers.GlobalMaxPooling1D(),

        # Hidden layer on top of the pooled convolution features
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.1),  # Light regularisation against overfitting

        # Output layer: one sigmoid node per target, so labels are independent
        layers.Dense(num_targets, activation='sigmoid'),
    ])

    # NOTE(review): the string 'AUC' selects the metric with default settings,
    # which presumably pools all outputs into one ROC curve -- confirm this is
    # the intended evaluation for rare positives before relying on it.
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['AUC'])
    return model

# 2. Instantiate the network for sequences of 142 tokens
model = build_cnn(input_shape=142)

# 3. Short smoke-test run: three passes over the training split,
#    evaluated against the validation split after every epoch
print("Starting Training...")
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=3,
    validation_data=(X_val, y_val),
)



Starting Training...
Epoch 1/3
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - AUC: 0.5144 - loss: 0.0400 - val_AUC: 0.5599 - val_loss: 0.0126
Epoch 2/3
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - AUC: 0.5682 - loss: 0.0115 - val_AUC: 0.5560 - val_loss: 0.0123
Epoch 3/3
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - AUC: 0.6488 - loss: 0.0110 - val_AUC: 0.6405 - val_loss: 0.0114


In [None]:
# Make sure the output folder exists, then persist the trained model
import os

model_path = '../models/initial_cnn_v1.keras'

# exist_ok=True makes this a no-op when the folder is already there and
# avoids the check-then-create race of a separate os.path.exists() test
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Save the model
model.save(model_path)

# Report the path that was actually written, not a hard-coded approximation
print(f"Model saved to {model_path}")