# Data Preparation for TransPolymer Finetuning

This notebook handles data loading, validation, and splitting for finetuning the TransPolymer model.

**Workflow:**
1. Load input CSV file
2. Inspect data types, rows, and data points
3. Validate data quality
4. Split into train/validation/test sets
5. Save to data folder with clear names

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
from pathlib import Path
import sys

# Data directory, given relative to the current working directory.
DATA_FOLDER = Path("../data")
# exist_ok=True keeps re-runs from failing when the directory already exists.
DATA_FOLDER.mkdir(exist_ok=True)

# Show the absolute location so a wrong working directory is easy to spot.
print(
    "Libraries imported successfully!",
    f"Data folder: {DATA_FOLDER.absolute()}",
    sep="\n",
)

## Step 1: Load Input File

Set the path to your input CSV file below. The CSV must contain at least two columns, in this order (the cleaning step reads them by position, not by name):
- Column 1: Polymer SMILES strings
- Column 2: Target property values (numeric)

In [None]:
# Configuration: Modify these values for your dataset
INPUT_FILE = "../data/OPV.csv"  # Change this to your input file path
DATASET_NAME = "OPV"  # Name for output files (e.g., OPV, PE_I, etc.)

# Load the CSV file into `df`, which the following cells read.
# Only the read itself sits inside `try`, so the handler cannot mask an
# unrelated error raised while printing.
try:
    df = pd.read_csv(INPUT_FILE)
except FileNotFoundError:
    print(f"✗ Error: File not found at {INPUT_FILE}")
    print(f"  Available files in data folder:")
    # Sorted so the listing is stable between runs.
    for csv_path in sorted(DATA_FOLDER.glob("*.csv")):
        print(f"    - {csv_path.name}")
    # Re-raise so execution stops here. Otherwise `df` would be undefined, or
    # (in a live kernel) a stale `df` from an earlier run would be processed
    # by the later cells without any warning.
    raise
else:
    print(f"✓ Successfully loaded: {INPUT_FILE}")
    print(f"  Shape: {df.shape}")

## Step 2: Data Inspection & Validation

Examine the data types, number of rows, and data statistics.

In [None]:
# Print a read-only overview of the raw dataframe `df`; nothing is modified.
rule = "=" * 60
print(rule, "DATA OVERVIEW", rule, sep="\n")

row_count, column_count = df.shape
print(f"\n1. Dataset Shape: {row_count} rows × {column_count} columns")

# Whole-frame summaries: dtypes, a preview, descriptive stats, null counts.
print("\n2. Column Names and Data Types:")
print(df.dtypes)

print("\n3. First 5 rows:")
print(df.head())

print("\n4. Data Statistics:")
print(df.describe())

print("\n5. Missing Values:")
print(df.isnull().sum())

# Per-column breakdown: dtype, how many entries are present, and cardinality.
print("\n6. Column Information:")
for column_name, column_values in df.items():
    present = column_values.notna().sum()
    distinct = column_values.nunique()
    print(f"\n   Column: '{column_name}'")
    print(f"   - Data type: {column_values.dtype}")
    print(f"   - Non-null count: {present} / {row_count}")
    print(f"   - Unique values: {distinct}")

## Step 3: Data Cleaning & Validation

Check for missing values, remove invalid entries, and validate data quality.

In [None]:
# Clean a copy of the raw dataframe so `df` itself stays untouched:
# drop rows with missing values, drop duplicate SMILES, and make the target
# column numeric. Defines `df_clean`, `smiles_col` and `target_col` for the
# following cells.
df_clean = df.copy()

print("=" * 60)
print("DATA CLEANING")
print("=" * 60)

# Check for missing values
initial_rows = len(df_clean)
print(f"\n1. Initial number of rows: {initial_rows}")

missing_info = df_clean.isnull().sum()
if missing_info.sum() > 0:
    print(f"\n2. Removing rows with missing values:")
    print(missing_info[missing_info > 0])
    # NOTE: a row is dropped if ANY column is missing, not only SMILES/target.
    df_clean = df_clean.dropna()
    print(f"   Rows removed: {initial_rows - len(df_clean)}")
    print(f"   Remaining rows: {len(df_clean)}")
else:
    print(f"\n2. No missing values found ✓")

# Validate that we have at least 2 columns
if len(df_clean.columns) < 2:
    raise ValueError(f"Expected at least 2 columns (SMILES, target), but got {len(df_clean.columns)}")

# Assume first column is SMILES and second is target property
smiles_col = df_clean.columns[0]
target_col = df_clean.columns[1]

print(f"\n3. Column mapping:")
print(f"   SMILES column: '{smiles_col}'")
print(f"   Target property column: '{target_col}'")

# Check for duplicate SMILES
duplicates = df_clean[smiles_col].duplicated().sum()
print(f"\n4. Duplicate SMILES: {duplicates}")
if duplicates > 0:
    print(f"   Keeping first occurrence of duplicates...")
    df_clean = df_clean.drop_duplicates(subset=[smiles_col], keep='first')
    print(f"   Remaining rows: {len(df_clean)}")

# Validate target property is numeric.
# errors="coerce" turns unparseable entries into NaN instead of raising, so
# they can be counted and removed. Missing values were already dropped above,
# which means every NaN produced here is a value that failed to parse.
numeric_target = pd.to_numeric(df_clean[target_col], errors="coerce")
invalid_target = numeric_target.isna()
n_invalid = int(invalid_target.sum())
if n_invalid > 0:
    print(f"\n5. Warning: {n_invalid} rows have a non-numeric target value")
    print(f"   Examples: {df_clean.loc[invalid_target, target_col].head().tolist()}")

# .copy() makes the filtered frame independent before the column is replaced.
df_clean = df_clean.loc[~invalid_target].copy()
df_clean[target_col] = numeric_target.loc[~invalid_target]

if n_invalid > 0:
    print(f"   Rows removed: {n_invalid}")
    print(f"   Remaining rows: {len(df_clean)}")
else:
    print(f"\n5. Target property successfully converted to numeric ✓")

# Stop here rather than letting the split step fail on an empty frame.
if df_clean.empty:
    raise ValueError(
        "No rows left after cleaning; check the input file and the column order (SMILES, target)"
    )

# Check for extreme values
print(f"\n6. Target property range:")
print(f"   Min: {df_clean[target_col].min()}")
print(f"   Max: {df_clean[target_col].max()}")
print(f"   Mean: {df_clean[target_col].mean():.4f}")
print(f"   Std Dev: {df_clean[target_col].std():.4f}")

print(f"\n{'='*60}")
print(f"FINAL DATASET: {len(df_clean)} rows × {len(df_clean.columns)} columns")
print(f"{'='*60}")

## Step 4: Train/Validation/Test Split

Split the data for finetuning with validation set:
- **Training set**: 70% - used for training the model
- **Validation set**: 15% - used for hyperparameter tuning and early stopping
- **Test set**: 15% - used for final evaluation

The validation set is important for preventing overfitting during finetuning.

In [None]:
# Split configuration
RANDOM_STATE = 42  # For reproducibility
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# TRAIN_RATIO is only reported below; the actual split sizes are derived from
# TEST_RATIO and VAL_RATIO. Fail early if the three values are inconsistent,
# otherwise the printed ratios would not describe the real split.
if min(TRAIN_RATIO, VAL_RATIO, TEST_RATIO) <= 0:
    raise ValueError("TRAIN_RATIO, VAL_RATIO and TEST_RATIO must all be greater than 0")
ratio_total = TRAIN_RATIO + VAL_RATIO + TEST_RATIO
if abs(ratio_total - 1.0) > 1e-9:
    raise ValueError(
        f"TRAIN_RATIO + VAL_RATIO + TEST_RATIO must equal 1.0, but got {ratio_total}"
    )

print("=" * 60)
print("DATA SPLITTING")
print("=" * 60)
print(f"\nSplit Ratios:")
print(f"  Train: {TRAIN_RATIO*100:.0f}%")
print(f"  Validation: {VAL_RATIO*100:.0f}%")
print(f"  Test: {TEST_RATIO*100:.0f}%")
print(f"  Random state: {RANDOM_STATE}")

# First split: hold out the test set (TEST_RATIO of all rows)
train_val_df, test_df = train_test_split(
    df_clean,
    test_size=TEST_RATIO,
    random_state=RANDOM_STATE
)

# Second split: carve the validation set out of the remainder.
# The fraction is rescaled so validation ends up as VAL_RATIO of the full
# dataset (with the defaults: 0.15 / 0.85 ≈ 0.176).
train_df, val_df = train_test_split(
    train_val_df,
    test_size=VAL_RATIO / (1 - TEST_RATIO),
    random_state=RANDOM_STATE
)

print(f"\nSplit Results:")
print(f"  Training set: {len(train_df)} samples ({len(train_df)/len(df_clean)*100:.1f}%)")
print(f"  Validation set: {len(val_df)} samples ({len(val_df)/len(df_clean)*100:.1f}%)")
print(f"  Test set: {len(test_df)} samples ({len(test_df)/len(df_clean)*100:.1f}%)")
print(f"  Total: {len(train_df) + len(val_df) + len(test_df)} samples")

# Display statistics for each split; the same report is printed per split,
# so it is driven from one table instead of three copies.
print(f"\nTarget Property Statistics by Split:")
for split_label, split_df in (
    ("Training", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    split_target = split_df[target_col]
    print(f"\n  {split_label} Set ({target_col}):")
    print(f"    Mean: {split_target.mean():.4f}")
    print(f"    Std Dev: {split_target.std():.4f}")
    print(f"    Range: [{split_target.min():.4f}, {split_target.max():.4f}]")

## Step 5: Save Processed Data

Save the train, validation, and test sets with clear names to the data folder.

In [None]:
print("=" * 60)
print("SAVING PROCESSED DATA")
print("=" * 60)

# Define output file names with clear naming convention
# Format: {DATASET_NAME}_{split_type}.csv
# Example: OPV_train.csv, OPV_val.csv, OPV_test.csv

train_file = DATA_FOLDER / f"{DATASET_NAME}_train.csv"
val_file = DATA_FOLDER / f"{DATASET_NAME}_val.csv"
test_file = DATA_FOLDER / f"{DATASET_NAME}_test.csv"

# Save files. All three splits are written the same way (no index column),
# so they are driven from one table.
save_plan = (
    ("training", train_df, train_file),
    ("validation", val_df, val_file),
    ("test", test_df, test_file),
)
try:
    for split_label, split_df, out_path in save_plan:
        split_df.to_csv(out_path, index=False)
        print(f"\n✓ Saved {split_label} set: {out_path.name}")
        print(f"  Size: {len(split_df)} samples")
except OSError as e:
    print(f"\n✗ Error saving files: {e}")
    # Re-raise: continuing would print a summary for files that were not
    # written (or that are left over from an earlier run).
    raise
else:
    print(f"\n{'='*60}")
    print(f"All files saved successfully to: {DATA_FOLDER.absolute()}")
    print(f"{'='*60}")

# Display summary
print(f"\nFile Summary:")
print(f"  Train: {train_file.name} ({len(train_df)} rows)")
print(f"  Val:   {val_file.name} ({len(val_df)} rows)")
print(f"  Test:  {test_file.name} ({len(test_df)} rows)")
print(f"\nColumn structure (all files):")
print(f"  Column 1: {smiles_col} (Polymer SMILES)")
print(f"  Column 2: {target_col} (Target property)")

## Step 6: Verification

Verify the saved files by reloading and checking their contents.

In [None]:
# Read each saved split back from disk and print what was actually written.
divider = "=" * 60
print(divider)
print("VERIFICATION")
print(divider)

saved_splits = (
    ("Training", train_file),
    ("Validation", val_file),
    ("Test", test_file),
)
for split_title, saved_path in saved_splits:
    # Guard clause: report a missing file and move on to the next split.
    if not saved_path.exists():
        print(f"\n✗ File not found: {saved_path.name}")
        continue

    reloaded = pd.read_csv(saved_path)
    first_row = reloaded.iloc[0].to_dict()
    print(f"\n✓ {split_title} Set - {saved_path.name}")
    print(f"  Rows: {len(reloaded)}")
    print(f"  Columns: {list(reloaded.columns)}")
    print(f"  Data types:\n{reloaded.dtypes}")
    print(f"  Sample row:")
    print(f"    {first_row}")

print(f"\n{'='*60}")
print("Data preparation complete! ✓")
print(f"{'='*60}")

## Step 7: Summary & Notes

### About the Splits:
- **Training Set (70%)**: Used to train the finetuned model
- **Validation Set (15%)**: Used for:
  - Early stopping during training
  - Hyperparameter tuning
  - Model selection
  - Monitoring overfitting
- **Test Set (15%)**: Used for final evaluation (held out, not touched during training)

### Why a Validation Set Is Important for Finetuning:
1. **Prevents overfitting**: Monitor performance on unseen data during training
2. **Early stopping**: Stop training when validation loss stops improving
3. **Hyperparameter tuning**: Test different learning rates, batch sizes, etc.
4. **Final evaluation**: Test set remains completely untouched

### Output Files Generated:
- `{DATASET_NAME}_train.csv` - Training data
- `{DATASET_NAME}_val.csv` - Validation data
- `{DATASET_NAME}_test.csv` - Test data

All three files share the column layout of the cleaned input (SMILES first, target property second, followed by any additional columns from the source CSV) and are ready for finetuning!