# Data Preprocessing 

Data preprocessing is a crucial step in any machine learning workflow. It involves cleaning, transforming, and preparing the raw data to make it suitable for model training. In this notebook, we will focus on the following preprocessing steps:

1.  **Loading the data**: We will load the raw data from CSV files using pandas.
2.  **Splitting the data**: We will split the data into training and validation sets to evaluate the model's performance.
3.  **Feature Engineering**: We will create new features from the existing ones to improve the model's accuracy. This will involve domain-driven feature engineering, where we use our knowledge of the problem to create meaningful features.

### 1. Load & Split the Dataset

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# Folder that holds the raw CSV files
data_path = Path('../raw_data')

# Read the raw training CSV; train_data stays None when the file is missing
train_data = None
try:
    train_data = pd.read_csv(data_path / 'train.csv')
except FileNotFoundError:
    print("File not found, recheck the directory or change the path of directory.")

if train_data is not None:
    # Inputs: every column whose name contains 'Component'.
    # Targets: every column whose name contains 'Blend'.
    feature_cols = [name for name in train_data.columns if 'Component' in name]
    target_cols = [name for name in train_data.columns if 'Blend' in name]

    # Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
    X_train, X_val, y_train, y_val = train_test_split(
        train_data[feature_cols],
        train_data[target_cols],
        test_size=0.2,
        shuffle=True,
        random_state=42,
    )
    print(f"Training data shape: {X_train.shape}")
    print(f"Validation data shape: {X_val.shape}")

Training data shape: (1600, 55)
Validation data shape: (400, 55)


### 2. Domain-Driven Feature Engineering

#### 2.1 Weighted Component Properties

In [2]:
def create_weighted_properties(
    df: pd.DataFrame, n_properties: int = 10, n_components: int = 5
) -> pd.DataFrame:
    """
    Create fraction-weighted properties from component fractions and properties.

    For every property p, the weighted value of a row is the sum over components c of
    ``Component{c}_fraction * Component{c}_Property{p}``.

    Parameters:
        df (pd.DataFrame): DataFrame containing the columns ``Component{c}_fraction`` and
            ``Component{c}_Property{p}`` for c in 1..n_components and p in 1..n_properties.
        n_properties (int): Number of properties per component (default 10).
        n_components (int): Number of components (default 5).
    Returns:
        pd.DataFrame: New DataFrame indexed like ``df`` with one ``Weighted_Property{p}``
            column per property. The input frame is not modified.
    Raises:
        KeyError: If any required fraction or property column is missing from ``df``.
    """
    components = range(1, n_components + 1)
    properties = range(1, n_properties + 1)

    # Fail early with the full list of missing columns instead of on the first lookup
    required = [f'Component{comp}_fraction' for comp in components]
    required += [
        f'Component{comp}_Property{prop}' for prop in properties for comp in components
    ]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    weighted_features = pd.DataFrame(index=df.index)
    for prop in properties:
        # sum() starts from 0 and adds the per-component products in component order
        weighted_features[f'Weighted_Property{prop}'] = sum(
            df[f'Component{comp}_fraction'] * df[f'Component{comp}_Property{prop}']
            for comp in components
        )
    return weighted_features

# Create weighted properties for training and validation sets.
# Weighted columns left over from an earlier run of this cell are dropped before the
# concat, so re-running the cell does not append duplicate columns.
weighted_train = create_weighted_properties(X_train)
weighted_val = create_weighted_properties(X_val)
X_train = pd.concat(
    [X_train.drop(columns=weighted_train.columns, errors='ignore'), weighted_train],
    axis=1,
)
X_val = pd.concat(
    [X_val.drop(columns=weighted_val.columns, errors='ignore'), weighted_val],
    axis=1,
)

#### 2.2 Aggregates, Interactions, Dominance

In [3]:
def add_domain_features(
    df: pd.DataFrame, n_properties: int = 10, n_components: int = 5
) -> pd.DataFrame:
    """
    Build aggregate, interaction, ratio and dominance features from component columns.

    Features produced, in column order:
        1. Per property p: ``Property{p}_mean``, ``_min``, ``_max``, ``_std``, ``_sum``
           and ``_range`` (max - min), computed across the components of each row.
        2. ``Frac{i}x{j}``: product of the fractions of components i and j, for i < j.
        3. ``Frac{i}_over_{j}``: fraction i divided by (fraction j + 1e-6), for i != j.
        4. ``Dominant_Component``: number of the component with the largest fraction.
        5. ``Num_Components_gt_0.2``: count of components whose fraction exceeds 0.2.

    Parameters:
        df (pd.DataFrame): DataFrame containing the columns ``Component{c}_fraction`` and
            ``Component{c}_Property{p}`` for c in 1..n_components and p in 1..n_properties.
        n_properties (int): Number of properties per component (default 10).
        n_components (int): Number of components (default 5).
    Returns:
        pd.DataFrame: New DataFrame indexed like ``df`` holding only the new features.
            The input frame is not modified.
    Raises:
        KeyError: If a required fraction or property column is missing from ``df``.
    """
    eps = 1e-6        # added to ratio denominators to avoid division by zero
    threshold = 0.2   # cut-off used by the 'Num_Components_gt_0.2' count

    components = range(1, n_components + 1)
    frac_cols = [f'Component{comp}_fraction' for comp in components]

    # Collect every feature first and build the frame once at the end
    feats = {}

    # 1. Property aggregates (mean, min, max, std, sum, range) across components
    for prop in range(1, n_properties + 1):
        props = df[[f'Component{comp}_Property{prop}' for comp in components]]
        prop_min = props.min(axis=1)
        prop_max = props.max(axis=1)
        feats[f'Property{prop}_mean'] = props.mean(axis=1)
        feats[f'Property{prop}_min'] = prop_min
        feats[f'Property{prop}_max'] = prop_max
        feats[f'Property{prop}_std'] = props.std(axis=1)
        feats[f'Property{prop}_sum'] = props.sum(axis=1)
        feats[f'Property{prop}_range'] = prop_max - prop_min

    # 2. Fraction interactions (pairwise products, each unordered pair once)
    for i in components:
        for j in range(i + 1, n_components + 1):
            feats[f'Frac{i}x{j}'] = df[f'Component{i}_fraction'] * df[f'Component{j}_fraction']

    # 3. Fraction ratios for every ordered pair
    for i in components:
        for j in components:
            if i != j:
                feats[f'Frac{i}_over_{j}'] = (
                    df[f'Component{i}_fraction'] / (df[f'Component{j}_fraction'] + eps)
                )

    # 4. Component dominance: map the winning fraction column to its component number
    component_number = dict(zip(frac_cols, components))
    feats['Dominant_Component'] = (
        df[frac_cols].idxmax(axis=1).map(component_number).astype(int)
    )

    # 5. Count of components with fraction > 0.2
    feats['Num_Components_gt_0.2'] = (df[frac_cols] > threshold).sum(axis=1)

    return pd.DataFrame(feats, index=df.index)

# Add new features to X_train and X_val.
# Domain-feature columns left over from an earlier run of this cell are dropped before
# the concat, so re-running the cell does not append duplicate columns.
domain_train = add_domain_features(X_train)
domain_val = add_domain_features(X_val)
X_train = pd.concat(
    [X_train.drop(columns=domain_train.columns, errors='ignore'), domain_train],
    axis=1,
)
X_val = pd.concat(
    [X_val.drop(columns=domain_val.columns, errors='ignore'), domain_val],
    axis=1,
)

# Both splits must expose the same features in the same order
assert list(X_train.columns) == list(X_val.columns), "train/val feature columns differ"

# Final data shapes
X_train.shape, X_val.shape

((1600, 157), (400, 157))

### 3. Save the Processed Data

In [4]:
# Directory for the processed datasets.
# NOTE: this rebinds data_path (previously the raw-data directory); any later cell that
# uses data_path will resolve paths under ../processed_data.
data_path = Path('../processed_data')

# Create the output directory if it does not exist yet, so to_csv does not fail
data_path.mkdir(parents=True, exist_ok=True)

# Save the processed datasets (index is dropped; rows keep their split order)
X_train.to_csv(data_path / 'X_train.csv', index=False)
X_val.to_csv(data_path / 'X_val.csv', index=False)
y_train.to_csv(data_path / 'y_train.csv', index=False)
y_val.to_csv(data_path / 'y_val.csv', index=False)

print("Processed datasets saved successfully.")

Processed datasets saved successfully.


### 4. Preprocess the Test Dataset

In [None]:
# The raw test file is read from the raw-data directory and the result is written to
# the processed-data directory. Both paths are spelled out here because data_path was
# rebound to '../processed_data' when the training splits were saved, so
# data_path / "test.csv" would look for the raw file in the wrong folder.
raw_data_path = Path('../raw_data')
processed_data_path = Path('../processed_data')

# Load the raw test dataset; test_data stays None when the file is missing
test_data = None
try:
    test_data = pd.read_csv(raw_data_path / "test.csv")
except FileNotFoundError:
    print("File not found, recheck the directory or change the path of directory.")

if test_data is not None:
    # Feature columns for test data
    test_feature_cols = [col for col in test_data.columns if 'Component' in col]
    test_features = test_data[test_feature_cols]

    # Append weighted properties and domain features, both computed from the raw
    # component columns. Columns outside test_feature_cols are kept in the saved file.
    test_data = pd.concat(
        [
            test_data,
            create_weighted_properties(test_features),
            add_domain_features(test_features),
        ],
        axis=1,
    )

    # Save the processed test dataset
    processed_data_path.mkdir(parents=True, exist_ok=True)
    test_data.to_csv(processed_data_path / 'X_test.csv', index=False)
    print("Processed test dataset saved successfully.")

Processed test dataset saved successfully.
