<a href="https://colab.research.google.com/github/VaibhavSingh1311/Syngas-Prediction-from-Biomass-FeedStock/blob/main/DataSet_Augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Basic data processing and ML libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Install required packages for synthetic data
!pip install sdv
!pip install table-evaluator

from sdv.single_table import CTGANSynthesizer, GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality, run_diagnostic

from table_evaluator import TableEvaluator

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

In [None]:
# Load and Prepare Your Data

original_data = pd.read_csv('/content/drive/MyDrive/ML_Lab_Datasets/Training dataset.csv', encoding='utf-8-sig')

print(f"\nOriginal data shape: {original_data.shape}")


# Data Preprocessing

print("\n" + "="*60)
print("DATA PREPROCESSING")
print("="*60)

# Work on a copy so the raw frame stays untouched and this cell can be re-run safely
data_clean = original_data.copy()

# Handle missing values: median for numeric columns, most frequent value otherwise
for col in data_clean.columns:
    n_missing = data_clean[col].isnull().sum()
    if n_missing == 0:
        continue

    print(f"Column '{col}' has {n_missing} missing values")

    if pd.api.types.is_numeric_dtype(data_clean[col]):
        # Covers every numeric dtype (float32, nullable Int64, ...), not only float64/int64
        fill_value = data_clean[col].median()
    else:
        modes = data_clean[col].mode()
        # mode() is empty when the whole column is missing
        fill_value = modes[0] if len(modes) > 0 else 'Unknown'

    # Assign the result back: `data_clean[col].fillna(..., inplace=True)` is a chained
    # assignment that is deprecated and does not modify the frame under Copy-on-Write.
    data_clean[col] = data_clean[col].fillna(fill_value)

print(f"\nData after cleaning: {data_clean.shape}")
print("Missing values after cleaning:")
print(data_clean.isnull().sum())

# Split the columns by dtype: numeric vs. everything else
numerical_cols = list(data_clean.select_dtypes(include=[np.number]).columns)
categorical_cols = list(data_clean.select_dtypes(exclude=[np.number]).columns)

print(f"\nNumerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")

# Count, per numeric column, the rows lying more than three standard deviations from the mean
print("\nChecking for outliers (beyond 3 std dev):")
outlier_info = {}
for name in numerical_cols:
    series = data_clean[name]
    center, spread = series.mean(), series.std()
    lower, upper = center - 3*spread, center + 3*spread
    n_outliers = int(((series < lower) | (series > upper)).sum())
    outlier_info[name] = n_outliers
    if n_outliers > 0:
        print(f"  {name}: {n_outliers} outliers")

# Normalize gas composition if present, so H2 + CO + CO2 + CH4 = 100 % per row
gas_cols = ['H2', 'CO', 'CO2', 'CH4']
if all(col in data_clean.columns for col in gas_cols):
    # Keep the row totals in a local Series instead of a temporary frame column
    gas_sum = data_clean[gas_cols].sum(axis=1)
    print(f"\nOriginal gas composition sum - Mean: {gas_sum.mean():.1f}%")

    # A row whose gases sum to zero cannot be rescaled; dividing would re-introduce
    # NaN/inf after the cleaning step, so such rows are left unchanged.
    can_normalize = gas_sum > 0
    n_skipped = int((~can_normalize).sum())
    if n_skipped > 0:
        print(f"  Skipped {n_skipped} rows with a non-positive gas sum")

    # Normalize gas percentages
    for col in gas_cols:
        data_clean[col] = (data_clean[col] / gas_sum * 100).where(can_normalize, data_clean[col])

# Report how many distinct biomass types the cleaned data contains
stats_rule = "="*40
print("\n" + stats_rule)
print("BIOMASS TYPE STATISTICS")
print(stats_rule)

n_unique_types = data_clean['TYPES'].nunique()
print(f"Number of unique biomass types: {n_unique_types}")

In [None]:
# Enhanced Metadata Creation with Constraints

metadata_rule = "="*60
print("\n" + metadata_rule)
print("CREATING ENHANCED METADATA WITH CONSTRAINTS")
print(metadata_rule)

# Let SDV infer the column types from the cleaned frame
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=data_clean)

# Show the sdtype that was detected for each column
print("\nDetected metadata:")
for column, col_info in metadata.columns.items():
    print(f"  {column}: {col_info['sdtype']}")

# Derive a plausible value range for each gas composition column.
# The ranges are collected in `constraints` as (column, low, high) tuples for reference;
# nothing in this cell passes them to the synthesizer, so the message says so.
constraints = []

# Keep gas compositions between 0-100%
gas_cols = ['H2', 'CO', 'CO2', 'CH4']
for col in gas_cols:
    if col in data_clean.columns:
        # Get current range from the data
        min_val = data_clean[col].min()
        max_val = data_clean[col].max()

        # Widen the observed range by 10%, clipped to the physical 0-100% interval
        constraint_min = max(0, min_val * 0.9)
        constraint_max = min(100, max_val * 1.1)

        # Record the range only if it's valid
        if constraint_max > constraint_min:
            constraints.append((col, constraint_min, constraint_max))
            print(f"  {col}: Suggested range [{constraint_min:.1f}, {constraint_max:.1f}] (informational, not enforced)")

print("\nEnhanced metadata created successfully!")
print(f"Total columns: {len(metadata.columns)}")

In [None]:
# Corrected Synthesizer Approach for Biomass Data

analysis_rule = "="*60
print("\n" + analysis_rule)
print("CORRECTED SYNTHESIZER APPROACH FOR BIOMASS DATA")
print(analysis_rule)

# Basic shape of the dataset and of its biomass-type classes
type_counts = data_clean['TYPES'].value_counts()
n_samples = len(data_clean)
n_features = len(data_clean.columns)
n_categories = data_clean['TYPES'].nunique()
min_class_size = type_counts.min()

print(f"\nDataset Analysis:")
for label, value in (
    ("Total samples", f"{n_samples}"),
    ("Features", f"{n_features}"),
    ("Biomass types", f"{n_categories}"),
    ("Smallest class", f"{min_class_size} samples"),
):
    print(f"  - {label}: {value}")

# Properties treated as constant for a given biomass type, restricted to the
# columns that are actually present in the data
candidate_fixed = ['C', 'H', 'O', 'Ash', 'N', 'S', 'Moisture', 'Volatile matter', 'Fixed carbon']
fixed_properties = [col for col in candidate_fixed if col in data_clean.columns]

print(f"\nFIXED PROPERTIES (do not vary within same biomass type):")
print(f"  {fixed_properties}")

# Everything else, apart from the type label itself, may vary between experiments
print(f"\nVARYING PROPERTIES (can vary within same biomass type):")
not_varying = fixed_properties + ['TYPES']
varying_properties = [col for col in data_clean.columns if col not in not_varying]
print(f"  {varying_properties}")


# Fit a Gaussian Copula synthesizer on the cleaned data.
# NOTE(review): in the cells visible here the fitted synthesizer is saved later but never
# sampled from; `synthetic_data` is built by perturbing real rows instead. Confirm intent.
synthesizer_options = {
    'metadata': metadata,
    'enforce_min_max_values': True,  # Keep values within observed ranges
    'enforce_rounding': False,
}
synthesizer = GaussianCopulaSynthesizer(**synthesizer_options)

synthesizer.fit(data_clean)
print("Training completed!")

In [None]:
# STEP 5: Smart Data Augmentation for Biomass Data

augmentation_rule = "="*60
print("\n" + augmentation_rule)
print("SMART DATA AUGMENTATION FOR BIOMASS GASIFICATION")
print(augmentation_rule)

# Size of the final dataset (original rows + generated rows)
target_total = 1000

print(f"\nStarting with {len(data_clean)} original samples...")

# Number of experiments per biomass type, in order of first appearance
type_column = data_clean['TYPES']
type_groups = {
    biomass_type: int((type_column == biomass_type).sum())
    for biomass_type in type_column.unique()
}

print(f"\nBiomass types and experiment counts:")
for biomass_type in type_groups:
    print(f"  {biomass_type}: {type_groups[biomass_type]} experiments")

# Per-type Pearson correlations between temperature/H2 and H2/CO
print("\nCalculating physical relationships...")

has_temp_h2 = 'Temperature (°C)' in data_clean.columns and 'H2' in data_clean.columns
has_h2_co = 'H2' in data_clean.columns and 'CO' in data_clean.columns

relationship_patterns = {}
for biomass_type, type_data in data_clean.groupby('TYPES'):
    # A correlation needs at least two experiments
    if len(type_data) < 2:
        continue

    patterns = {}
    if has_temp_h2:
        patterns['temp_h2_slope'] = type_data['Temperature (°C)'].corr(type_data['H2'])
    if has_h2_co:
        patterns['h2_co_slope'] = type_data['H2'].corr(type_data['CO'])
    relationship_patterns[biomass_type] = patterns

# Generate augmented samples by perturbing randomly chosen real experiments
augmented_samples_needed = target_total - len(data_clean)
print(f"\nGenerating {augmented_samples_needed} augmented samples...")

# Gas fractions that must add up to 100 % after every adjustment
composition_cols = ['H2', 'CO', 'CO2', 'CH4']

# Numeric columns that only receive the generic jitter below (loop-invariant, computed once)
jitter_cols = [
    col for col in data_clean.select_dtypes(include=[np.number]).columns
    if col not in ['TYPES', 'Temperature (°C)', 'S/B', 'H2', 'CO', 'CO2', 'CH4']
]

augmented_rows = []
for _ in range(augmented_samples_needed):
    # Each row is a copy of a real experiment; the Series keeps the source row label as
    # its name, so the resulting frame's index records which real row it came from.
    original_idx = np.random.randint(0, len(data_clean))
    augmented_row = data_clean.iloc[original_idx].copy()

    # Temperature variation
    if 'Temperature (°C)' in augmented_row:
        temp = augmented_row['Temperature (°C)']
        temp_variation = np.random.normal(0, 25)
        # Clamp first, so the gas shift below is driven by the temperature that is stored
        new_temp = max(500, min(950, temp + temp_variation))
        augmented_row['Temperature (°C)'] = new_temp

        # Relative temperature change actually applied to this row
        temp_change_pct = (new_temp - temp) / max(temp, 1)

        if 'H2' in augmented_row and 'CO' in augmented_row:
            # H2 moves with temperature, CO against it
            h2_change = temp_change_pct * 0.3 * augmented_row['H2']
            augmented_row['H2'] = max(0, min(100, augmented_row['H2'] + h2_change))

            co_change = temp_change_pct * -0.2 * augmented_row['CO']
            augmented_row['CO'] = max(0, min(100, augmented_row['CO'] + co_change))

            if 'CO2' in augmented_row and 'CH4' in augmented_row:
                current_sum = sum(augmented_row[gas] for gas in composition_cols)
                if current_sum > 0:
                    # Rescale all four gases: the factor is derived from the total of all
                    # four, so applying it to CO2 and CH4 only would not restore 100 %.
                    scale_factor = 100 / current_sum
                    for gas in composition_cols:
                        augmented_row[gas] = augmented_row[gas] * scale_factor

    # S/B ratio variation (multiplicative, floored at 0.1)
    if 'S/B' in augmented_row:
        sb = augmented_row['S/B']
        sb_variation = np.random.normal(1, 0.15)
        augmented_row['S/B'] = max(0.1, sb * sb_variation)

    # Small variations to other numerical parameters
    # NOTE(review): this also jitters the columns an earlier cell lists as fixed per
    # biomass type (C, H, O, Ash, ...) — confirm that is intended.
    for col in jitter_cols:
        value = augmented_row[col]
        if pd.notna(value) and value != 0:
            variation = np.random.normal(0, 0.05)
            augmented_row[col] = max(0, value * (1 + variation))

    augmented_rows.append(augmented_row)

# Stack the generated rows underneath the original ones
augmented_df = pd.DataFrame(augmented_rows)
synthetic_data = pd.concat([data_clean, augmented_df], ignore_index=True)

# Trim or top up so the result has exactly target_total rows
shortfall = target_total - len(synthetic_data)
if shortfall < 0:
    synthetic_data = synthetic_data.head(target_total)
elif shortfall > 0:
    additional_needed = shortfall
    additional_samples = augmented_df.sample(n=additional_needed, replace=True, random_state=42)
    synthetic_data = pd.concat([synthetic_data, additional_samples], ignore_index=True)

print(f"\nGenerated augmented dataset: {len(synthetic_data)} samples")
for label, value in (
    ("Original", f"{len(data_clean)} samples"),
    ("Augmented", f"{len(augmented_df)} samples"),
    ("Unique biomass types", f"{synthetic_data['TYPES'].nunique()}"),
):
    print(f"  {label}: {value}")

# Compare a few headline statistics between the cleaned and the combined dataset
print("\nValidating data quality...")

if {'H2', 'CO', 'Temperature (°C)'}.issubset(synthetic_data.columns):
    orig_h2_co = data_clean['H2'].corr(data_clean['CO'])
    orig_h2_temp = data_clean['H2'].corr(data_clean['Temperature (°C)'])
    synth_h2_co = synthetic_data['H2'].corr(synthetic_data['CO'])
    synth_h2_temp = synthetic_data['H2'].corr(synthetic_data['Temperature (°C)'])

    print(f"\nCorrelation comparison:")
    print(f"  H2-CO: Original={orig_h2_co:.3f}, Synthetic={synth_h2_co:.3f}")
    print(f"  H2-Temp: Original={orig_h2_temp:.3f}, Synthetic={synth_h2_temp:.3f}")

    # Mean of the per-row gas totals, when all four gas columns are available
    gas_cols = ['H2', 'CO', 'CO2', 'CH4']
    if set(gas_cols).issubset(synthetic_data.columns):
        orig_sum = data_clean[gas_cols].sum(axis=1).mean()
        synth_sum = synthetic_data[gas_cols].sum(axis=1).mean()
        print(f"\nGas composition sum:")
        print(f"  Original mean: {orig_sum:.1f}%")
        print(f"  Synthetic mean: {synth_sum:.1f}%")

print("\nAugmentation complete!")

In [None]:

distribution_rule = "="*60
print("\n" + distribution_rule)
print("BIOMASS TYPE DISTRIBUTION - SYNTHETIC DATASET")
print(distribution_rule)

# Rows per biomass type in the combined dataset, in order of first appearance
synth_type_column = synthetic_data['TYPES']
synth_type_groups = {
    biomass_type: int((synth_type_column == biomass_type).sum())
    for biomass_type in synth_type_column.unique()
}

print(f"\nBiomass types and experiment counts in SYNTHETIC data:")
print(f"Total samples: {len(synthetic_data)}")
print(f"Unique biomass types: {synthetic_data['TYPES'].nunique()}")

# Largest groups first (stable for ties)
ordered_types = sorted(synth_type_groups, key=synth_type_groups.get, reverse=True)
synth_type_groups_sorted = {name: synth_type_groups[name] for name in ordered_types}

for biomass_type, count in synth_type_groups_sorted.items():
    print(f"  {biomass_type}: {count} experiments")

# Side-by-side counts per biomass type: original vs. combined dataset
print(f"\n" + "-"*60)
print("DISTRIBUTION COMPARISON")
print("-"*60)

# One record per type seen in either dataset
all_types = sorted(set(type_groups) | set(synth_type_groups))

comparison_list = []
for biomass_type in all_types:
    orig_count = type_groups.get(biomass_type, 0)
    synth_count = synth_type_groups.get(biomass_type, 0)
    increase = synth_count - orig_count

    # A relative increase is only defined when the type exists in the original data
    if orig_count > 0:
        increase_label = f"{increase / orig_count * 100:.0f}%"
    else:
        increase_label = "N/A"

    comparison_list.append({
        'Biomass_Type': biomass_type,
        'Original': orig_count,
        'Synthetic': synth_count,
        'Increase': increase,
        'Increase_%': increase_label,
    })

# Most frequent original types first, ties broken by the synthetic count
comparison_df = pd.DataFrame(comparison_list).sort_values(
    by=['Original', 'Synthetic'], ascending=[False, False]
)

print("\nBiomass Type Distribution Comparison:")
print(comparison_df.to_string(index=False))



In [None]:
# STEP 6: Quality Evaluation

quality_rule = "="*50
print("\n" + quality_rule)
print("QUALITY EVALUATION")
print(quality_rule)

# 1. Check basic statistics
print("\n1. Statistical Comparison:")

key_params = ['C', 'H', 'O', 'Ash', 'H2', 'CO', 'CO2', 'CH4', 'Temperature (°C)', 'S/B']


def _relative_diff_pct(reference, other):
    """Absolute difference as a percentage of `reference`; 0 when `reference` is 0."""
    if reference != 0:
        return abs(reference - other) / reference * 100
    return 0


# Mean and standard deviation of every key parameter present in both datasets
comparison_results = []
for param in key_params:
    if param not in data_clean.columns or param not in synthetic_data.columns:
        continue

    orig_mean, synth_mean = data_clean[param].mean(), synthetic_data[param].mean()
    orig_std, synth_std = data_clean[param].std(), synthetic_data[param].std()
    mean_diff_pct = _relative_diff_pct(orig_mean, synth_mean)
    std_diff_pct = _relative_diff_pct(orig_std, synth_std)

    comparison_results.append({
        'Parameter': param,
        'Orig_Mean': f"{orig_mean:.2f}",
        'Synth_Mean': f"{synth_mean:.2f}",
        'Mean_Diff_%': f"{mean_diff_pct:.1f}",
        'Orig_STD': f"{orig_std:.2f}",
        'Synth_STD': f"{synth_std:.2f}",
        'STD_Diff_%': f"{std_diff_pct:.1f}",
    })

comparison_df = pd.DataFrame(comparison_results)
print(comparison_df.to_string(index=False))

# 2. Check biomass type distribution
print("\n2. Type Distribution Check:")

# Share of rows per biomass type, indexed alphabetically
original_types = data_clean['TYPES'].value_counts(normalize=True).sort_index()
synthetic_types = synthetic_data['TYPES'].value_counts(normalize=True).sort_index()

# Types missing from one side get a share of 0
type_comparison = pd.DataFrame(
    {'Original': original_types, 'Synthetic': synthetic_types}
).fillna(0)

print("\nTop 10 biomass type comparison:")
print(type_comparison.head(10))

# Percentage of original types that also occur in the combined dataset
original_type_set = set(data_clean['TYPES'])
shared_types = original_type_set.intersection(set(synthetic_data['TYPES']))
coverage = len(shared_types) / len(original_type_set) * 100
print(f"\nType coverage: {coverage:.1f}%")

# 3. Check correlation patterns
print("\n3. Correlation Check:")

# Numeric columns of the cleaned data that also exist in the combined dataset
numerical_cols = data_clean.select_dtypes(include=[np.number]).columns.tolist()
common_numerical = [col for col in numerical_cols if col in synthetic_data.columns]

if len(common_numerical) > 1:
    corr_original = data_clean[common_numerical].corr()
    corr_synthetic = synthetic_data[common_numerical].corr()

    # Mean absolute element-wise difference between the two correlation matrices
    abs_corr_gap = (corr_original - corr_synthetic).abs()
    corr_diff = abs_corr_gap.mean().mean()
    print(f"Average correlation difference: {corr_diff:.4f}")

    # Spot-check the relationships that matter most for gasification
    key_pairs = [('H2', 'Temperature (°C)'), ('H2', 'CO'), ('CO', 'CO2')]
    for col1, col2 in key_pairs:
        if col1 not in common_numerical or col2 not in common_numerical:
            continue
        orig_corr = corr_original.loc[col1, col2]
        synth_corr = corr_synthetic.loc[col1, col2]
        diff = abs(orig_corr - synth_corr)
        print(f"  {col1}-{col2}: Original={orig_corr:.3f}, Synthetic={synth_corr:.3f}, Difference={diff:.3f}")

# 4. Check gas composition totals
print("\n4. Gas Composition Check:")

# `gas_cols` is the list defined in an earlier cell
if set(gas_cols).issubset(synthetic_data.columns):
    synth_gas_sum = synthetic_data[gas_cols].sum(axis=1)
    lowest_total, highest_total = synth_gas_sum.min(), synth_gas_sum.max()
    print(f"Gas total sum:")
    print(f"  Average: {synth_gas_sum.mean():.1f}%")
    print(f"  Min-Max: [{lowest_total:.1f}%, {highest_total:.1f}%]")

    # Rows whose gas total falls outside the 80-120 % band
    unrealistic = synth_gas_sum.lt(80) | synth_gas_sum.gt(120)
    print(f"  Unrealistic totals: {unrealistic.sum()} samples")

In [None]:
# STEP 7: Visual Comparisons
# Scatter plot of H2 against temperature for the original and the combined dataset,
# with a first-degree least-squares trend line for each, saved to temperature_vs_h2.png.

print("\n" + "="*50)
print("CREATING VISUAL COMPARISONS")
print("="*50)

# 1. Temperature vs H2 Production
plt.figure(figsize=(10, 6))

# Plot original data
plt.scatter(data_clean['Temperature (°C)'], data_clean['H2'],
            alpha=0.7, label='Original', color='blue', s=60, edgecolors='black', linewidth=0.5)

# Plot synthetic data (smaller, more transparent markers so the originals stay visible)
plt.scatter(synthetic_data['Temperature (°C)'], synthetic_data['H2'],
            alpha=0.4, label='Synthetic', color='red', s=30)

# Add trend lines
# Both lines are evaluated on x_range, which spans the ORIGINAL temperature range only
z_orig = np.polyfit(data_clean['Temperature (°C)'], data_clean['H2'], 1)
p_orig = np.poly1d(z_orig)
x_range = np.linspace(data_clean['Temperature (°C)'].min(), data_clean['Temperature (°C)'].max(), 100)
plt.plot(x_range, p_orig(x_range), 'blue', linewidth=2, linestyle='-', label='Original Trend')

z_synth = np.polyfit(synthetic_data['Temperature (°C)'], synthetic_data['H2'], 1)
p_synth = np.poly1d(z_synth)
plt.plot(x_range, p_synth(x_range), 'red', linewidth=2, linestyle='--', label='Synthetic Trend')

plt.xlabel('Temperature (°C)', fontsize=12)
plt.ylabel('H₂ Production (%)', fontsize=12)
plt.title('Temperature vs H₂ Production', fontsize=14)

# Add grid and legend
plt.grid(True, alpha=0.3, linestyle='--')
plt.legend(fontsize=10)

# Set axis limits
# x: original temperature range +/- 10, so synthetic points outside that window are not shown
# y: combined H2 range of both datasets +/- 2
plt.xlim(data_clean['Temperature (°C)'].min()-10, data_clean['Temperature (°C)'].max()+10)
plt.ylim(min(data_clean['H2'].min(), synthetic_data['H2'].min())-2,
         max(data_clean['H2'].max(), synthetic_data['H2'].max())+2)

# Add correlation values (Pearson r) in the top-left corner, in axes coordinates
orig_corr = data_clean['Temperature (°C)'].corr(data_clean['H2'])
synth_corr = synthetic_data['Temperature (°C)'].corr(synthetic_data['H2'])
plt.text(0.02, 0.98, f'Original r = {orig_corr:.3f}',
         transform=plt.gca().transAxes, fontsize=10, verticalalignment='top')
plt.text(0.02, 0.92, f'Synthetic r = {synth_corr:.3f}',
         transform=plt.gca().transAxes, fontsize=10, verticalalignment='top')

# Written to the current working directory
plt.tight_layout()
plt.savefig('temperature_vs_h2.png', dpi=300, bbox_inches='tight')
plt.show()

# 2. H2 Production by Biomass Type
plt.figure(figsize=(12, 6))

# Long-format frame with a 'Source' label, original rows first
labelled_frames = [
    frame[['TYPES', 'H2']].assign(Source=source_label)
    for source_label, frame in (('Original', data_clean), ('Synthetic', synthetic_data))
]
plot_data = pd.concat(labelled_frames)

# Restrict the plot to the 8 types with the most original experiments
top_types = data_clean['TYPES'].value_counts().head(8).index
is_top_type = plot_data['TYPES'].isin(top_types)
plot_data = plot_data[is_top_type]

source_palette = {'Original': 'blue', 'Synthetic': 'red'}
sns.boxplot(data=plot_data, x='TYPES', y='H2', hue='Source',
            palette=source_palette)
plt.title('H₂ Production by Biomass Type', fontsize=14)
plt.xlabel('Biomass Type', fontsize=12)
plt.ylabel('H₂ Production (%)', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Data Source', fontsize=10)

# Horizontal guide lines only; the figure is written to the working directory
plt.grid(True, alpha=0.3, axis='y', linestyle='--')
plt.tight_layout()
plt.savefig('h2_by_type.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# STEP 7.5: Gas Composition Analysis

gas_rule = "="*50
print("\n" + gas_rule)
print("GAS COMPOSITION ANALYSIS")
print(gas_rule)

# 1. Check gas correlations
print("\n1. Gas Composition Correlations:")

gas_cols = ['H2', 'CO', 'CO2', 'CH4']
if set(gas_cols).issubset(data_clean.columns):
    orig_corr = data_clean[gas_cols].corr()
    synth_corr = synthetic_data[gas_cols].corr()

    # H2 against CO and against CO2, original vs. combined dataset
    for partner in ('CO', 'CO2'):
        print(f"\n   H2-{partner} correlation:")
        print(f"     Original: {orig_corr.loc['H2', partner]:.3f}")
        print(f"     Synthetic: {synth_corr.loc['H2', partner]:.3f}")

# 2. Check gas distributions
print("\n2. Gas Distribution Comparison:")

critical_gases = ['CO', 'CH4', 'CO2']
for gas in critical_gases:
    if gas not in data_clean.columns:
        continue

    orig_mean = data_clean[gas].mean()
    synth_mean = synthetic_data[gas].mean()
    mean_diff = abs(orig_mean - synth_mean)

    print(f"\n   {gas}:")
    print(f"     Original mean: {orig_mean:.1f}%")
    print(f"     Synthetic mean: {synth_mean:.1f}%")
    print(f"     Difference: {mean_diff:.1f}%")

# 3. Check temperature relationship
if 'Temperature (°C)' in data_clean.columns:
    print("\n3. Temperature-H2 Relationship:")
    temp_h2_orig = data_clean['Temperature (°C)'].corr(data_clean['H2'])
    temp_h2_synth = synthetic_data['Temperature (°C)'].corr(synthetic_data['H2'])

    for label, value in (("Original", temp_h2_orig), ("Synthetic", temp_h2_synth)):
        print(f"   {label} correlation: {value:.3f}")

In [None]:
from sklearn.metrics import mean_absolute_error

# STEP 8: Machine Learning Efficacy Test

ml_rule = "="*50
print("\n" + ml_rule)
print("MACHINE LEARNING EFFICACY TEST")
print(ml_rule)

# Column the regressors are trained to predict
target_column = 'H2'

# Prepare data for ML
def prepare_for_ml(data, target_col):
    """Return a numeric modelling frame built from `data`.

    The input is copied, never modified. A 'TYPES' column, when present, is replaced by
    one-hot columns prefixed with 'TYPE_'. Every column is then passed through
    `pd.to_numeric`, with unparseable values becoming NaN. Rows without a value in
    `target_col` are dropped, as are rows missing more than five values overall.
    """
    frame = data.copy()

    # One-hot encode biomass types
    if 'TYPES' in frame.columns:
        encoded_types = pd.get_dummies(frame['TYPES'], prefix='TYPE')
        frame = pd.concat([frame, encoded_types], axis=1).drop(columns='TYPES')

    # Convert to numeric
    frame = frame.apply(pd.to_numeric, errors='coerce')

    # Drop rows with missing target, then rows with too few non-null values
    frame = frame.dropna(subset=[target_col])
    min_non_null = len(frame.columns) - 5
    return frame.dropna(thresh=min_non_null)

print("Preparing datasets...")
ml_original = prepare_for_ml(data_clean, target_column)

# `synthetic_data` starts with a verbatim copy of `data_clean`, so taking its first rows
# would train the "synthetic" model on real rows, held-out test rows included. Only the
# generated rows in `augmented_df` are used here.
# The index of `augmented_df` holds the label of the real row each sample was derived
# from (each generated row is a copy of a `data_clean` row); keep it before resetting.
augmented_source = pd.Series(augmented_df.index.to_numpy())
ml_synthetic = prepare_for_ml(augmented_df.reset_index(drop=True), target_column)

print(f"\nData shapes:")
print(f"Original: {ml_original.shape}")
print(f"Synthetic: {ml_synthetic.shape}")

# Align columns, keeping the original column order so the feature order (and with it the
# random forest result) is identical from run to run; a set has no stable order.
common_cols = [
    col for col in ml_original.columns
    if col in ml_synthetic.columns and col != target_column
]

ml_original = ml_original[common_cols + [target_column]]
ml_synthetic = ml_synthetic[common_cols + [target_column]]

print(f"\nAfter column alignment:")
print(f"Original: {ml_original.shape}")
print(f"Synthetic: {ml_synthetic.shape}")
print(f"Features: {len(common_cols)}")

# Split original data
X = ml_original.drop(target_column, axis=1)
y = ml_original[target_column]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Prepare synthetic training data.
# Keep only generated rows derived from real TRAINING rows: a sample perturbed from a
# test row is a near-duplicate of it and would leak the test set into training.
derived_from_train = augmented_source.loc[ml_synthetic.index].isin(X_train_orig.index).to_numpy()
ml_synthetic_train = ml_synthetic[derived_from_train]
if ml_synthetic_train.empty:
    raise ValueError("No generated samples derived from training rows are available for the efficacy test")

# Cap at the size of the real training set so both models see the same amount of data
X_train_synth = ml_synthetic_train.drop(target_column, axis=1).iloc[:len(X_train_orig)]
y_train_synth = ml_synthetic_train[target_column].iloc[:len(y_train_orig)]

print(f"\nTraining set sizes:")
print(f"Original training: {X_train_orig.shape}")
print(f"Synthetic training: {X_train_synth.shape}")
print(f"Test set: {X_test.shape}")

# Train models
print("\nTraining models...")


def _fit_forest_and_predict(features, target, eval_features):
    """Fit a fixed-seed random forest on (features, target) and predict eval_features."""
    forest = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
    forest.fit(features, target)
    return forest, forest.predict(eval_features)


# One model per training source, both evaluated on the same real test set
rf_original, y_pred_orig = _fit_forest_and_predict(X_train_orig, y_train_orig, X_test)
rf_synthetic, y_pred_synth = _fit_forest_and_predict(X_train_synth, y_train_synth, X_test)

# Calculate metrics
r2_orig, r2_synth = r2_score(y_test, y_pred_orig), r2_score(y_test, y_pred_synth)
rmse_orig = np.sqrt(mean_squared_error(y_test, y_pred_orig))
rmse_synth = np.sqrt(mean_squared_error(y_test, y_pred_synth))
mae_orig = mean_absolute_error(y_test, y_pred_orig)
mae_synth = mean_absolute_error(y_test, y_pred_synth)

performance_rule = "="*50
print("\n" + performance_rule)
print("MODEL PERFORMANCE")
print(performance_rule)

# Same three metrics for each model
model_reports = (
    ("Original Data Model", r2_orig, rmse_orig, mae_orig),
    ("Synthetic Data Model", r2_synth, rmse_synth, mae_synth),
)
for model_title, r2_value, rmse_value, mae_value in model_reports:
    print(f"\n{model_title}:")
    print(f"  R² Score: {r2_value:.4f}")
    print(f"  RMSE: {rmse_value:.4f}")
    print(f"  MAE: {mae_value:.4f}")

print(f"\nComparison:")
print(f"  R² Difference: {abs(r2_orig - r2_synth):.4f}")
print(f"  Performance Ratio: {r2_synth/r2_orig:.3f}")

# Feature importance
print(f"\nTop 10 Important Features (Original Model):")

feature_importances = pd.DataFrame({
    'Feature': X_train_orig.columns,
    'Importance': rf_original.feature_importances_
}).sort_values('Importance', ascending=False)

for rank, entry in enumerate(feature_importances.head(10).itertuples(index=False), start=1):
    print(f"  {rank:2d}. {entry.Feature:25s}: {entry.Importance:.4f}")

# Performance summary
print(f"\nPerformance Summary:")

# The ratio is only interpreted when the baseline model has a positive R²
if r2_orig > 0:
    performance_ratio = r2_synth / r2_orig

    r2_verdict = "Synthetic model R² > 0.5: Acceptable" if r2_synth > 0.5 else "Synthetic model R² < 0.5: Low"
    print(f"  {r2_verdict}")

    if performance_ratio > 0.8:
        ratio_verdict = "Performance ratio > 0.8: Good"
    elif performance_ratio > 0.6:
        ratio_verdict = "Performance ratio 0.6-0.8: Moderate"
    else:
        ratio_verdict = "Performance ratio < 0.6: Poor"
    print(f"  {ratio_verdict}")

print("\nML test completed!")

In [None]:
# STEP 9: Final Quality Validation

validation_rule = "="*50
print("\n" + validation_rule)
print("FINAL QUALITY VALIDATION")
print(validation_rule)

# Check key parameter ranges
print("\nParameter Range Comparison:")

key_params = ['C', 'H', 'H2', 'Temperature (°C)']

for param in key_params:
    if param not in data_clean.columns:
        continue

    orig_min, orig_max = data_clean[param].min(), data_clean[param].max()
    synth_min, synth_max = synthetic_data[param].min(), synthetic_data[param].max()

    print(f"\n{param}:")
    print(f"  Original:  [{orig_min:.1f}, {orig_max:.1f}]")
    print(f"  Synthetic: [{synth_min:.1f}, {synth_max:.1f}]")

    # The combined range is "within" when it does not extend past either original bound
    stays_inside = synth_min >= orig_min and synth_max <= orig_max
    status = "Within original range" if stays_inside else "Outside original range"
    print(f"  Status: {status}")

In [None]:
# STEP 10: Round Synthetic Data Values

rounding_rule = "="*50
print("\n" + rounding_rule)
print("ROUNDING SYNTHETIC DATA VALUES")
print(rounding_rule)

# Round every numeric column to three decimals
print("Rounding numerical values...")

numeric_columns = list(synthetic_data.select_dtypes(include=[np.number]).columns)
for name in numeric_columns:
    synthetic_data[name] = synthetic_data[name].round(3)

n_rounded = synthetic_data.select_dtypes(include=[np.number]).shape[1]
print(f"Rounded {n_rounded} columns")
print("\nSample of rounded data:")
print(synthetic_data.head(3))

In [None]:
# STEP 11: Save Results

print("\n" + "="*50)
print("SAVING RESULTS")
print("="*50)

import os
SAVE_DIR = "/content/drive/MyDrive/ML_Lab_Datasets/"
os.makedirs(SAVE_DIR, exist_ok=True)

# Write every artefact into SAVE_DIR. The directory was created above but never used,
# so the files previously landed in the current working directory.
csv_path = os.path.join(SAVE_DIR, 'final_synthetic_biomass_dataset.csv')
excel_path = os.path.join(SAVE_DIR, 'final_synthetic_biomass_dataset.xlsx')
synthesizer_path = os.path.join(SAVE_DIR, 'final_biomass_synthesizer.pkl')
report_path = os.path.join(SAVE_DIR, 'final_synthetic_data_quality_report.txt')

# Save synthetic data
synthetic_data.to_csv(csv_path, index=False)
synthetic_data.to_excel(excel_path, index=False)

# Save the trained synthesizer
synthesizer.save(synthesizer_path)

# Save quality report (explicit UTF-8: the report contains the non-ASCII character in "R²")
with open(report_path, 'w', encoding='utf-8') as f:
    f.write("BIOMASS GASIFICATION SYNTHETIC DATA QUALITY REPORT\n")
    f.write("=" * 55 + "\n\n")
    f.write(f"Original data size: {data_clean.shape}\n")
    f.write(f"Synthetic data size: {synthetic_data.shape}\n")
    f.write(f"Data expansion: {len(synthetic_data)/len(data_clean):.1f}x\n\n")

    f.write("TYPE DISTRIBUTION:\n")
    f.write("-" * 30 + "\n")
    f.write(f"Original types: {data_clean['TYPES'].nunique()}\n")
    f.write(f"Synthetic types: {synthetic_data['TYPES'].nunique()}\n")
    f.write(f"Type coverage: {coverage:.1f}%\n\n")

    f.write("KEY PARAMETER STATISTICS:\n")
    f.write("-" * 30 + "\n")

    # `comparison_df` is the per-parameter table built in the quality-evaluation cell
    if not comparison_df.empty:
        for _, row in comparison_df.iterrows():
            f.write(f"{row['Parameter']}: Original={row['Orig_Mean']}, Synthetic={row['Synth_Mean']}, Diff={row['Mean_Diff_%']}%\n")

    f.write("\nMACHINE LEARNING PERFORMANCE:\n")
    f.write("-" * 30 + "\n")

    # The ML section is written only when the efficacy-test cell has been run
    if 'r2_orig' in globals() and 'r2_synth' in globals():
        f.write(f"R² (Original data): {r2_orig:.4f}\n")
        f.write(f"R² (Synthetic data): {r2_synth:.4f}\n")
        f.write(f"Performance difference: {abs(r2_orig - r2_synth):.4f}\n")
        if r2_orig != 0:
            f.write(f"Performance ratio: {r2_synth/r2_orig:.2f}\n")

print("\nFiles saved:")
for saved_path in (csv_path, excel_path, synthesizer_path, report_path):
    print(f"  {saved_path}")

print("\n" + "="*50)
print("PROCESS COMPLETED")
print("="*50)
print(f"Original dataset: {len(data_clean)} rows, {data_clean['TYPES'].nunique()} types")
print(f"Synthetic dataset: {len(synthetic_data)} rows, {synthetic_data['TYPES'].nunique()} types")
print(f"Data expansion: {len(synthetic_data)/len(data_clean):.1f}x")
print(f"Type coverage: {coverage:.1f}%")

print("\nSample of synthetic data:")
print(synthetic_data.head(3))