## Step 1: Setup Google Colab Environment

This installs all required libraries. The dataset itself is downloaded from Kaggle Hub later, in Step 5.

In [None]:
# Install required packages
# NOTE: the leading "!" is IPython/Colab shell-escape syntax, so this cell only
# runs inside a notebook, not as a plain Python script.
# NOTE(review): later cells call pandas.read_excel (needs an Excel engine such
# as openpyxl for .xlsx files) and import seaborn; neither is installed here.
# Presumably both are preinstalled on Colab - confirm before running elsewhere.
!pip install -q scikit-image scikit-learn opencv-python albumentations tensorflow keras-applications tensorboard kagglehub

print("‚úì Environment setup complete")

## Step 2: Data Augmentation Strategy

### Why Augmentation?
With limited samples, we artificially create variations:
- **Rotation**: Fabric rotated at different angles
- **Scaling**: Zoomed in/out (simulates different microscope magnifications)
- **Elastic Deformation**: Stretches fabric slightly (realistic)
- **Brightness/Contrast**: Lighting variations
- **Noise**: Camera/sensor noise

This trains the model to be **robust** - works with imperfect real-world images!

In [None]:
import numpy as np
import cv2
from pathlib import Path
import pandas as pd
from typing import Tuple, List
import matplotlib.pyplot as plt
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Augmentation library
import albumentations as A
from albumentations import DualTransform

class TextileDataAugmentor:
    """
    Augmentation pipeline for fabric microscopy images.

    Each call to :meth:`augment` pushes the input through a randomised chain of
    geometric and photometric transforms:

    - Rotation: fabric can be scanned at any angle
    - Scale: simulates different zoom levels
    - Elastic / grid distortion: fabric stretching, wrinkles and folds
    - Brightness/contrast: lighting variations
    - Blur and noise: focus and sensor imperfections
    """

    def __init__(self, seed=42):
        """
        Build the augmentation pipeline.

        Args:
            seed: Value passed to ``np.random.seed``.
        """
        self.seed = seed
        # NOTE(review): this seeds NumPy's global RNG only; whether albumentations
        # draws from it depends on the installed version - confirm if exact
        # reproducibility of the augmentations matters.
        np.random.seed(seed)

        # Define augmentation pipeline
        self.augmenter = A.Compose([
            # Rotation: fabric can be oriented any direction
            A.Rotate(limit=180, p=0.8, border_mode=cv2.BORDER_REFLECT),

            # Scale: simulates zoom variations (output size changes with the scale)
            A.RandomScale(scale_limit=0.2, p=0.7),

            # Elastic deformation: realistic fabric stretching
            # NOTE(review): `alpha_affine` is reported as deprecated in recent
            # albumentations releases - confirm against the installed version.
            A.ElasticTransform(
                alpha=50, sigma=5, alpha_affine=20,
                p=0.6, border_mode=cv2.BORDER_REFLECT
            ),

            # Grid distortion: simulates wrinkles/folds
            A.GridDistortion(
                num_steps=5, distort_limit=0.15,
                p=0.5, border_mode=cv2.BORDER_REFLECT
            ),

            # Brightness/Contrast: lighting variations
            A.RandomBrightnessContrast(
                brightness_limit=0.2, contrast_limit=0.2, p=0.7
            ),

            # Gaussian blur: camera focus variations.
            # The transform is named GaussianBlur; `A.GaussBlur` does not exist
            # and made the constructor fail with AttributeError.
            A.GaussianBlur(blur_limit=3, p=0.3),

            # Gaussian noise: sensor noise
            A.GaussNoise(p=0.4),
        ])

    def augment(self, image: np.ndarray, num_augmentations: int = 5) -> List[np.ndarray]:
        """
        Create multiple augmented versions of image.

        Args:
            image: Input fabric image (H, W, C)
            num_augmentations: How many variations to create

        Returns:
            List of ``num_augmentations + 1`` images: the original (same object,
            not a copy) followed by the augmented variants.
        """
        augmented = [image]  # Include original
        augmented.extend(
            self.augmenter(image=image)['image'] for _ in range(num_augmentations)
        )
        return augmented

print("✓ Data augmentation module loaded")

## Step 3: Feature Extraction - The Heart of the Pipeline

### Feature Types Explained:

**GLCM Features** (Gray Level Co-occurrence Matrix)
- Measures pixel relationships in different directions
- **Contrast**: Difference in pixel values → weave tightness
- **Homogeneity**: Similar pixels nearby → dense/tight weave
- **Energy**: Pixel repeatability → regular pattern strength
- **Correlation**: Pixel dependency → weave structure

**LBP Features** (Local Binary Pattern)
- Looks at 8 neighbors of each pixel
- Creates a histogram of the resulting patterns → yarn texture

**Textile-Specific Features**
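- **Warp Density**: Fraction of image columns that contain a detected edge (a rough proxy for vertical thread density, not a calibrated threads-per-inch value)
- **Weft Density**: Fraction of image rows that contain a detected edge (a rough proxy for horizontal thread density, not a calibrated threads-per-inch value)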
- **Yarn Diameter**: Estimated from image
- **Density**: (Warp + Weft) / 2

In [None]:
from skimage import feature
from skimage.feature import graycomatrix, graycoprops
from scipy import ndimage

class TextileFeatureExtractor:
    """
    Extract hand-crafted texture features from fabric images.

    The feature vector returned by :meth:`extract_all_features` is the
    concatenation of:
    - 32 GLCM statistics (fabric structure / weave regularity)
    - a 59-bin LBP histogram (yarn micro-texture)
    - 4 edge-based density values (surface density, a GSM indicator)
    """

    def __init__(self):
        self.logger = None

    def extract_glcm_features(self, gray_image: np.ndarray) -> np.ndarray:
        """
        Extract GLCM texture features.

        What it does:
        - Analyzes pixel co-occurrence at different distances/angles
        - Returns 4 metrics × 2 distances × 4 angles = 32 features

        Args:
            gray_image: 2-D grayscale image. The division by 8 maps values
                0-255 onto the 32 GLCM levels, so 8-bit input is assumed.

        Returns:
            Array of 32 values ordered metric-major
            (contrast, homogeneity, energy, correlation).
        """
        # Quantize to 32 levels (reduces computation)
        gray_quant = (gray_image / 8).astype(np.uint8)

        # Compute GLCM at multiple distances and angles
        glcm = graycomatrix(
            gray_quant,
            distances=[1, 3],  # Pixel distances to consider
            angles=[0, np.pi/4, np.pi/2, 3*np.pi/4],  # 0°, 45°, 90°, 135°
            levels=32,
            symmetric=True,
            normed=True
        )

        # Extract metrics
        features = []
        for metric in ['contrast', 'homogeneity', 'energy', 'correlation']:
            features.extend(graycoprops(glcm, metric).flatten())

        return np.array(features)

    def extract_lbp_features(self, gray_image: np.ndarray) -> np.ndarray:
        """
        Extract Local Binary Pattern features.

        What it does:
        - Samples 8 neighbors on a circle of radius 3 around each pixel
        - Creates histogram of micro-patterns
        - Returns 59 bins = 59 features

        Args:
            gray_image: 2-D grayscale image.

        Returns:
            Normalised 59-bin histogram (sums to 1; all zeros if the image
            produced no LBP codes).
        """
        # 'nri_uniform' (non-rotation-invariant uniform) is the variant that
        # yields 59 distinct codes for P=8. Plain 'uniform' only yields
        # P + 2 = 10 codes, which left 49 of the 59 bins permanently empty.
        lbp = feature.local_binary_pattern(
            gray_image,
            P=8,  # 8 neighbors
            R=3,  # radius 3 pixels
            method='nri_uniform'
        )

        # Explicit bin edges 0..59 give one bin per code (59 bins); a separate
        # `range=` argument would be ignored when edges are supplied.
        hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 60))

        # Normalize histogram, guarding against division by zero
        total = hist.sum()
        if total == 0:
            return np.zeros(hist.shape, dtype=float)
        return hist.astype(float) / total

    def estimate_density_features(self, gray_image: np.ndarray) -> np.ndarray:
        """
        Estimate edge-based thread density features.

        What it does:
        - Runs Canny edge detection
        - Measures which fraction of columns / rows contain at least one edge

        Args:
            gray_image: 2-D grayscale image (8-bit, as required by cv2.Canny).

        Returns: [warp_density, weft_density, yarn_diameter, total_density]
        """
        h, w = gray_image.shape

        # Detect threads using edge detection
        edges = cv2.Canny(gray_image, 100, 200)

        # Warp proxy: fraction of columns containing at least one edge pixel
        vertical_sum = edges.sum(axis=0)
        warp_density = float(np.count_nonzero(vertical_sum > 0) / w)

        # Weft proxy: fraction of rows containing at least one edge pixel
        horizontal_sum = edges.sum(axis=1)
        weft_density = float(np.count_nonzero(horizontal_sum > 0) / h)

        # NOTE(review): despite the name, this is the mean row index of the
        # edge pixels divided by the image height (their vertical centroid),
        # not a thickness measurement. Kept unchanged so the feature layout
        # stays stable; confirm whether a real diameter estimate is wanted.
        yarn_diameter = float(np.mean(np.where(edges > 0)[0])) / h if np.any(edges) else 0.1

        # Total density (indicates GSM!)
        total_density = (warp_density + weft_density) / 2.0

        return np.array([
            warp_density,
            weft_density,
            yarn_diameter,
            total_density
        ])

    def extract_all_features(self, image: np.ndarray) -> np.ndarray:
        """
        Extract all textile features from single image.

        Total: 32 GLCM + 59 LBP + 4 Density = 95 features

        Args:
            image: RGB image (H, W, 3) or an already-grayscale image (H, W).

        Returns:
            1-D array of 95 features in the order GLCM, LBP, density.
        """
        # Convert to grayscale
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        else:
            gray = image

        # Extract features
        glcm_feat = self.extract_glcm_features(gray)
        lbp_feat = self.extract_lbp_features(gray)
        density_feat = self.estimate_density_features(gray)

        # Combine all features
        return np.concatenate([glcm_feat, lbp_feat, density_feat])

print("✓ Textile feature extractor loaded")

## Step 4: Deep Learning Features (Transfer Learning)

### What is Transfer Learning?
We use a **pretrained neural network** (MobileNetV3) that was trained on millions of images.

**Why this is powerful:**
- Already learned to detect edges, textures, patterns
- We "reuse" this knowledge instead of starting from scratch
- With few fabric samples, this prevents overfitting

**MobileNetV3:**
- Lightweight (runs on phones!)
- Efficient for edge devices (Raspberry Pi)
- Returns 576 features per image (MobileNetV3Small with global average pooling and the classification head removed)

In [None]:
# Import TensorFlow for neural networks
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV3Small
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.mobilenet_v3 import preprocess_input

class DeepFeatureExtractor:
    """
    Extract features using MobileNetV3Small (pretrained neural network).

    How it works:
    1. Image is resized to the network input size
    2. Image goes through the frozen, ImageNet-pretrained network
    3. The classification head is removed and the last feature map is
       globally average-pooled
    4. Returns one feature vector per image; its length is exposed as
       ``feature_dim`` (576 for MobileNetV3Small at the default width, not 1280)
    """

    def __init__(self, input_shape=(224, 224, 3)):
        """
        Args:
            input_shape: (height, width, channels) expected by the network.
        """
        # Load pretrained MobileNetV3
        self.model = MobileNetV3Small(
            input_shape=input_shape,
            include_top=False,  # Remove classification layer
            weights='imagenet',  # Use pretrained weights
            pooling='avg'  # Global average pooling
        )

        # Freeze weights (don't retrain)
        self.model.trainable = False

        self.input_shape = input_shape

        # Length of the vector returned by extract(), read from the model so
        # downstream code does not need to hard-code it.
        self.feature_dim = int(self.model.output_shape[-1])

    def extract(self, image: np.ndarray) -> np.ndarray:
        """
        Extract deep features from image.

        Args:
            image: Image array (H, W, 3); a 2-D grayscale image is expanded
                to three channels first.

        Returns:
            Feature vector of length ``feature_dim``
        """
        if image.ndim == 2:
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)

        # Resize to model input size. cv2.resize expects (width, height),
        # whereas input_shape is (height, width, channels).
        height, width = self.input_shape[0], self.input_shape[1]
        img_resized = cv2.resize(image, (width, height))

        # Add batch dimension
        img_array = np.expand_dims(img_resized, axis=0)
        # NOTE(review): for Keras' MobileNetV3 the input scaling is reported to
        # live inside the model and preprocess_input to be a pass-through, so
        # pixels are presumably expected in [0, 255] - confirm for the
        # installed Keras version.
        img_array = preprocess_input(img_array)

        # Extract features
        features = self.model.predict(img_array, verbose=0)

        return features.flatten()

print("✓ Deep feature extractor (MobileNetV3) loaded")

## Step 5: Load Dataset and Prepare Data

### Steps:
1. Download dataset from Kaggle Hub
2. Load images and GSM labels from Excel
3. Augment dataset (create multiple versions)
4. Extract features for each image
5. Normalize features (critical for regression!)

In [None]:
import kagglehub
from pathlib import Path

# Download latest version of FabricNet dataset
print("📥 Downloading FabricNet dataset from Kaggle Hub...")
path = kagglehub.dataset_download("acseckn/fabricnet")

print("✓ Dataset downloaded successfully!")
print(f"  Path: {path}")
# Show a few top-level entries as a quick sanity check of the download
print(f"  Contents: {list(Path(path).glob('*'))[:5]}")

# Later cells read the dataset location from this name
DATASET_PATH = path  # Use downloaded path

## Step 6: Complete Data Pipeline

This combines everything:
- Loads images
- Applies data augmentation
- Extracts features
- Normalizes data

In [None]:
class DataPipeline:
    """
    Complete data processing pipeline.

    Flow:
    Raw Image → Augmentation → Feature Extraction → (feature matrix, labels)

    Attributes:
        sample_groups: After prepare_dataset(), an integer array with one entry
            per returned row. Rows derived from the same source image share a
            value, so it can be used for group-aware train/test splitting.
    """

    def __init__(self, dataset_path: str, excel_file: str = "FabricNet_parameters.xlsx"):
        """
        Args:
            dataset_path: Directory holding the images and the label workbook.
            excel_file: Workbook file name, relative to dataset_path.
        """
        self.dataset_path = Path(dataset_path)
        self.excel_file = self.dataset_path / excel_file

        # Initialize feature extractors
        self.textile_extractor = TextileFeatureExtractor()
        self.deep_extractor = DeepFeatureExtractor()
        self.augmentor = TextileDataAugmentor()

        self.sample_groups = None

        # Load labels
        self.labels_df = self._load_labels()

    def _load_labels(self) -> pd.DataFrame:
        """
        Load GSM labels from Excel file.

        Returns:
            The label table, or None if the workbook could not be read
            (the error is printed, not raised).
        """
        try:
            df = pd.read_excel(self.excel_file)
        except Exception as e:
            # Deliberately best-effort: prepare_dataset() reports the missing
            # labels by returning (None, None).
            print(f"✗ Error loading Excel: {e}")
            return None
        print(f"✓ Loaded {len(df)} samples from Excel")
        print(f"  Columns: {list(df.columns)}")
        return df

    def load_image(self, image_name: str) -> np.ndarray:
        """
        Load single image as RGB.

        Returns:
            The image in RGB channel order, or None when the file is missing
            or cannot be decoded.
        """
        img_path = self.dataset_path / image_name
        if not img_path.exists():
            return None

        img = cv2.imread(str(img_path))
        if img is None:
            # cv2.imread signals an undecodable file by returning None
            return None
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    def process_image(self, image: np.ndarray, num_augmentations: int = 4) -> Tuple[list, int]:
        """
        Process single image:
        1. Augment (create variations)
        2. Extract textile features
        3. Extract deep features
        4. Combine features

        Returns:
            (list of combined feature vectors, number of vectors). The list
            covers the original image plus each augmented variant.
        """
        features_list = []

        # Step 1: Augment image
        augmented_images = self.augmentor.augment(image, num_augmentations)

        # Step 2 & 3: Extract features from each augmented version
        for aug_img in augmented_images:
            # Hand-crafted textile features (GLCM + LBP + density)
            textile_feat = self.textile_extractor.extract_all_features(aug_img)

            # Deep features from the pretrained network
            deep_feat = self.deep_extractor.extract(aug_img)

            # Combine into one vector: textile features first, then deep
            combined = np.concatenate([textile_feat, deep_feat])
            features_list.append(combined)

        return features_list, len(augmented_images)

    @staticmethod
    def _image_name_for(row, fallback_id) -> str:
        """Resolve the image file name for one label row."""
        if 'image_filename' in row:
            return str(row['image_filename'])
        # int() so ids stored as floats (e.g. 7.0) still format with :03d
        return f"W{int(row.get('Image id', fallback_id)):03d}.jpg"

    def prepare_dataset(
        self,
        num_augmentations: int = 4,
        gsm_column: str = "Specific Mass"
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Prepare complete dataset with augmentation.

        Rows whose image is missing/unreadable, whose label is missing or
        non-numeric, or whose processing fails are skipped with a message.

        Args:
            num_augmentations: Augmented variants per image (the original is
                always included as well).
            gsm_column: Name of the label column in the workbook.

        Returns:
            X: Feature matrix (N_samples, N_features)
            y: GSM labels (N_samples,)
            Returns (None, None) when the labels could not be loaded, and a
            pair of empty arrays when no row could be processed.

        Raises:
            KeyError: If gsm_column is not a column of the label table.
        """
        if self.labels_df is None:
            return None, None

        if gsm_column not in self.labels_df.columns:
            raise KeyError(
                f"GSM column {gsm_column!r} not found; "
                f"available columns: {list(self.labels_df.columns)}"
            )

        all_features = []
        all_labels = []
        all_groups = []
        total = len(self.labels_df)

        print(f"\n📊 Processing {total} samples with {num_augmentations+1} augmentations...")

        # Count positions separately: the DataFrame index need not be 0..N-1
        for position, (idx, row) in enumerate(self.labels_df.iterrows(), start=1):
            try:
                image_name = self._image_name_for(row, idx)
                gsm = float(row[gsm_column])
            except (TypeError, ValueError) as e:
                print(f"  ⚠️ Skipped row {idx} (invalid image id or GSM value: {e})")
                continue

            if np.isnan(gsm):
                print(f"  ⚠️ Skipped {image_name} (missing GSM label)")
                continue

            # Load image
            image = self.load_image(image_name)
            if image is None:
                print(f"  ⚠️ Skipped {image_name} (not found or unreadable)")
                continue

            # Process image (augment + extract features)
            try:
                features_list, aug_count = self.process_image(image, num_augmentations)
            except Exception as e:
                print(f"  ✗ Error processing {image_name}: {e}")
                continue

            # Add features, labels and the source-image id of every variant
            all_features.extend(features_list)
            all_labels.extend([gsm] * aug_count)
            all_groups.extend([position - 1] * aug_count)

            if position % 10 == 0:
                print(f"  ✓ Processed {position}/{total} images")

        if not all_features:
            print("\n✗ No samples could be processed; returning empty arrays")
            self.sample_groups = np.empty((0,), dtype=int)
            return np.empty((0, 0)), np.empty((0,))

        # Convert to numpy arrays
        X = np.array(all_features)
        y = np.array(all_labels)
        self.sample_groups = np.array(all_groups)

        print(f"\n✓ Dataset prepared: {X.shape[0]} samples, {X.shape[1]} features")
        print(f"  GSM range: {y.min():.2f} - {y.max():.2f} g/m²")

        return X, y

print("✓ Data pipeline class loaded")

## Step 7: Feature Normalization

### Why Normalize?
Features have different scales:
- GLCM: 0-1
- LBP: 0-1
- Deep features: unbounded network activations, whose scale can differ widely from the hand-crafted features

Normalization **centers and scales** all features to mean=0, std=1.

This helps regression algorithms work better!

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class FeatureNormalizer:
    """
    Thin wrapper around StandardScaler that tracks whether it was fitted.

    Each feature is standardised as: x_normalized = (x - mean) / std
    """

    def __init__(self):
        self.is_fitted = False
        self.scaler = StandardScaler()

    def fit(self, X: np.ndarray):
        """Estimate per-feature mean and std from X and mark the scaler as fitted."""
        self.scaler.fit(X)
        self.is_fitted = True
        print(f"‚úì Scaler fitted on {X.shape[0]} samples")

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Standardise X with the statistics learned by fit().

        Raises:
            ValueError: If fit() has not been called yet.
        """
        if self.is_fitted:
            return self.scaler.transform(X)
        raise ValueError("Scaler not fitted yet. Call fit() first.")

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """Fit on X, then return the standardised X."""
        self.fit(X)
        standardized = self.transform(X)
        return standardized

print("‚úì Feature normalizer loaded")

## Step 8: Build and Train Regression Models

### Why Multiple Models?
We train 2 models and compare:

1. **Random Forest**
   - Many decision trees voting together
   - Robust, works well with mixed features
   - Fast prediction

2. **Gradient Boosting**
   - Sequential trees learning from mistakes
   - Usually more accurate
   - But can overfit

### Model Evaluation Metrics:
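- **MAE** (Mean Absolute Error): Average prediction error in g/m²
- **RMSE** (Root Mean Squared Error): Penalizes large errors
- **R²** (Coefficient of Determination): Share of the label variance explained by the model (1 is perfect, higher is better; it can be negative on test data when the model is worse than predicting the mean)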

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

class GSMRegressionModels:
    """
    Train and evaluate multiple regression models.

    Attributes:
        models: Name -> unfitted estimator (Random Forest, Gradient Boosting).
        trained_models: Name -> fitted estimator, filled by train(). These are
            the same objects as in ``models`` (fitted in place).
        results: Name -> dict of train/test MAE, RMSE and R², plus the test
            predictions and the test labels.
    """

    def __init__(self, random_state=42):
        """
        Args:
            random_state: Seed passed to both estimators.
        """
        self.random_state = random_state

        # Define models
        self.models = {
            'Random Forest': RandomForestRegressor(
                n_estimators=200,          # 200 trees
                max_depth=20,              # Tree depth
                min_samples_split=5,       # Min samples to split
                min_samples_leaf=2,        # Min samples in leaf
                random_state=random_state,
                n_jobs=-1,                 # Use all CPU cores
                verbose=0
            ),
            'Gradient Boosting': GradientBoostingRegressor(
                n_estimators=200,          # 200 boosting stages
                learning_rate=0.1,         # Learning rate (lower = more careful)
                max_depth=5,               # Tree depth (keep small)
                min_samples_split=5,       # Min samples to split
                min_samples_leaf=2,        # Min samples in leaf
                subsample=0.8,             # Use 80% of samples per iteration
                random_state=random_state,
                verbose=0
            )
        }

        self.trained_models = {}
        self.results = {}

    @staticmethod
    def _score(y_true, y_pred):
        """Return (MAE, RMSE, R²) for one set of predictions."""
        return (
            mean_absolute_error(y_true, y_pred),
            np.sqrt(mean_squared_error(y_true, y_pred)),
            r2_score(y_true, y_pred),
        )

    def train(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_test: np.ndarray = None,
        y_test: np.ndarray = None
    ):
        """
        Train all models and record their metrics in ``self.results``.

        Args:
            X_train, y_train: Training features and labels.
            X_test, y_test: Optional hold-out set; when omitted, the test
                metrics and predictions are stored as None.

        Raises:
            ValueError: If X_test is given without y_test.
        """
        has_test = X_test is not None
        if has_test and y_test is None:
            raise ValueError("y_test must be provided together with X_test")

        print("\n🎯 Training regression models...\n")

        for model_name, model in self.models.items():
            print(f"Training {model_name}...")

            # Train
            model.fit(X_train, y_train)
            self.trained_models[model_name] = model

            # Evaluate on training set
            train_mae, train_rmse, train_r2 = self._score(y_train, model.predict(X_train))

            # Evaluate on test set
            if has_test:
                y_pred_test = model.predict(X_test)
                test_mae, test_rmse, test_r2 = self._score(y_test, y_pred_test)
            else:
                y_pred_test = None
                test_mae = test_rmse = test_r2 = None

            # Store results
            self.results[model_name] = {
                'train_mae': train_mae,
                'train_rmse': train_rmse,
                'train_r2': train_r2,
                'test_mae': test_mae,
                'test_rmse': test_rmse,
                'test_r2': test_r2,
                'predictions': y_pred_test,
                'actual': y_test
            }

            # Print results
            print(f"  ✓ Training - MAE: {train_mae:.2f}, RMSE: {train_rmse:.2f}, R²: {train_r2:.4f}")
            if has_test:
                print(f"  ✓ Testing  - MAE: {test_mae:.2f}, RMSE: {test_rmse:.2f}, R²: {test_r2:.4f}")
            print()

    def get_best_model(self):
        """
        Get best model based on test R² score.

        Falls back to the training R² for a model without test metrics.

        Returns:
            (name, fitted estimator) of the highest-scoring model.

        Raises:
            ValueError: If no model has been trained yet.
        """
        if not self.results:
            raise ValueError("No trained models available. Call train() first.")

        def ranking_score(name):
            entry = self.results[name]
            if entry['test_r2'] is not None:
                return entry['test_r2']
            return entry['train_r2']

        best_name = max(self.results, key=ranking_score)
        return best_name, self.trained_models[best_name]

print("✓ Regression model classes loaded")

## Step 9: Execute Complete Training Pipeline

This is where everything comes together!

The dataset has been automatically downloaded from Kaggle Hub above.

In [None]:
# Run the whole workflow: features -> split -> scaling -> training -> saving.
# The later visualization and summary cells rely on the names defined here
# (trainer, X, y, y_train, y_test, best_name), which this cell previously
# never created.
from sklearn.model_selection import GroupShuffleSplit

print("🚀 Starting complete training pipeline...\n")

NUM_AUGMENTATIONS = 4      # augmented variants per source image
TEST_FRACTION = 0.2        # share of source images held out for testing
RANDOM_STATE = 42
MODEL_DIR = Path('/content/models')

# Initialize pipeline with Kaggle downloaded dataset
pipeline = DataPipeline(DATASET_PATH)

# Build the feature matrix and labels (original + augmented variants)
X, y = pipeline.prepare_dataset(num_augmentations=NUM_AUGMENTATIONS)
if X is None or len(y) == 0:
    raise RuntimeError(
        "No training data could be prepared; check the dataset path and label file."
    )

# Every successfully processed image contributes NUM_AUGMENTATIONS + 1
# consecutive rows. Splitting by source image keeps augmented copies of a test
# image out of the training set, which would otherwise inflate the test scores.
groups = np.arange(len(y)) // (NUM_AUGMENTATIONS + 1)
splitter = GroupShuffleSplit(n_splits=1, test_size=TEST_FRACTION, random_state=RANDOM_STATE)
train_idx, test_idx = next(splitter.split(X, y, groups=groups))
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
print(f"✓ Split: {len(y_train)} training / {len(y_test)} testing samples")

# Fit the scaler on the training data only, then apply it to both sets
normalizer = FeatureNormalizer()
X_train_scaled = normalizer.fit_transform(X_train)
X_test_scaled = normalizer.transform(X_test)

# Train and compare the regression models
trainer = GSMRegressionModels(random_state=RANDOM_STATE)
trainer.train(X_train_scaled, y_train, X_test_scaled, y_test)
best_name, best_model = trainer.get_best_model()
print(f"🏆 Best model: {best_name}")

# Persist the fitted scaler and the best model. The underlying StandardScaler
# is saved rather than the notebook-defined wrapper, so loading it elsewhere
# does not require this notebook's class definitions.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
with open(MODEL_DIR / 'feature_scaler.pkl', 'wb') as f:
    pickle.dump(normalizer.scaler, f)
with open(MODEL_DIR / f"gsm_model_{best_name.replace(' ', '_')}.pkl", 'wb') as f:
    pickle.dump(best_model, f)
print(f"✓ Saved scaler and model to {MODEL_DIR}")

## Step 10: Visualize Results

Let's see how well our model performs!

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (15, 5)

# Only models evaluated on a test set can be plotted
plotted = [
    (model_name, results)
    for model_name, results in trainer.results.items()
    if results['predictions'] is not None
]

if not plotted:
    print("✗ No test-set predictions available; nothing to plot")
else:
    # One panel per plotted model (the previous fixed 1x3 grid left an empty
    # panel). squeeze=False keeps `axes` two-dimensional even for one model.
    fig, axes = plt.subplots(1, len(plotted), figsize=(5 * len(plotted), 4), squeeze=False)

    for ax, (model_name, results) in zip(axes[0], plotted):
        # Scatter plot: Actual vs Predicted
        ax.scatter(
            results['actual'],
            results['predictions'],
            alpha=0.6, s=50
        )

        # Perfect prediction line
        min_val = min(results['actual'].min(), results['predictions'].min())
        max_val = max(results['actual'].max(), results['predictions'].max())
        ax.plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect', lw=2)

        ax.set_xlabel('Actual GSM (g/m²)', fontsize=11)
        ax.set_ylabel('Predicted GSM (g/m²)', fontsize=11)
        ax.set_title(f'{model_name}\nR² = {results["test_r2"]:.4f}', fontsize=12, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.suptitle('Model Performance: Actual vs Predicted GSM', fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig('/content/model_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("✓ Visualization saved as /content/model_comparison.png")

## Step 11: Feature Importance Analysis

Which features are most important for predicting GSM?

This helps understand what the model learned!

In [None]:
# Get feature importance from Random Forest
if 'Random Forest' in trainer.trained_models:
    rf_model = trainer.trained_models['Random Forest']
    importances = rf_model.feature_importances_

    # Get top features (never more than the model actually has)
    top_n = min(15, len(importances))
    top_indices = np.argsort(importances)[-top_n:][::-1]
    top_importances = importances[top_indices]

    # Feature names, in the order the pipeline concatenates them:
    # hand-crafted textile features first, deep features after.
    feature_names = []
    feature_names.extend([f'GLCM_{i}' for i in range(32)])  # GLCM features
    feature_names.extend([f'LBP_{i}' for i in range(59)])    # LBP features
    feature_names.extend(['Warp_Density', 'Weft_Density', 'Yarn_Diameter', 'Total_Density'])  # Density
    # The number of deep features depends on the backbone, so derive it from
    # the fitted model instead of hard-coding it.
    n_deep = max(len(importances) - len(feature_names), 0)
    feature_names.extend([f'DeepFeat_{i}' for i in range(n_deep)])  # Deep features

    top_names = [feature_names[i] for i in top_indices]

    # Plot
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(top_names)), top_importances, color='steelblue')
    plt.yticks(range(len(top_names)), top_names)
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.xlabel('Feature Importance', fontsize=12)
    plt.title(f'Top {top_n} Most Important Features for GSM Prediction', fontsize=13, fontweight='bold')
    plt.tight_layout()
    plt.savefig('/content/feature_importance.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("✓ Feature importance visualization saved")

## Step 12: Summary and Next Steps

### What We Did:
✅ **Data Augmentation**: Created 5x more data from original images
✅ **Feature Engineering**: Extracted 671 features per image (95 textile + 576 deep)
✅ **Normalization**: Scaled features to comparable ranges
✅ **Model Training**: Trained Random Forest & Gradient Boosting
✅ **Evaluation**: Measured MAE, RMSE, R²
✅ **Feature Analysis**: Identified important features

### Results Summary:

In [None]:
# Print comprehensive summary
# Relies on names created by the training cell: X, y, y_train, y_test,
# trainer and best_name.
print("="*60)
print("🎉 TRAINING SUMMARY")
print("="*60)

print("\n📊 Dataset Statistics:")
print(f"  Total samples: {len(y)}")
print(f"  Training samples: {len(y_train)}")
print(f"  Testing samples: {len(y_test)}")
print(f"  GSM range: {y.min():.2f} - {y.max():.2f} g/m²")
print(f"  GSM mean: {y.mean():.2f} g/m²")

# Hand-crafted feature counts are fixed by the textile extractor; the deep
# feature count is whatever remains, so it is derived instead of hard-coded.
n_handcrafted = 32 + 59 + 4
print("\n🔢 Feature Engineering:")
print("  GLCM features: 32")
print("  LBP features: 59")
print("  Textile-specific features: 4 (warp, weft, yarn, density)")
print(f"  Deep learning features: {X.shape[1] - n_handcrafted} (MobileNetV3)")
print(f"  Total features: {X.shape[1]}")

print("\n🎯 Model Performance Comparison:")
print(f"\n{'Model':<20} {'MAE (g/m²)':<15} {'RMSE (g/m²)':<15} {'R² Score':<15}")
print("-" * 65)

for model_name, results in trainer.results.items():
    mae = results['test_mae']
    rmse = results['test_rmse']
    r2 = results['test_r2']
    if mae is None:
        # Model was trained without a test set; numeric formatting would fail
        print(f"{model_name:<20} {'n/a':>14} {'n/a':>14} {'n/a':>14}")
        continue
    print(f"{model_name:<20} {mae:>14.2f} {rmse:>14.2f} {r2:>14.4f}")

best_results = trainer.results[best_name]
print(f"\n🏆 Best Model: {best_name}")
if best_results['test_r2'] is not None:
    # R² is the share of explained variance, not a classification accuracy
    print(f"  R² score: {best_results['test_r2']:.4f} (explains {best_results['test_r2']*100:.1f}% of variance)")
    print(f"  Average Error: ±{best_results['test_mae']:.2f} g/m²")

print("\n💾 Saved Files:")
expected_files = [
    "/content/models/feature_scaler.pkl",
    f"/content/models/gsm_model_{best_name.replace(' ', '_')}.pkl",
    "/content/model_comparison.png",
    "/content/feature_importance.png",
]
for saved_path in expected_files:
    # Report what is really on disk instead of assuming every file was written
    marker = "✓" if Path(saved_path).exists() else "✗ (missing)"
    print(f"  {marker} {saved_path}")

print("\n📥 Download these files to your local machine!")
print("\n" + "="*60)

## Advanced: Hyperparameter Tuning (Optional)

To improve model accuracy further, tune hyperparameters:

In [None]:
# Optional: Hyperparameter tuning using RandomizedSearch
# As written, this cell only prints instructions; the search itself is the
# commented-out code at the bottom.
from sklearn.model_selection import RandomizedSearchCV

# This is optional - only run if you want better accuracy
# Warning: This takes longer to train!

print("\nüîç Hyperparameter Tuning (Optional)")
print("="*50)
print("\nThis searches for best hyperparameters.")
print("Takes ~5-10 minutes but can improve accuracy.")
print("\nUncomment code below to enable:")

# The search below expects X_train and y_train to exist already.
# NOTE(review): with augmented data, plain cv=5 can place variants of the same
# source image in different folds, which presumably inflates the CV score -
# consider a group-aware splitter (e.g. GroupKFold) and confirm.
#
# param_dist = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [15, 20, 25, 30],
#     'min_samples_split': [3, 5, 7],
#     'min_samples_leaf': [1, 2, 4]
# }
#
# rf_search = RandomizedSearchCV(
#     RandomForestRegressor(random_state=42, n_jobs=-1),
#     param_dist,
#     n_iter=20,  # Try 20 random combinations
#     cv=5,  # 5-fold cross validation
#     random_state=42,
#     n_jobs=-1,
#     verbose=1
# )
#
# print("\nSearching for best parameters...")
# rf_search.fit(X_train, y_train)
#
# print(f"\nBest parameters: {rf_search.best_params_}")
# print(f"Best CV score: {rf_search.best_score_:.4f}")