# Data Analyzer Agent

This notebook implements the data analysis component: it fits a TabNet regressor to each experimental-results CSV file and records the resulting feature importances and predictions.

In [None]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from typing import Dict, Tuple
import torch
from utils.config import setup_logging, MODEL_CONFIGS, EXPERIMENTAL_DATA_DIR, OUTPUTS_DIR
from utils.helpers import save_json, save_dataframe
from tqdm import tqdm

In [None]:
# Setup logging
# Module-level logger obtained from the project helper `setup_logging` under the
# name 'data_analyzer'; `analyze_experimental_data` below writes its progress
# and error messages through it.
logger = setup_logging('data_analyzer')

class DataAnalyzer:
    """Fit a TabNet regressor on the numeric columns of a DataFrame.

    The last numeric column is treated as the regression target and all
    preceding numeric columns as features. TabNet hyper-parameters are read
    from ``MODEL_CONFIGS['data_analysis']['tabnet_params']``.
    """

    def __init__(self):
        # Kept as an attribute for callers; it is not passed to the model below.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tabnet_params = MODEL_CONFIGS['data_analysis']['tabnet_params']
        # Re-fitted on every call to prepare_data.
        self.scaler = StandardScaler()

    def prepare_data(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """Prepare data for TabNet analysis.

        Args:
            df: Input table. Only its numeric columns are used.

        Returns:
            ``(X, y)`` where ``X`` holds every scaled numeric column except
            the last and ``y`` is the scaled last numeric column (1-D).

        Raises:
            ValueError: If ``df`` has no rows or fewer than two numeric
                columns (one target plus at least one feature is required).
        """
        # Identify numeric columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        # Fail early with a clear message instead of handing an empty feature
        # matrix to the scaler / model.
        if len(numeric_cols) < 2:
            raise ValueError(
                'At least two numeric columns are required '
                f'(features + target), found {len(numeric_cols)}'
            )
        if len(df) == 0:
            raise ValueError('Dataset contains no rows')

        # Scale numeric features.
        # NOTE(review): the scaler is fitted on the full dataset (target
        # included) before the train/test split performed in analyze_dataset.
        scaled = self.scaler.fit_transform(df[numeric_cols])

        # Use last column as target variable
        y = scaled[:, -1]
        X = scaled[:, :-1]

        return X, y

    def analyze_dataset(self, df: pd.DataFrame) -> Dict:
        """Analyze dataset using TabNet.

        Args:
            df: Input table; see ``prepare_data`` for how it is interpreted.

        Returns:
            A dict with keys ``'feature_importance'`` (list of floats, one per
            feature), ``'predictions'`` and ``'actual_values'`` (flat lists for
            the held-out 20% split) and ``'model_params'`` (the TabNet
            parameters used).

        Raises:
            ValueError: Propagated from ``prepare_data`` for unusable input.
        """
        X, y = self.prepare_data(df)

        # TabNetRegressor only accepts 2-D targets of shape
        # (n_samples, n_targets); a 1-D vector is rejected with a ValueError.
        y = y.reshape(-1, 1)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Initialize and train TabNet
        model = TabNetRegressor(**self.tabnet_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            max_epochs=100,
            patience=10,
            batch_size=32
        )

        # Get feature importance
        feature_importance = model.feature_importances_

        # Make predictions
        y_pred = model.predict(X_test)

        # Flatten the single-target column so the output stays a flat list of
        # numbers per key.
        return {
            'feature_importance': feature_importance.tolist(),
            'predictions': np.asarray(y_pred).ravel().tolist(),
            'actual_values': y_test.ravel().tolist(),
            'model_params': self.tabnet_params
        }

In [None]:
def analyze_experimental_data() -> Dict:
    """Analyze all experimental data files.

    Every ``*.csv`` file directly inside ``EXPERIMENTAL_DATA_DIR`` is loaded
    and passed to ``DataAnalyzer.analyze_dataset``. Each successful result is
    written to ``OUTPUTS_DIR / 'analysis_<stem>.json'`` and the combined
    mapping to ``OUTPUTS_DIR / 'experimental_analysis.json'``.

    A failure on one file is logged (with traceback) and does not stop the
    remaining files from being processed.

    Returns:
        Mapping of file stem to the analysis result dict for each file that
        was analyzed without raising.
    """
    logger.info('Starting experimental data analysis')

    analyzer = DataAnalyzer()
    analysis_results = {}

    # glob() order depends on the filesystem; sort so that processing order
    # and the key order of the combined output are reproducible.
    data_files = sorted(EXPERIMENTAL_DATA_DIR.glob('*.csv'))
    if not data_files:
        logger.warning(f'No CSV files found in {EXPERIMENTAL_DATA_DIR}')

    # Process all CSV files in experimental data directory
    for data_file in tqdm(data_files, desc='Analyzing datasets'):
        try:
            # Load dataset
            df = pd.read_csv(data_file)

            # Analyze dataset
            results = analyzer.analyze_dataset(df)

            # Store results
            analysis_results[data_file.stem] = results
            logger.info(f'Successfully analyzed dataset: {data_file.name}')

            # Save individual results
            output_path = OUTPUTS_DIR / f'analysis_{data_file.stem}.json'
            save_json(results, output_path)

        except Exception as e:
            # Best-effort batch: keep going, but record the traceback so the
            # failure can actually be diagnosed from the log.
            logger.exception(f'Error analyzing dataset {data_file.name}: {str(e)}')

    # Save combined results
    output_path = OUTPUTS_DIR / 'experimental_analysis.json'
    save_json(analysis_results, output_path)
    logger.info(f'Saved combined analysis results to {output_path}')

    return analysis_results

In [None]:
if __name__ == "__main__":
    # Execute the full analysis over every experimental dataset
    results = analyze_experimental_data()

    # Report the dataset count, then the per-feature importances of each one
    print("Analyzed {} datasets".format(len(results)))
    for name, report in results.items():
        print("\nDataset: {}".format(name))
        print("Feature Importance:")
        for position, score in enumerate(report['feature_importance']):
            print("Feature {}: {:.3f}".format(position, score))