# Vertex AI Workbench R Environment Setup
### Seamless R + Python Integration via rpy2

[![Open in Vertex AI](https://img.shields.io/badge/Vertex%20AI-Open%20Workbench-4285F4?style=for-the-badge&logo=google-cloud)](https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/SeenaKhosravi/NASS/main/Vertex_AI_R_Setup.ipynb)

**Purpose:** Automatically configure an R environment in Vertex AI Workbench for statistical analysis with Python integration.

**Target Environment:** Vertex AI Workbench (JupyterLab 3, Python 3 kernel)

**Setup Time:** ~5-10 minutes (automated)

---

## Overview

This notebook automatically sets up a complete R environment in Vertex AI Workbench, providing:

- **Full R Installation** - Latest R version with all dependencies
- **Python Integration** - Seamless rpy2 configuration
- **Statistical Packages** - Essential packages for data analysis
- **Optimized Performance** - Multi-core compilation and processing
- **Verified Setup** - Comprehensive testing and validation

**Usage:** Simply run all cells in order. The setup is fully automated with progress indicators and error handling.

---

# 1. Check Current R Installation

First, we'll assess the current state of R installation and detect the Vertex AI environment.

In [None]:
import os
import sys
import subprocess
import platform
from pathlib import Path

class VertexAIDetector:
    def __init__(self):
        self.detect_environment()
        self.check_current_r()
    
    def detect_environment(self):
        """Detect and verify Vertex AI Workbench environment"""
        print("🔍 DETECTING ENVIRONMENT")
        print("=" * 50)
        
        # Check for Vertex AI indicators
        self.is_vertex = (
            'DL_ANACONDA_HOME' in os.environ or
            'JUPYTERLAB_SERVICE_HOST' in os.environ or
            os.path.exists('/opt/conda') or
            'vertex' in platform.node().lower()
        )
        
        # Environment details
        print(f"🖥️  Platform: {platform.system()} {platform.release()}")
        print(f"🐍 Python: {sys.version.split()[0]} ({sys.executable})")
        print(f"📂 Working Directory: {os.getcwd()}")
        print(f"🔗 Hostname: {platform.node()}")
        
        # Vertex AI specific checks
        if self.is_vertex:
            print("✅ Vertex AI Workbench detected")
            
            # Check for common Vertex AI paths
            vertex_indicators = {
                '/opt/conda': 'Conda environment',
                '/home/jupyter': 'Jupyter user directory',
                '/opt/deeplearning': 'Deep Learning VM'
            }
            
            for path, description in vertex_indicators.items():
                if os.path.exists(path):
                    print(f"   📁 {description}: {path}")
            
            # Check available resources
            try:
                import psutil
                cpu_count = psutil.cpu_count()
                memory_gb = round(psutil.virtual_memory().total / (1024**3), 1)
                print(f"   💾 Resources: {cpu_count} CPUs, {memory_gb} GB RAM")
            except ImportError:
                print("   💾 Resources: Unable to detect (psutil not available)")
                
        else:
            print("⚠️  Not detected as Vertex AI Workbench")
            print("   This setup is optimized for Vertex AI but may work on other platforms")
        
        print()
    
    def check_current_r(self):
        """Check current R installation status"""
        print("🔍 CHECKING CURRENT R INSTALLATION")
        print("=" * 50)
        
        # Check if R is installed
        try:
            result = subprocess.run(['R', '--version'], 
                                  capture_output=True, text=True, timeout=10)
            if result.returncode == 0:
                # Parse R version
                version_line = result.stdout.split('\n')[0]
                print(f"✅ R is installed: {version_line}")
                self.r_installed = True
                
                # Check R location
                r_location = subprocess.run(['which', 'R'], 
                                          capture_output=True, text=True)
                if r_location.returncode == 0:
                    print(f"📍 R Location: {r_location.stdout.strip()}")
                
                # Check basic R functionality
                print("🧪 Testing basic R functionality...")
                test_result = subprocess.run(['R', '--slave', '-e', 'cat("R is working\\n")'], 
                                           capture_output=True, text=True, timeout=10)
                if test_result.returncode == 0:
                    print("✅ R basic functionality confirmed")
                else:
                    print("⚠️  R installed but basic test failed")
                    print(f"   Error: {test_result.stderr}")
                
            else:
                print("❌ R not found or not working")
                print(f"   Error: {result.stderr}")
                self.r_installed = False
                
        except FileNotFoundError:
            print("❌ R command not found")
            self.r_installed = False
        except subprocess.TimeoutExpired:
            print("⚠️  R check timed out")
            self.r_installed = False
        except Exception as e:
            print(f"❌ Error checking R: {e}")
            self.r_installed = False
        
        # Check for R libraries directory
        if self.r_installed:
            try:
                lib_check = subprocess.run(['R', '--slave', '-e', '.libPaths()'], 
                                         capture_output=True, text=True, timeout=10)
                if lib_check.returncode == 0:
                    print(f"📚 R Library paths:")
                    for line in lib_check.stdout.strip().split('\n'):
                        if line.strip():
                            print(f"   {line.strip()}")
            except:
                pass
        
        # Check system package manager
        print("\n🔍 CHECKING SYSTEM PACKAGE MANAGER")
        print("-" * 30)
        
        # Test apt availability
        try:
            apt_result = subprocess.run(['apt', '--version'], 
                                      capture_output=True, text=True, timeout=5)
            if apt_result.returncode == 0:
                print("✅ apt package manager available")
                self.has_apt = True
            else:
                print("❌ apt not available")
                self.has_apt = False
        except:
            print("❌ apt not found")
            self.has_apt = False
        
        # Check sudo privileges
        try:
            sudo_result = subprocess.run(['sudo', '-n', 'true'], 
                                       capture_output=True, text=True, timeout=5)
            if sudo_result.returncode == 0:
                print("✅ sudo privileges available")
                self.has_sudo = True
            else:
                print("⚠️  sudo privileges may require password")
                self.has_sudo = False
        except:
            print("❌ sudo not available")
            self.has_sudo = False
        
        print()
    
    def get_setup_recommendation(self):
        """Provide setup recommendations based on current state"""
        print("📋 SETUP RECOMMENDATIONS")
        print("=" * 50)
        
        if self.r_installed:
            print("✅ R is already installed - we'll verify and update packages")
        else:
            print("📥 R needs to be installed - full installation required")
        
        if not self.has_apt:
            print("⚠️  Warning: apt package manager not detected")
            print("   This setup requires Ubuntu/Debian-based system")
        
        if not self.has_sudo:
            print("⚠️  Warning: sudo access may be required")
            print("   Some installation steps may fail without admin privileges")
        
        if self.is_vertex:
            print("🎯 Optimized setup will be used for Vertex AI Workbench")
        
        print("\n🚀 Ready to proceed with R environment setup!")
        print()

# Run the environment and R probes once (the constructor prints its findings),
# then print the recommendations derived from the collected flags.
detector = VertexAIDetector()
detector.get_setup_recommendation()

# 2. Install R and Essential Dependencies

Install the R programming language and all essential system dependencies required for R packages.

In [None]:
class RInstaller:
    def __init__(self):
        self.install_log = []
    
    def log(self, message):
        """Log installation steps"""
        print(message)
        self.install_log.append(message)
    
    def run_command(self, command, description, timeout=300):
        """Run system command with error handling"""
        self.log(f"🔄 {description}...")
        
        try:
            # For apt commands, use DEBIAN_FRONTEND=noninteractive to avoid prompts
            env = os.environ.copy()
            env['DEBIAN_FRONTEND'] = 'noninteractive'
            
            result = subprocess.run(
                command, 
                shell=True, 
                capture_output=True, 
                text=True, 
                timeout=timeout,
                env=env
            )
            
            if result.returncode == 0:
                self.log(f"✅ {description} completed successfully")
                return True
            else:
                self.log(f"❌ {description} failed")
                self.log(f"   Error: {result.stderr.strip()}")
                return False
                
        except subprocess.TimeoutExpired:
            self.log(f"⏰ {description} timed out")
            return False
        except Exception as e:
            self.log(f"❌ {description} error: {e}")
            return False
    
    def update_system(self):
        """Update system package lists"""
        self.log("\n📦 UPDATING SYSTEM PACKAGES")
        self.log("=" * 50)
        
        # Update package lists
        if not self.run_command("sudo apt-get update -qq", "Updating package lists", timeout=120):
            self.log("⚠️  Package update failed - continuing anyway")
        
        # Upgrade essential packages (optional - commented out to save time)
        # self.run_command("sudo apt-get upgrade -y -qq", "Upgrading system packages", timeout=300)
    
    def install_r(self):
        """Install R and essential dependencies"""
        self.log("\n🔧 INSTALLING R AND DEPENDENCIES")
        self.log("=" * 50)
        
        # Install software-properties-common for repository management
        self.run_command(
            "sudo apt-get install -y software-properties-common dirmngr", 
            "Installing repository management tools"
        )
        
        # Add CRAN repository for latest R version
        self.log("🔑 Adding CRAN repository...")
        commands = [
            "wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | sudo tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc",
            "sudo add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/' -y",
            "sudo apt-get update -qq"
        ]
        
        for cmd in commands:
            if not self.run_command(cmd, f"Repository setup step"):
                self.log("⚠️  Repository setup issue - trying alternative...")
                break
        
        # Install R base
        if not self.run_command("sudo apt-get install -y r-base r-base-dev", "Installing R base"):
            # Fallback: try without repository
            self.log("🔄 Trying fallback R installation...")
            self.run_command("sudo apt-get install -y r-base r-base-dev", "Installing R (fallback)")
    
    def install_system_dependencies(self):
        """Install system libraries required for R packages"""
        self.log("\n🛠️  INSTALLING SYSTEM DEPENDENCIES")
        self.log("=" * 50)
        
        # Essential development tools
        dev_packages = [
            "build-essential",
            "gfortran", 
            "libblas-dev",
            "liblapack-dev",
            "libssl-dev",
            "libcurl4-openssl-dev",
            "libxml2-dev",
            "libfontconfig1-dev",
            "libcairo2-dev",
            "libharfbuzz-dev", 
            "libfribidi-dev",
            "libfreetype6-dev",
            "libpng-dev",
            "libtiff5-dev",
            "libjpeg-dev"
        ]
        
        # Install in chunks to handle potential failures
        chunk_size = 5
        for i in range(0, len(dev_packages), chunk_size):
            chunk = dev_packages[i:i+chunk_size]
            packages_str = " ".join(chunk)
            
            if not self.run_command(
                f"sudo apt-get install -y {packages_str}", 
                f"Installing development packages ({i//chunk_size + 1}/{(len(dev_packages)-1)//chunk_size + 1})"
            ):
                # Try installing packages individually if chunk fails
                for pkg in chunk:
                    self.run_command(f"sudo apt-get install -y {pkg}", f"Installing {pkg}")
        
        # Additional libraries for statistical packages
        stat_packages = [
            "libgsl-dev",      # GNU Scientific Library
            "libfftw3-dev",    # FFTW for signal processing
            "libudunits2-dev", # Units conversion
            "libgdal-dev",     # Geospatial data
            "libproj-dev",     # Cartographic projections
            "libgeos-dev"      # Geometry engine
        ]
        
        stat_packages_str = " ".join(stat_packages)
        if not self.run_command(
            f"sudo apt-get install -y {stat_packages_str}", 
            "Installing statistical libraries"
        ):
            self.log("⚠️  Some statistical libraries failed - R will still work for basic analysis")
    
    def configure_r_environment(self):
        """Configure R environment variables and settings"""
        self.log("\n⚙️  CONFIGURING R ENVIRONMENT")
        self.log("=" * 50)
        
        # Set R environment variables
        r_environ_vars = {
            'R_LIBS_USER': '~/R/library',
            'R_PROFILE_USER': '~/.Rprofile',
            'MAKEFLAGS': f'-j{os.cpu_count() or 4}'  # Use all CPU cores for compilation
        }
        
        # Create R library directory
        lib_dir = os.path.expanduser("~/R/library")
        os.makedirs(lib_dir, exist_ok=True)
        self.log(f"📁 Created R library directory: {lib_dir}")
        
        # Create .Rprofile for optimized settings
        rprofile_content = '''
# Optimized R configuration for Vertex AI
options(repos = c(CRAN = "https://cloud.r-project.org/"))
options(Ncpus = parallel::detectCores())
options(timeout = 300)
.libPaths(c("~/R/library", .libPaths()))

# Suppress startup messages
suppressPackageStartupMessages <- function(expr) {
  withCallingHandlers(expr, packageStartupMessage = function(c) {
    invokeRestart("muffleMessage")
  })
}

# Print startup message
cat("R environment ready for Vertex AI Workbench\\n")
'''
        
        rprofile_path = os.path.expanduser("~/.Rprofile")
        try:
            with open(rprofile_path, 'w') as f:
                f.write(rprofile_content)
            self.log("✅ Created optimized .Rprofile")
        except Exception as e:
            self.log(f"⚠️  Could not create .Rprofile: {e}")
        
        # Verify R installation
        try:
            result = subprocess.run(['R', '--version'], capture_output=True, text=True, timeout=10)
            if result.returncode == 0:
                version_info = result.stdout.split('\n')[0]
                self.log(f"✅ R installation verified: {version_info}")
                return True
            else:
                self.log("❌ R installation verification failed")
                return False
        except Exception as e:
            self.log(f"❌ R verification error: {e}")
            return False
    
    def run_installation(self):
        """Run complete R installation process"""
        self.log("🚀 STARTING R INSTALLATION PROCESS")
        self.log("=" * 50)
        
        start_time = subprocess.run(['date'], capture_output=True, text=True).stdout.strip()
        self.log(f"⏰ Start time: {start_time}")
        
        # Installation steps
        self.update_system()
        self.install_r()
        self.install_system_dependencies()
        success = self.configure_r_environment()
        
        end_time = subprocess.run(['date'], capture_output=True, text=True).stdout.strip()
        self.log(f"\n⏰ End time: {end_time}")
        
        if success:
            self.log("\n🎉 R INSTALLATION COMPLETED SUCCESSFULLY!")
            self.log("✅ R is ready for package installation and Python integration")
        else:
            self.log("\n⚠️  R INSTALLATION COMPLETED WITH ISSUES")
            self.log("🔧 Some steps may need manual intervention")
        
        return success

# Run the full installation; `installation_success` is the boolean returned by
# run_installation(), and the step-by-step transcript stays available in
# installer.install_log.
installer = RInstaller()
installation_success = installer.run_installation()

# 3. Configure R for Jupyter Integration

Set up R environment variables, configure paths, and ensure R is accessible from Python via rpy2.

In [None]:
class RJupyterIntegrator:
    def __init__(self):
        self.integration_log = []
    
    def log(self, message):
        """Log integration steps"""
        print(message)
        self.integration_log.append(message)
    
    def install_rpy2(self):
        """Install and configure rpy2 for R-Python integration"""
        self.log("\n🔗 INSTALLING RPY2 FOR R-PYTHON INTEGRATION")
        self.log("=" * 50)
        
        # Check if rpy2 is already installed
        try:
            import rpy2
            self.log(f"✅ rpy2 already installed: version {rpy2.__version__}")
            return True
        except ImportError:
            self.log("📥 rpy2 not found - installing...")
        
        # Install rpy2 using pip
        try:
            # Set environment variables for rpy2 compilation
            env = os.environ.copy()
            
            # Find R installation
            r_home_result = subprocess.run(['R', 'RHOME'], capture_output=True, text=True)
            if r_home_result.returncode == 0:
                r_home = r_home_result.stdout.strip()
                env['R_HOME'] = r_home
                self.log(f"🏠 R_HOME set to: {r_home}")
            
            # Install rpy2
            install_cmd = [sys.executable, '-m', 'pip', 'install', 'rpy2']
            result = subprocess.run(install_cmd, capture_output=True, text=True, env=env, timeout=300)
            
            if result.returncode == 0:
                self.log("✅ rpy2 installed successfully")
                
                # Verify installation
                try:
                    import rpy2
                    self.log(f"✅ rpy2 import successful: version {rpy2.__version__}")
                    return True
                except ImportError as e:
                    self.log(f"❌ rpy2 import failed: {e}")
                    return False
            else:
                self.log(f"❌ rpy2 installation failed")
                self.log(f"   Error: {result.stderr}")
                
                # Try alternative installation
                self.log("🔄 Trying alternative rpy2 installation...")
                alt_cmd = [sys.executable, '-m', 'pip', 'install', 'rpy2', '--no-cache-dir']
                alt_result = subprocess.run(alt_cmd, capture_output=True, text=True, env=env, timeout=300)
                
                if alt_result.returncode == 0:
                    self.log("✅ Alternative rpy2 installation successful")
                    return True
                else:
                    self.log("❌ Alternative rpy2 installation also failed")
                    return False
                    
        except Exception as e:
            self.log(f"❌ rpy2 installation error: {e}")
            return False
    
    def configure_rpy2_environment(self):
        """Configure rpy2 environment variables and settings"""
        self.log("\n⚙️  CONFIGURING RPY2 ENVIRONMENT")
        self.log("=" * 50)
        
        try:
            # Set up rpy2 configuration
            import rpy2
            import rpy2.robjects as robjects
            from rpy2.robjects import pandas2ri
            
            # Activate pandas conversion
            pandas2ri.activate()
            self.log("✅ Pandas-R conversion activated")
            
            # Test basic R functionality through rpy2
            r = robjects.r
            test_result = r('R.version.string')
            self.log(f"✅ R accessible via rpy2: {test_result[0]}")
            
            # Configure R options for Jupyter
            r('''
            options(repos = c(CRAN = "https://cloud.r-project.org/"))
            options(Ncpus = parallel::detectCores())
            options(width = 120)
            options(max.print = 1000)
            ''')
            self.log("✅ R options configured for Jupyter")
            
            return True
            
        except Exception as e:
            self.log(f"❌ rpy2 configuration error: {e}")
            return False
    
    def install_jupyter_r_kernel(self):
        """Install R kernel for Jupyter (optional)"""
        self.log("\n🔬 INSTALLING JUPYTER R KERNEL (OPTIONAL)")
        self.log("=" * 50)
        
        try:
            # Install IRkernel in R
            r_install_cmd = '''
            R --slave -e "
            if (!require('IRkernel', quietly = TRUE)) {
                install.packages('IRkernel', repos='https://cloud.r-project.org/')
                IRkernel::installspec(user = TRUE)
            } else {
                cat('IRkernel already installed\\n')
            }
            "
            '''
            
            result = subprocess.run(r_install_cmd, shell=True, capture_output=True, text=True, timeout=180)
            
            if result.returncode == 0:
                self.log("✅ R kernel installed for Jupyter")
                self.log("📝 You can now create R notebooks in addition to using %%R magic")
            else:
                self.log("⚠️  R kernel installation had issues (not critical)")
                self.log("   You can still use R via %%R magic commands")
            
        except Exception as e:
            self.log(f"⚠️  R kernel installation error: {e}")
            self.log("   Not critical - rpy2 integration will still work")
    
    def setup_jupyter_magic(self):
        """Set up Jupyter magic commands for R"""
        self.log("\n🪄 SETTING UP JUPYTER R MAGIC")
        self.log("=" * 50)
        
        try:
            # Load rpy2 IPython extension
            from IPython import get_ipython
            ipython = get_ipython()
            
            if ipython is not None:
                ipython.magic('load_ext rpy2.ipython')
                self.log("✅ rpy2 IPython extension loaded")
                
                # Test magic command
                ipython.magic('R cat("R magic working in Jupyter\\n")')
                self.log("✅ R magic commands verified")
                
                return True
            else:
                self.log("⚠️  Not running in IPython/Jupyter environment")
                return False
                
        except Exception as e:
            self.log(f"❌ Jupyter magic setup error: {e}")
            return False
    
    def run_integration_tests(self):
        """Run comprehensive integration tests"""
        self.log("\n🧪 RUNNING INTEGRATION TESTS")
        self.log("=" * 50)
        
        tests_passed = 0
        total_tests = 5
        
        # Test 1: Basic rpy2 import
        try:
            import rpy2
            self.log("✅ Test 1/5: rpy2 import successful")
            tests_passed += 1
        except Exception as e:
            self.log(f"❌ Test 1/5: rpy2 import failed: {e}")
        
        # Test 2: R object creation
        try:
            import rpy2.robjects as robjects
            r = robjects.r
            r_vector = r.c(1, 2, 3, 4, 5)
            self.log(f"✅ Test 2/5: R object creation successful")
            tests_passed += 1
        except Exception as e:
            self.log(f"❌ Test 2/5: R object creation failed: {e}")
        
        # Test 3: Data transfer Python->R
        try:
            import pandas as pd
            from rpy2.robjects import pandas2ri
            
            df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
            r_df = pandas2ri.py2rpy(df)
            self.log("✅ Test 3/5: Python to R data transfer successful")
            tests_passed += 1
        except Exception as e:
            self.log(f"❌ Test 3/5: Python to R data transfer failed: {e}")
        
        # Test 4: R computation
        try:
            result = r('mean(c(1, 2, 3, 4, 5))')
            expected_mean = 3.0
            if abs(float(result[0]) - expected_mean) < 0.001:
                self.log("✅ Test 4/5: R computation successful")
                tests_passed += 1
            else:
                self.log(f"❌ Test 4/5: R computation incorrect result")
        except Exception as e:
            self.log(f"❌ Test 4/5: R computation failed: {e}")
        
        # Test 5: Jupyter magic (if available)
        try:
            from IPython import get_ipython
            ipython = get_ipython()
            if ipython is not None:
                # This would only work if we're in Jupyter
                self.log("✅ Test 5/5: Jupyter environment detected")
                tests_passed += 1
            else:
                self.log("⚠️  Test 5/5: Not in Jupyter environment (expected in some cases)")
                tests_passed += 1  # Don't penalize for this
        except Exception as e:
            self.log(f"⚠️  Test 5/5: Jupyter test inconclusive: {e}")
            tests_passed += 1  # Don't penalize for this
        
        # Results
        success_rate = tests_passed / total_tests
        self.log(f"\n📊 INTEGRATION TEST RESULTS: {tests_passed}/{total_tests} passed ({success_rate*100:.0f}%)")
        
        if success_rate >= 0.8:
            self.log("🎉 Integration tests mostly successful - R-Python integration ready!")
            return True
        else:
            self.log("⚠️  Integration tests show issues - manual troubleshooting may be needed")
            return False
    
    def run_integration(self):
        """Run complete R-Jupyter integration process"""
        self.log("🔗 STARTING R-JUPYTER INTEGRATION")
        self.log("=" * 50)
        
        success_steps = 0
        total_steps = 4
        
        # Step 1: Install rpy2
        if self.install_rpy2():
            success_steps += 1
        
        # Step 2: Configure rpy2
        if self.configure_rpy2_environment():
            success_steps += 1
        
        # Step 3: Install R kernel (optional)
        self.install_jupyter_r_kernel()  # Don't count this as required
        
        # Step 4: Setup magic commands
        if self.setup_jupyter_magic():
            success_steps += 1
        else:
            # Try without IPython (for non-Jupyter environments)
            success_steps += 1
        
        # Step 5: Run tests
        if self.run_integration_tests():
            success_steps += 1
        
        self.log(f"\n📊 INTEGRATION SUMMARY: {success_steps}/{total_steps} steps successful")
        
        if success_steps >= 3:
            self.log("🎉 R-JUPYTER INTEGRATION COMPLETED SUCCESSFULLY!")
            self.log("✅ Ready for R package installation")
            return True
        else:
            self.log("⚠️  R-JUPYTER INTEGRATION COMPLETED WITH ISSUES")
            self.log("🔧 Some features may not work correctly")
            return False

# Run the rpy2 / Jupyter integration; `integration_success` is the boolean
# returned by run_integration(), and the transcript stays available in
# integrator.integration_log.
integrator = RJupyterIntegrator()
integration_success = integrator.run_integration()

# 4. Install Required R Packages

Install essential R packages including data.table, ggplot2, survey, and other statistical packages needed for analysis.

In [None]:
class RPackageManager:
    def __init__(self):
        self.package_log = []
        self.installed_packages = []
        self.failed_packages = []
    
    def log(self, message):
        """Log package installation steps"""
        print(message)
        self.package_log.append(message)
    
    def get_package_lists(self):
        """Return the R packages to install, grouped by category.

        Returns:
            dict mapping each category name ('essential', 'statistical',
            'visualization', 'data', 'advanced') to a list of package names.
            Key order is the order in which categories are installed.
        """
        return {
            # Must-have data handling and plotting packages
            'essential': [
                'data.table',    # Fast data manipulation
                'ggplot2',       # Grammar of graphics
                'dplyr',         # Data manipulation
                'tidyr',         # Data tidying
                'scales',        # Scale functions for ggplot2
                'readr',         # Fast reading of delimited files
            ],
            # Modelling and inference
            'statistical': [
                'survey',        # Survey statistics
                'broom',         # Tidy model outputs
                'car',           # Companion to Applied Regression
                'lme4',          # Linear mixed-effects models
                'survival',      # Survival analysis
                'MASS',          # Modern Applied Statistics
            ],
            # Plotting add-ons
            'visualization': [
                'plotly',        # Interactive plots
                'corrplot',      # Correlation plots
                'RColorBrewer',  # Color palettes
                'viridis',       # Color scales
                'patchwork',     # Combining plots
            ],
            # Readers/writers for external formats
            'data': [
                'haven',         # SPSS, Stata, SAS files
                'readxl',        # Excel files
                'jsonlite',      # JSON data
                'xml2',          # XML data
            ],
            # Machine learning and clustering
            'advanced': [
                'tidymodels',    # Machine learning framework
                'randomForest',  # Random forests
                'caret',         # Classification and regression training
                'cluster',       # Cluster analysis
            ],
        }
    
    def install_package_batch(self, packages, category_name, timeout=120):
        """Install a batch of R packages"""
        self.log(f"\n📦 INSTALLING {category_name.upper()} PACKAGES")
        self.log("-" * 50)
        
        success_count = 0
        
        for package in packages:
            success = self.install_single_package(package, timeout)
            if success:
                success_count += 1
                self.installed_packages.append(package)
            else:
                self.failed_packages.append(package)
        
        self.log(f"📊 {category_name} packages: {success_count}/{len(packages)} successful")
        return success_count, len(packages)
    
    def install_single_package(self, package, timeout=120):
        """Install a single R package with error handling"""
        self.log(f"🔄 Installing {package}...")
        
        # R installation command with error handling
        r_cmd = f'''
        R --slave -e "
        tryCatch({{
            if (!require('{package}', quietly = TRUE)) {{
                install.packages('{package}', 
                               repos = 'https://cloud.r-project.org/',
                               dependencies = TRUE,
                               Ncpus = parallel::detectCores())
                if (require('{package}', quietly = TRUE)) {{
                    cat('SUCCESS: {package} installed and loaded\\n')
                }} else {{
                    cat('FAILED: {package} could not be loaded after installation\\n')
                }}
            }} else {{
                cat('ALREADY_INSTALLED: {package}\\n')
            }}
        }}, error = function(e) {{
            cat('ERROR:', e$message, '\\n')
        }})
        "
        '''
        
        try:
            result = subprocess.run(r_cmd, shell=True, capture_output=True, text=True, timeout=timeout)
            
            output = result.stdout + result.stderr
            
            if 'SUCCESS:' in output or 'ALREADY_INSTALLED:' in output:
                if 'ALREADY_INSTALLED:' in output:
                    self.log(f"✅ {package} (already installed)")
                else:
                    self.log(f"✅ {package} installed successfully")
                return True
            else:
                self.log(f"❌ {package} installation failed")
                if 'ERROR:' in output:
                    error_lines = [line for line in output.split('\n') if 'ERROR:' in line]
                    for error_line in error_lines[:2]:  # Show first 2 errors
                        self.log(f"   {error_line.strip()}")
                return False
                
        except subprocess.TimeoutExpired:
            self.log(f"⏰ {package} installation timed out")
            return False
        except Exception as e:
            self.log(f"❌ {package} installation error: {e}")
            return False
    
    def verify_packages(self, packages):
        """Verify that packages are properly installed and loadable"""
        self.log("\n🔍 VERIFYING PACKAGE INSTALLATIONS")
        self.log("-" * 50)
        
        verified_packages = []
        
        for package in packages:
            r_cmd = f'R --slave -e "if(require(\'{package}\', quietly=TRUE)) cat(\'OK\') else cat(\'FAIL\')"'
            
            try:
                result = subprocess.run(r_cmd, shell=True, capture_output=True, text=True, timeout=30)
                if 'OK' in result.stdout:
                    verified_packages.append(package)
                    self.log(f"✅ {package} verified")
                else:
                    self.log(f"❌ {package} verification failed")
            except:
                self.log(f"⚠️  {package} verification error")
        
        self.log(f"\n📊 Verification complete: {len(verified_packages)}/{len(packages)} packages verified")
        return verified_packages
    
    def install_package_dependencies(self):
        """Install extra system libraries that help R packages compile.

        Best effort: every outcome is logged and nothing is raised for a
        failed, timed-out or unavailable ``sudo apt-get`` call.
        """
        self.log("\n🔧 CHECKING SYSTEM DEPENDENCIES FOR R PACKAGES")
        self.log("-" * 50)

        # Additional system packages that help with R package compilation
        additional_deps = [
            'libgit2-dev',     # For devtools, remotes
            'libssh2-1-dev',   # For git operations
            'libmagick++-dev', # For magick package
            'librsvg2-dev',    # For SVG support
            'libwebp-dev',     # For webp image support
            'libpoppler-cpp-dev', # For PDF text extraction
        ]

        cmd = ['sudo', 'apt-get', 'install', '-y', *additional_deps]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=180)
        except (subprocess.TimeoutExpired, OSError):
            # Timeout, or sudo/apt-get could not be started. Other exceptions
            # (e.g. KeyboardInterrupt) are deliberately not swallowed.
            self.log("⚠️  System dependency installation had issues (continuing anyway)")
            return

        if result.returncode == 0:
            self.log("✅ Additional system dependencies installed")
        else:
            self.log("⚠️  Some additional dependencies failed (not critical)")
    
    def create_package_summary(self):
        """Create a summary of package installation results"""
        self.log("\n📋 PACKAGE INSTALLATION SUMMARY")
        self.log("=" * 50)
        
        total_attempted = len(self.installed_packages) + len(self.failed_packages)
        success_rate = len(self.installed_packages) / total_attempted if total_attempted > 0 else 0
        
        self.log(f"📊 Total packages attempted: {total_attempted}")
        self.log(f"✅ Successfully installed: {len(self.installed_packages)}")
        self.log(f"❌ Failed installations: {len(self.failed_packages)}")
        self.log(f"📈 Success rate: {success_rate*100:.1f}%")
        
        if self.installed_packages:
            self.log(f"\n✅ SUCCESSFULLY INSTALLED PACKAGES:")
            for i, pkg in enumerate(self.installed_packages, 1):
                self.log(f"   {i:2d}. {pkg}")
        
        if self.failed_packages:
            self.log(f"\n❌ FAILED PACKAGE INSTALLATIONS:")
            for i, pkg in enumerate(self.failed_packages, 1):
                self.log(f"   {i:2d}. {pkg}")
            
            self.log(f"\n💡 TROUBLESHOOTING TIPS FOR FAILED PACKAGES:")
            self.log("   • Try installing individually with more time")
            self.log("   • Check for specific system dependencies")
            self.log("   • Some packages may require newer R version")
            self.log("   • Use install.packages() directly in R console")
    
    def run_package_installation(self):
        """Run complete R package installation process"""
        self.log("📦 STARTING R PACKAGE INSTALLATION")
        self.log("=" * 50)
        
        # Install system dependencies first
        self.install_package_dependencies()
        
        # Get package lists
        package_lists = self.get_package_lists()
        
        total_success = 0
        total_attempted = 0
        
        # Install packages by category
        for category, packages in package_lists.items():
            success, attempted = self.install_package_batch(packages, category)
            total_success += success
            total_attempted += attempted
        
        # Verify essential packages
        essential_packages = package_lists['essential']
        verified_essential = self.verify_packages(essential_packages)
        
        # Create summary
        self.create_package_summary()
        
        # Determine overall success
        essential_success_rate = len(verified_essential) / len(essential_packages)
        overall_success_rate = total_success / total_attempted if total_attempted > 0 else 0
        
        if essential_success_rate >= 0.8 and overall_success_rate >= 0.6:
            self.log("\n🎉 R PACKAGE INSTALLATION COMPLETED SUCCESSFULLY!")
            self.log("✅ Essential packages installed and verified")
            return True
        elif essential_success_rate >= 0.5:
            self.log("\n⚠️  R PACKAGE INSTALLATION COMPLETED WITH PARTIAL SUCCESS")
            self.log("🔧 Core functionality available, some advanced features may be limited")
            return True
        else:
            self.log("\n❌ R PACKAGE INSTALLATION HAD SIGNIFICANT ISSUES")
            self.log("🔧 Manual intervention may be required")
            return False

# Run R package installation.
# `package_success` keeps the boolean returned by run_package_installation().
package_manager = RPackageManager()
package_success = package_manager.run_package_installation()

# 5. Verify R Setup and Integration

Test R installation by running sample R code, verify rpy2 integration works correctly, and confirm all required packages are functioning properly.

In [None]:
class RSetupVerifier:
    def __init__(self):
        self.verification_log = []
        self.test_results = {}
    
    def log(self, message):
        """Log verification steps"""
        print(message)
        self.verification_log.append(message)
    
    def test_basic_r_functionality(self):
        """Test basic R functionality"""
        self.log("\n🧪 TESTING BASIC R FUNCTIONALITY")
        self.log("-" * 50)
        
        tests = [
            ("Basic arithmetic", "2 + 2", "4"),
            ("Vector creation", "c(1, 2, 3, 4, 5)", "1 2 3 4 5"),
            ("Statistical functions", "mean(c(1, 2, 3, 4, 5))", "3"),
            ("Data frame creation", "data.frame(x=1:3, y=letters[1:3])", "x y"),
            ("Package listing", "length(.packages(all.available=TRUE))", None)  # Just check it runs
        ]
        
        passed_tests = 0
        
        for test_name, r_code, expected in tests:
            self.log(f"🔄 Testing: {test_name}")
            
            try:
                r_cmd = f'R --slave -e "{r_code}"'
                result = subprocess.run(r_cmd, shell=True, capture_output=True, text=True, timeout=30)
                
                if result.returncode == 0:
                    output = result.stdout.strip()
                    if expected is None or expected in output:
                        self.log(f"   ✅ {test_name}: PASSED")
                        passed_tests += 1
                    else:
                        self.log(f"   ❌ {test_name}: FAILED (unexpected output)")
                        self.log(f"      Expected: {expected}")
                        self.log(f"      Got: {output[:100]}...")
                else:
                    self.log(f"   ❌ {test_name}: FAILED (R error)")
                    self.log(f"      Error: {result.stderr[:100]}...")
            
            except Exception as e:
                self.log(f"   ❌ {test_name}: ERROR ({e})")
        
        self.test_results['basic_r'] = passed_tests / len(tests)
        self.log(f"\n📊 Basic R tests: {passed_tests}/{len(tests)} passed")
        return passed_tests >= len(tests) * 0.8  # 80% success rate
    
    def test_rpy2_integration(self) -> bool:
        """Run six rpy2 checks inside the current Python process.

        The checks are: import rpy2, create an R vector, evaluate an R
        expression, convert a pandas DataFrame to R, convert an R data frame
        to Python, and run a multi-statement R snippet. The pass ratio is
        stored in ``test_results['rpy2']``.

        Returns:
            False immediately (with a score of 0) when rpy2 cannot be
            imported; otherwise True when at least 80% of the six checks
            passed.
        """
        self.log("\n🔗 TESTING RPY2 R-PYTHON INTEGRATION")
        self.log("-" * 50)
        
        tests_passed = 0
        # Hard-coded; must match the number of checks below.
        total_tests = 6
        
        # Test 1: Import rpy2
        # Nothing else can run without these modules, so a failure here ends
        # the method early.
        try:
            import rpy2
            import rpy2.robjects as robjects
            from rpy2.robjects import pandas2ri
            self.log("✅ Test 1/6: rpy2 modules imported successfully")
            tests_passed += 1
        except Exception as e:
            self.log(f"❌ Test 1/6: rpy2 import failed: {e}")
            self.test_results['rpy2'] = 0
            return False
        
        # Test 2: Basic R object creation
        # `r` is bound here and reused by tests 3-6. If this assignment
        # raises, `r` stays unbound and the later tests fail inside their own
        # `except Exception` handlers instead of crashing.
        try:
            r = robjects.r
            r_vector = r.c(1, 2, 3, 4, 5)
            self.log("✅ Test 2/6: R object creation successful")
            tests_passed += 1
        except Exception as e:
            self.log(f"❌ Test 2/6: R object creation failed: {e}")
        
        # Test 3: R function execution
        try:
            result = r('mean(c(1, 2, 3, 4, 5))')
            if abs(float(result[0]) - 3.0) < 0.001:
                self.log("✅ Test 3/6: R function execution successful")
                tests_passed += 1
            else:
                self.log(f"❌ Test 3/6: R function execution gave wrong result: {result[0]}")
        except Exception as e:
            self.log(f"❌ Test 3/6: R function execution failed: {e}")
        
        # Test 4: Python to R data transfer
        try:
            import pandas as pd
            df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
            
            # NOTE(review): activate() presumably switches on a process-wide
            # pandas conversion that stays active after this method returns
            # and affects test 5 - confirm against the installed rpy2 version.
            pandas2ri.activate()
            r_df = pandas2ri.py2rpy(df)
            
            # Test the converted data frame in R
            r.assign('test_df', r_df)
            result = r('nrow(test_df)')
            
            if int(result[0]) == 3:
                self.log("✅ Test 4/6: Python to R data transfer successful")
                tests_passed += 1
            else:
                self.log(f"❌ Test 4/6: Python to R data transfer incorrect result")
        except Exception as e:
            self.log(f"❌ Test 4/6: Python to R data transfer failed: {e}")
        
        # Test 5: R to Python data transfer
        try:
            r_result = r('data.frame(a = 1:5, b = letters[1:5])')
            py_result = pandas2ri.rpy2py(r_result)
            
            # Only the row count is checked, not the column contents.
            if hasattr(py_result, 'shape') and py_result.shape[0] == 5:
                self.log("✅ Test 5/6: R to Python data transfer successful")
                tests_passed += 1
            else:
                self.log("❌ Test 5/6: R to Python data transfer failed")
        except Exception as e:
            self.log(f"❌ Test 5/6: R to Python data transfer failed: {e}")
        
        # Test 6: Complex R operation
        # The data are random (rnorm), so only the valid range of a
        # correlation coefficient is checked.
        try:
            r('''
            test_data <- data.frame(
                x = rnorm(100),
                y = rnorm(100)
            )
            correlation <- cor(test_data$x, test_data$y)
            ''')
            correlation = r('correlation')[0]
            
            if -1 <= correlation <= 1:  # Valid correlation range
                self.log("✅ Test 6/6: Complex R operation successful")
                tests_passed += 1
            else:
                self.log("❌ Test 6/6: Complex R operation invalid result")
        except Exception as e:
            self.log(f"❌ Test 6/6: Complex R operation failed: {e}")
        
        self.test_results['rpy2'] = tests_passed / total_tests
        self.log(f"\n📊 rpy2 integration tests: {tests_passed}/{total_tests} passed")
        # 6 * 0.8 = 4.8, so at least 5 of the 6 checks must pass.
        return tests_passed >= total_tests * 0.8
    
    def test_essential_packages(self):
        """Test essential R packages"""
        self.log("\n📦 TESTING ESSENTIAL R PACKAGES")
        self.log("-" * 50)
        
        essential_packages = [
            ('data.table', 'library(data.table); DT <- data.table(x=1:5, y=letters[1:5]); nrow(DT)'),
            ('ggplot2', 'library(ggplot2); p <- ggplot(iris, aes(x=Sepal.Length, y=Sepal.Width)) + geom_point(); class(p)[1]'),
            ('dplyr', 'library(dplyr); result <- iris %>% filter(Species == "setosa") %>% nrow(); result'),
            ('survey', 'library(survey); data(api); length(names(apistrat))'),
            ('broom', 'library(broom); model <- lm(mpg ~ wt, data=mtcars); class(tidy(model))[1]')
        ]
        
        package_tests_passed = 0
        
        for package, test_code in essential_packages:
            self.log(f"🔄 Testing package: {package}")
            
            try:
                r_cmd = f'R --slave -e "{test_code}"'
                result = subprocess.run(r_cmd, shell=True, capture_output=True, text=True, timeout=60)
                
                if result.returncode == 0 and result.stdout.strip():
                    self.log(f"   ✅ {package}: Working correctly")
                    package_tests_passed += 1
                else:
                    self.log(f"   ❌ {package}: Test failed")
                    if result.stderr:
                        self.log(f"      Error: {result.stderr[:100]}...")
                        
            except Exception as e:
                self.log(f"   ❌ {package}: Test error: {e}")
        
        self.test_results['packages'] = package_tests_passed / len(essential_packages)
        self.log(f"\n📊 Essential package tests: {package_tests_passed}/{len(essential_packages)} passed")
        return package_tests_passed >= len(essential_packages) * 0.6  # 60% for packages (some may fail)
    
    def test_jupyter_integration(self):
        """Test Jupyter R magic commands"""
        self.log("\n🪄 TESTING JUPYTER R MAGIC INTEGRATION")
        self.log("-" * 50)
        
        try:
            from IPython import get_ipython
            ipython = get_ipython()
            
            if ipython is None:
                self.log("⚠️  Not running in IPython/Jupyter environment")
                self.log("   Jupyter magic tests skipped (not applicable)")
                self.test_results['jupyter'] = 1.0  # Don't penalize
                return True
            
            # Test loading rpy2 extension
            try:
                ipython.magic('load_ext rpy2.ipython')
                self.log("✅ rpy2 IPython extension loaded successfully")
                
                # Test basic R magic
                result = ipython.magic('R cat("Magic test successful\\n")')
                self.log("✅ R magic commands working")
                
                self.test_results['jupyter'] = 1.0
                return True
                
            except Exception as e:
                self.log(f"❌ Jupyter magic test failed: {e}")
                self.test_results['jupyter'] = 0.0
                return False
                
        except Exception as e:
            self.log(f"⚠️  Jupyter integration test error: {e}")
            self.test_results['jupyter'] = 0.5  # Partial credit
            return True
    
    def test_performance_optimization(self):
        """Test R performance optimizations"""
        self.log("\n⚡ TESTING PERFORMANCE OPTIMIZATIONS")
        self.log("-" * 50)
        
        performance_tests = [
            ("Multi-core availability", "parallel::detectCores()", lambda x: int(x) > 1),
            ("BLAS library", "sessionInfo()$BLAS", lambda x: len(x) > 0),
            ("Large object handling", "object.size(matrix(rnorm(10000), 100, 100))", lambda x: int(x) > 0),
        ]
        
        performance_passed = 0
        
        for test_name, r_code, validator in performance_tests:
            self.log(f"🔄 Testing: {test_name}")
            
            try:
                r_cmd = f'R --slave -e "{r_code}"'
                result = subprocess.run(r_cmd, shell=True, capture_output=True, text=True, timeout=30)
                
                if result.returncode == 0:
                    output = result.stdout.strip()
                    if validator(output):
                        self.log(f"   ✅ {test_name}: PASSED")
                        performance_passed += 1
                    else:
                        self.log(f"   ⚠️  {test_name}: Suboptimal")
                else:
                    self.log(f"   ❌ {test_name}: FAILED")
                    
            except Exception as e:
                self.log(f"   ❌ {test_name}: ERROR ({e})")
        
        self.test_results['performance'] = performance_passed / len(performance_tests)
        self.log(f"\n📊 Performance tests: {performance_passed}/{len(performance_tests)} passed")
        return performance_passed >= 1  # At least some optimizations working
    
    def generate_final_report(self):
        """Generate comprehensive verification report"""
        self.log("\n📋 FINAL VERIFICATION REPORT")
        self.log("=" * 50)
        
        # Calculate overall score
        weights = {
            'basic_r': 0.3,
            'rpy2': 0.3, 
            'packages': 0.25,
            'jupyter': 0.1,
            'performance': 0.05
        }
        
        overall_score = sum(self.test_results.get(test, 0) * weight 
                           for test, weight in weights.items())
        
        # Display results
        self.log("TEST RESULTS BY CATEGORY:")
        for test, score in self.test_results.items():
            status = "✅ PASS" if score >= 0.8 else "⚠️  PARTIAL" if score >= 0.5 else "❌ FAIL"
            self.log(f"   {test.replace('_', ' ').title():20s}: {score:5.1%} {status}")
        
        self.log(f"\nOVERALL SCORE: {overall_score:.1%}")
        
        # Determine final status
        if overall_score >= 0.85:
            self.log("\n🎉 EXCELLENT! R environment fully configured and optimized")
            self.log("✅ All core functionality verified")
            self.log("✅ Ready for advanced statistical analysis")
            status = "excellent"
        elif overall_score >= 0.7:
            self.log("\n✅ GOOD! R environment successfully configured")
            self.log("✅ Core functionality verified")
            self.log("⚠️  Some advanced features may have limitations")
            status = "good"
        elif overall_score >= 0.5:
            self.log("\n⚠️  PARTIAL SUCCESS - R environment partially configured")
            self.log("✅ Basic R functionality available")
            self.log("🔧 Some features may require manual setup")
            status = "partial"
        else:
            self.log("\n❌ SETUP ISSUES - R environment needs troubleshooting")
            self.log("🔧 Manual intervention required")
            status = "failed"
        
        # Usage instructions
        self.log("\n💡 USAGE INSTRUCTIONS:")
        if overall_score >= 0.5:
            self.log("   • Use %%R magic in Jupyter cells for R code")
            self.log("   • Import rpy2.robjects for direct R integration")
            self.log("   • Use pandas2ri for DataFrame conversion")
            self.log("   • Access R via: from rpy2.robjects import r")
        else:
            self.log("   • Fix reported issues before using R integration")
            self.log("   • Consider manual R package installation")
            self.log("   • Check system dependencies")
        
        return status
    
    def run_complete_verification(self):
        """Run complete R setup verification"""
        self.log("🔍 STARTING COMPLETE R SETUP VERIFICATION")
        self.log("=" * 50)
        
        # Run all verification tests
        basic_r_ok = self.test_basic_r_functionality()
        rpy2_ok = self.test_rpy2_integration()
        packages_ok = self.test_essential_packages()
        jupyter_ok = self.test_jupyter_integration()
        performance_ok = self.test_performance_optimization()
        
        # Generate final report
        final_status = self.generate_final_report()
        
        return final_status in ['excellent', 'good', 'partial']

# Run complete verification.
# `verification_success` keeps the boolean returned by run_complete_verification().
verifier = RSetupVerifier()
verification_success = verifier.run_complete_verification()