# 🔍 Setup & Data Verification
**Automatically detect your repo path and verify data availability**

Run this notebook FIRST to:
1. Find your cloned repository
2. Verify backend/results data exists
3. Set up all necessary paths
4. Create Delta Lake directories

## Step 1: Auto-Detect Repository Path

In [None]:
import os
import json

print("=" * 80)
print("üîç AUTO-DETECTING TAMU-DATATHON REPOSITORY")
print("=" * 80)

# Candidate roots; each is expected to hold per-user folders that in turn
# contain cloned repositories: <base>/<user>/<repo>/backend
possible_paths = [
    "/Workspace/Repos",
    "/Repos",
    "/Workspace/Users"
]


def _list_dir(path):
    """Return the sorted entries of *path*, or an empty list if unlistable.

    Missing paths, plain files and permission errors all surface as OSError
    from os.listdir(); they are skipped so that one bad folder does not hide
    a repository living in a sibling folder.
    """
    try:
        # Sorted because os.listdir() order is arbitrary; this keeps the
        # detected repository stable between runs when several match.
        return sorted(os.listdir(path))
    except OSError:
        return []


def _find_repo(bases):
    """Return (repo_path, backend_path) for the first matching repository.

    A repository matches when its folder name contains "TAMU" or "datathon"
    (case-insensitive) and it has a "backend" sub-directory.
    Returns (None, None) when nothing matches under any base.
    """
    for base in bases:
        for user_dir in _list_dir(base):
            user_path = os.path.join(base, user_dir)
            for repo in _list_dir(user_path):
                if not ("TAMU" in repo.upper() or "datathon" in repo.lower()):
                    continue
                potential_repo = os.path.join(user_path, repo)
                backend_check = os.path.join(potential_repo, "backend")
                if os.path.isdir(backend_check):
                    return potential_repo, backend_check
    return None, None


repo_path, backend_path = _find_repo(possible_paths)

if repo_path:
    print(f"‚úÖ FOUND REPOSITORY!")
    print(f"   Repository: {repo_path}")
    print(f"   Backend: {backend_path}")
else:
    print("‚ùå Repository not found. Please provide manual path.")
    # FileNotFoundError is a subclass of Exception, so existing handlers that
    # catch Exception keep working.
    raise FileNotFoundError("Repository not found")

## Step 2: Verify Data Files

In [None]:
# Locations of the classification results and the learning database,
# both expected under <backend>/results.
results_path = os.path.join(backend_path, "results")
learning_db_path = os.path.join(results_path, "learning_database.json")

print("\n" + "=" * 80)
print("üìÅ VERIFYING DATA FILES")
print("=" * 80)

# Every *.json file in results/ other than the learning database is counted
# as a classification file.
json_files = []
if os.path.isdir(results_path):
    # Sorted because os.listdir() order is arbitrary; this makes the sample
    # file shown below reproducible between runs.
    json_files = sorted(
        name
        for name in os.listdir(results_path)
        if name.endswith('.json') and name != 'learning_database.json'
    )

    print(f"‚úÖ Results directory: {results_path}")
    print(f"   Classification files: {len(json_files)}")

    if json_files:
        sample_file = os.path.join(results_path, json_files[0])
        try:
            with open(sample_file, 'r', encoding='utf-8') as f:
                sample_data = json.load(f)
        except (OSError, ValueError) as err:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            # The sample is only a preview, so report it and keep verifying.
            print(f"   Sample file could not be read: {json_files[0]} ({err})")
        else:
            print(f"   Sample file: {json_files[0]}")
            if isinstance(sample_data, dict):
                print(f"   Sample keys: {list(sample_data.keys())[:5]}")
            else:
                # Top-level JSON arrays/scalars have no keys to preview.
                print(f"   Sample top-level type: {type(sample_data).__name__}")
else:
    print(f"‚ùå Results directory not found: {results_path}")

# Load the learning database and count the items in its 'learning_entries'
# array. learning_data stays [] whenever the entries cannot be obtained.
learning_data = []
if os.path.isfile(learning_db_path):
    try:
        with open(learning_db_path, 'r', encoding='utf-8') as f:
            learning_raw = json.load(f)
    except (OSError, ValueError) as err:
        print(f"\n‚ùå Learning database could not be read: {learning_db_path} ({err})")
    else:
        if isinstance(learning_raw, dict):
            # `or []` guards against an explicit null value for the key.
            learning_data = learning_raw.get('learning_entries') or []
        print(f"\n‚úÖ Learning database: {learning_db_path}")
        print(f"   Total entries: {len(learning_data)}")
        if not isinstance(learning_raw, dict):
            print("   (top-level JSON is not an object; no 'learning_entries' key available)")
else:
    print(f"\n‚ö†Ô∏è  Learning database not found: {learning_db_path}")

## Step 3: Create Delta Lake Directory Structure

In [None]:
banner = "=" * 80
print("\n" + banner)
print("üèóÔ∏è  CREATING DELTA LAKE STRUCTURE")
print(banner)

# Root folder for the bronze/silver/gold layers. A /tmp location is used
# instead of a /mnt/ mount.
# NOTE(review): dbutils.fs and os.makedirs may resolve "/tmp/..." on different
# filesystems (DBFS vs. driver-local disk) — confirm which one later notebooks
# actually read from.
DELTA_BASE = "/tmp/tamu-datathon-delta"

# Alternative for workspaces with Unity Catalog: point at a Volume instead.
# DELTA_BASE = "/Volumes/catalog_name/schema_name/volume_name/tamu-datathon-delta"

# One sub-folder per layer, all directly under DELTA_BASE.
BRONZE_PATH, SILVER_PATH, GOLD_PATH = (
    f"{DELTA_BASE}/{layer}" for layer in ("bronze", "silver", "gold")
)
layer_paths = [BRONZE_PATH, SILVER_PATH, GOLD_PATH]

# First choice: the Databricks-provided dbutils helper. Any failure (including
# dbutils not being defined at all) switches to plain os.makedirs.
try:
    for layer_path in layer_paths:
        dbutils.fs.mkdirs(layer_path)
        print(f"‚úÖ Created: {layer_path}")
    print(f"\n‚úÖ Delta Lake structure ready at: {DELTA_BASE}")
except Exception:
    print("‚ö†Ô∏è  dbutils.fs.mkdirs failed, using local filesystem")
    for layer_path in layer_paths:
        # exist_ok=True so re-running the cell is harmless.
        os.makedirs(layer_path, exist_ok=True)
        print(f"‚úÖ Created (local): {layer_path}")
    print("\n‚úÖ Delta Lake structure ready!")

## Step 4: Save Configuration

In [None]:
# Bundle every path and count computed by the earlier cells into a single
# mapping. Key order is kept as-is because it determines the JSON layout.
config = dict(
    repo_path=repo_path,
    backend_path=backend_path,
    results_path=results_path,
    learning_db_path=learning_db_path,
    delta_base=DELTA_BASE,
    bronze_path=BRONZE_PATH,
    silver_path=SILVER_PATH,
    gold_path=GOLD_PATH,
    num_classification_files=len(json_files),
    num_learning_entries=len(learning_data),
)

# Persist the mapping as pretty-printed JSON with plain open(), i.e. on the
# local filesystem.
# NOTE(review): confirm the follow-up notebooks read this same /tmp location.
config_path = "/tmp/tamu-datathon-config.json"
with open(config_path, 'w') as config_file:
    json.dump(config, config_file, indent=2)

rule = "=" * 80
print("\n" + rule)
print("üíæ CONFIGURATION SAVED")
print(rule)
# Echo the saved configuration so it can be checked in the cell output.
print(json.dumps(config, indent=2))

# Point the user at the notebooks to run after this one.
print("\n‚úÖ Setup complete! Run the next notebooks in order:")
for next_step in (
    "   1. 01_data_ingestion.ipynb",
    "   2. 03_pattern_mining.ipynb",
    "   3. 05_analytics_dashboard.ipynb",
):
    print(next_step)