In [None]:
# File: notebook/debug_pkl_files.ipynb
# RUN THIS FIRST to diagnose the .pkl file issues

import os
import pickle
import subprocess
import sys

# Opening banner for the diagnostic run.
print("=" * 80, "DIAGNOSTIC SCRIPT FOR .PKL FILES", "=" * 80, sep="\n")

# joblib is used by one of the loading attempts later in this script.
# If it cannot be imported, install it with the pip that belongs to the
# interpreter running this script, then import it again.
try:
    import joblib
except ImportError:
    print("Installing joblib...")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "joblib"]
    )
    import joblib
    print("‚úÖ joblib installed successfully")
else:
    print("‚úÖ joblib is already installed")

# Section banner.
print("\n" + "=" * 80, "CHECKING FILE PATHS", "=" * 80, sep="\n")

# Display label -> path of each pickle file to diagnose. The paths are
# relative, so they resolve against the current working directory.
files = {
    "Processed Data": "../data/processed_diabetes_data.pkl",
    "Logistic Regression": "../models/logistic_regression_model.pkl",
    "Random Forest": "../models/random_forest_model.pkl",
    "SVM": "../models/svm_model.pkl",
    "XGBoost": "../models/xgboost_model.pkl",
}

# Report, for every entry, whether the file is present and how large it is.
for label, location in files.items():
    if not os.path.exists(location):
        print(f"‚ùå {label:25} -> MISSING")
        continue
    byte_count = os.path.getsize(location)
    print(f"‚úÖ {label:25} -> EXISTS ({byte_count:,} bytes)")

print("\n" + "="*80)
print("ATTEMPTING TO LOAD FILES")
print("="*80)

# For every file, try several independent loading strategies and report the
# outcome of each one. A failure in one method never prevents the next
# method from being tried.
#
# SECURITY NOTE: pickle.load() and joblib.load() can execute arbitrary code
# stored in the file. Only run this against files from a trusted source.
for name, path in files.items():
    print(f"\nüîç {name}")
    print(f"   Path: {path}")

    if not os.path.exists(path):
        print("   ‚ùå File does not exist")
        continue

    # Method 1: standard pickle, plus a short description of what was loaded.
    try:
        with open(path, 'rb') as f:
            content = pickle.load(f)
        print("   ‚úÖ Method 1 (pickle): SUCCESS")
        print(f"      Type: {type(content)}")
        if hasattr(content, 'shape'):
            print(f"      Shape: {content.shape}")
        elif isinstance(content, tuple):
            print(f"      Tuple length: {len(content)}")
            # Only the first three items are inspected to keep output short.
            for i, item in enumerate(content[:3]):
                if hasattr(item, 'shape'):
                    print(f"      Item {i} shape: {item.shape}")
        elif hasattr(content, 'get_params'):
            print(f"      Model: {content.__class__.__name__}")
    except Exception as e:
        # Diagnostic boundary: any error is reported, never re-raised.
        print(f"   ‚ùå Method 1 (pickle): {type(e).__name__} - {str(e)[:50]}...")

    # Method 2: pickle with latin1 decoding of legacy 8-bit string data.
    try:
        with open(path, 'rb') as f:
            content = pickle.load(f, encoding='latin1')
        print("   ‚úÖ Method 2 (latin1): SUCCESS")
        print(f"      Type: {type(content)}")
    except Exception as e:
        print(f"   ‚ùå Method 2 (latin1): {type(e).__name__}")

    # Method 3: pickle leaving legacy 8-bit string data as bytes objects.
    try:
        with open(path, 'rb') as f:
            content = pickle.load(f, encoding='bytes')
        print("   ‚úÖ Method 3 (bytes): SUCCESS")
        print(f"      Type: {type(content)}")
    except Exception as e:
        print(f"   ‚ùå Method 3 (bytes): {type(e).__name__}")

    # Method 4: joblib's own loader.
    try:
        content = joblib.load(path)
        print("   ‚úÖ Method 4 (joblib): SUCCESS")
        print(f"      Type: {type(content)}")
        if hasattr(content, 'get_params'):
            print(f"      Model: {content.__class__.__name__}")
    except Exception as e:
        print(f"   ‚ùå Method 4 (joblib): {type(e).__name__}")

    # Method 5: look at the leading bytes to guess the on-disk format.
    try:
        with open(path, 'rb') as f:
            first_100 = f.read(100)
        print(f"   üìä File signature (first 20 bytes): {first_100[:20]}")

        # Check for common formats
        if first_100.startswith(b'\x80'):
            # 0x80 is the PROTO opcode that opens a protocol 2+ pickle.
            print("   üìä Format: Python pickle")
        elif first_100.startswith(b'\x93NUMPY'):
            # .npy files start with the magic string b'\x93NUMPY'. The
            # byte 0x95 tested previously is the pickle FRAME opcode, not
            # a NumPy marker, so real .npy files were never recognised.
            print("   üìä Format: NumPy .npy")
        elif b'pickle' in first_100:
            print("   üìä Format: Contains 'pickle' string")
        elif b'joblib' in first_100:
            print("   üìä Format: Contains 'joblib' string")
        else:
            print("   üìä Format: Unknown binary")
    except Exception as e:
        print(f"   ‚ùå Cannot read file bytes: {e}")

# Section banner.
print("\n" + "=" * 80, "RECOMMENDED ACTION BASED ON RESULTS:", "=" * 80, sep="\n")

# Static guidance on how to interpret the results printed above. The text
# is assembled line by line; the first and last empty entries produce the
# leading and trailing blank lines of the message.
advice_lines = [
    "",
    "1. If ANY method shows SUCCESS:",
    "   - Use that method in your evaluation notebook",
    "   ",
    "2. If ALL methods FAIL:",
    "   - Files might be corrupted",
    "   - Ask team members to re-send files",
    "   - Check how files were originally saved",
    "   ",
    "3. If files are very small (0 or few bytes):",
    "   - Files are empty/corrupted",
    "   - Need to re-generate them",
    "",
]
print("\n".join(advice_lines))

print("\n" + "="*80)
print("QUICK FIX - TRY THIS IF ALL ELSE FAILS:")
print("="*80)

# Last-resort check: read each existing file fully into memory, show its
# size and leading bytes, then try to unpickle the raw bytes.
#
# pickle.loads() detects the protocol from the data itself and cannot be
# told which one to use, so one attempt is enough. The protocol reported
# below is read from the file header instead of being guessed.
#
# SECURITY NOTE: pickle.loads() can execute arbitrary code stored in the
# data. Only run this against files from a trusted source.
for name, path in files.items():
    if not os.path.exists(path):
        continue

    print(f"\nüîÑ Attempting aggressive load for {name}:")

    try:
        with open(path, 'rb') as f:
            raw_data = f.read()
    except OSError as e:
        print(f"   ‚ùå Failed: {e}")
        continue

    print(f"   File size: {len(raw_data)} bytes")
    print(f"   First 50 chars as text: {raw_data[:50]}")

    if not raw_data:
        # An empty file can never be unpickled; say so explicitly.
        print("   ‚ùå Failed: file is empty (0 bytes)")
        continue

    # Protocol 2+ pickles start with the PROTO opcode (0x80) followed by
    # one byte holding the protocol number. Protocols 0 and 1 have no header.
    if raw_data[:1] == b'\x80' and len(raw_data) > 1:
        protocol = raw_data[1]
    else:
        protocol = "0 or 1"

    try:
        content = pickle.loads(raw_data)
    except Exception as e:
        # Diagnostic boundary: report the failure instead of hiding it.
        print(f"   ‚ùå Failed: {type(e).__name__} - {e}")
    else:
        print(f"   ‚úÖ Protocol {protocol}: SUCCESS")