# Downhole Pressure Prediction for ESP Wells
This notebook uses machine learning to predict downhole pressure (PDP_psi) for ESP wells based on production and operational data. It is designed for classroom exercises and is compatible with Google Colab.

## 📋 Google Colab Setup Instructions

### 🔧 What's Pre-installed in Colab?
Google Colab comes with most common packages:
- ✅ **pandas, numpy, matplotlib, seaborn, scikit-learn** (always available)
- ✅ **tensorflow** (usually pre-installed)
- ❓ **xgboost, lightgbm** (may need installation)

### 📝 Setup Steps:

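1. **Install Missing Packages**: Run the package installation cell (it appears after the data-loading cells below) *before* the import cell, so that `xgboost` and `lightgbm` are available when they are imported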
2. **Load Your Data**: Choose one of these methods:
   - 📤 **Upload directly** (easiest for small files)
   - 📂 **Google Drive** (best for repeated use)
   - 🌐 **Public URL** (if data is hosted online)

3. **Enable GPU** (optional but recommended):
   - Runtime → Change runtime type → Hardware accelerator → GPU

### 💡 Pro Tips:
- Upload files < 25MB directly
- Use Google Drive for larger files or repeated access
- Save your work to Drive: File → Save a copy in Drive

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import joblib  # for model serialization
from tensorflow.keras.models import Sequential  # for ANN
from tensorflow.keras.layers import Dense
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
import lightgbm as lgb

sns.set(style="whitegrid")

In [None]:
# Data Loading - Multiple options for different environments

# Reset first so a failed load cannot be hidden by a stale `df` left over from
# an earlier run of this cell.
df = None

try:
    import google.colab
    IN_COLAB = True
    print("🌐 Google Colab detected")

    # Method 1: Upload file directly
    print("\n📁 Option 1: Upload CSV file directly")
    print("Click 'Choose Files' below to upload ESP_PDHG_WT.csv:")

    from google.colab import files
    uploaded = files.upload()

    if uploaded:
        # The upload result is keyed by filename; use the first uploaded file.
        data_path = next(iter(uploaded))
        print(f"✅ File uploaded: {data_path}")
    else:
        print("❌ No file uploaded. Please try again or use alternative method.")
        data_path = None

except ImportError:
    # google.colab is not importable, so assume a local Jupyter/Python session.
    IN_COLAB = False
    print("💻 Local environment detected")
    data_path = 'ESP_PDHG_WT.csv'

# Load the dataset
if data_path:
    try:
        df = pd.read_csv(data_path)
        print(f"✅ Dataset loaded successfully! Shape: {df.shape}")
        print(f"📊 Columns: {list(df.columns)}")
    except FileNotFoundError:
        print("❌ File not found.")
        if IN_COLAB:
            print("💡 Try uploading the file using the cell above")
        else:
            print("💡 Ensure ESP_PDHG_WT.csv is in the same directory as this notebook")
    except Exception as e:
        # Notebook boundary: report any other read/parse failure instead of
        # aborting the cell with a traceback.
        print(f"❌ Error loading file: {e}")

# Preview data
if df is not None:
    # A bare `df.head()` inside an if-block is not the cell's final expression,
    # so the notebook would show nothing; print the preview explicitly.
    print(df.head())
else:
    print("⚠️ Dataset not loaded. Please resolve the data loading issue above.")

In [None]:
# Alternative Data Loading Methods (run only if upload method above failed)

# Method 2: Load from Google Drive
def load_from_drive(drive_path='/content/drive/MyDrive/ESP_PDHG_WT.csv'):
    """Mount Google Drive and load the dataset CSV from it.

    Args:
        drive_path: Location of the CSV inside the mounted Drive. Defaults to
            the path this function previously hard-coded, so existing
            zero-argument calls behave as before.

    Returns:
        The loaded DataFrame, or None when Drive cannot be accessed (including
        when ``google.colab`` is unavailable), the file does not exist at
        ``drive_path``, or reading it fails.
    """
    import os

    try:
        from google.colab import drive
        print("📂 Mounting Google Drive...")
        drive.mount('/content/drive')
        print("✅ Google Drive mounted successfully!")

        # List common paths to help users find their file
        print("\n📁 Common Drive locations:")
        print("- /content/drive/MyDrive/")
        print("- /content/drive/MyDrive/Colab Notebooks/")
        print("- /content/drive/MyDrive/Data/")

        if not os.path.exists(drive_path):
            print(f"❌ File not found at: {drive_path}")
            print("💡 Pass your file location as the 'drive_path' argument")
            return None

        df_drive = pd.read_csv(drive_path)
        print(f"✅ Dataset loaded from Drive! Shape: {df_drive.shape}")
        return df_drive

    except Exception as e:
        # Best-effort helper for an interactive notebook: report and return
        # None rather than raising.
        print(f"❌ Error accessing Google Drive: {e}")
        return None

# Method 3: Load from URL
def load_from_url(url):
    """Read a CSV from ``url`` into a DataFrame.

    Prints the loaded shape on success. On any failure the error is printed
    and None is returned instead of raising.
    """
    try:
        frame = pd.read_csv(url)
        shape = frame.shape
        print(f"✅ Dataset loaded from URL! Shape: {shape}")
    except Exception as err:
        print(f"❌ Error loading from URL: {err}")
        frame = None
    return frame

# Uncomment and run one of these methods if needed:
# df = load_from_drive()
# df = load_from_url('https://raw.githubusercontent.com/your-repo/ESP_PDHG_WT.csv')

In [None]:
# Google Colab Package Installation
# Run this cell first in Google Colab
#
# Detects Colab by attempting to import google.colab, probes for xgboost,
# lightgbm and tensorflow, and pip-installs whichever of them fails to import.
# Outside Colab it only prints a reminder of the required packages.

try:
    import google.colab
    IN_COLAB = True
    print("🌐 Running in Google Colab")

    # Check what's already available in Colab by default
    packages_to_install = []

    # Each probe below appends the package name only when its import fails.
    try:
        import xgboost
        print("✅ XGBoost already available")
    except ImportError:
        packages_to_install.append('xgboost')

    try:
        import lightgbm
        print("✅ LightGBM already available")
    except ImportError:
        packages_to_install.append('lightgbm')

    try:
        import tensorflow
        print("✅ TensorFlow already available")
    except ImportError:
        packages_to_install.append('tensorflow')

    # Install missing packages
    if packages_to_install:
        print(f"🔧 Installing missing packages: {', '.join(packages_to_install)}")
        for package in packages_to_install:
            # IPython shell escape (not plain Python); {package} is interpolated
            # by the notebook before the command runs.
            !pip install {package} -q
        print("✅ Package installation complete!")
    else:
        print("✅ All required packages are already available!")

except ImportError:
    # google.colab could not be imported: treat this as a local environment.
    IN_COLAB = False
    print("💻 Running in local environment")
    print("📝 Note: Ensure you have installed: pandas, numpy, matplotlib, seaborn, scikit-learn, xgboost, lightgbm, tensorflow")

## 1. Data Exploration
We'll explore the dataset to understand distributions, missing values, and relationships between variables.

In [None]:
# Summary statistics
# Being the last expression of the cell, the resulting table is rendered by the notebook.
df.describe()

In [None]:
# Check for missing values
# Prints the number of null entries per column.
print(df.isnull().sum())

# Distribution of target
# Histogram of the target column with a kernel density estimate overlaid.
sns.histplot(df['PDP_psi'], kde=True)
plt.title('Distribution of Downhole Pressure (PDP_psi)')
plt.show()

### Univariate Distributions of Numeric Features
Visualize the spread of each continuous parameter to understand ranges and detect potential skew.

In [None]:
# Univariate histograms
# One histogram (with KDE overlay) per numeric input feature, laid out on a
# 3x3 grid; the eight features leave the last grid slot empty.
num_feats = ['PDHG_Dep_ft','Ql_blpd','Gor_scf_bbl','WCT, %','Choke','Freq_Hz','WHP_psi','WHT_degF']
plt.figure(figsize=(16,12))
for i, col in enumerate(num_feats,1):
    plt.subplot(3,3,i)
    sns.histplot(df[col], kde=True)
    plt.title(col)
plt.tight_layout()
plt.show()

# Pairplot for subset to check relationships (downsample if needed)
# Cap the sample at the dataset size: sampling without replacement raises a
# ValueError when more rows are requested than exist.
sample_size = min(500, len(df))
sample_df = df.sample(sample_size, random_state=1)
sns.pairplot(sample_df[num_feats + ['PDP_psi']], corner=True)
plt.show()

In [None]:
# Correlation Matrix
# Heatmap of pairwise correlations between the numeric inputs and the target.
plt.figure(figsize=(10,8))
# Use numerical features for correlation
# The column list is spelled out here (inputs plus the PDP_psi target) rather
# than derived from another cell, so this cell does not depend on `num_feats`.
num_cols = ['PDHG_Dep_ft','Ql_blpd','Gor_scf_bbl','WCT, %','Choke','Freq_Hz','WHP_psi','WHT_degF','PDP_psi']
corr = df[num_cols].corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Feature Correlation Matrix")
plt.show()

## 2. Preprocessing and Feature Engineering
- Parse dates
- Encode categorical features
- Scale numerical features

In [None]:
# Parse date and extract features
# Converts the Date column to datetimes in place and derives calendar parts
# that are later used as model inputs.
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# Encode Well Name
# Integer-codes each distinct well name; `le` keeps the mapping so codes can be
# translated back (or applied to new data) later.
le = LabelEncoder()
df['Well Name Encoded'] = le.fit_transform(df['Well Name'])

# Select features and target
# The order of this list defines the column order of every feature matrix below.
features = ['Well Name Encoded', 'PDHG_Dep_ft', 'Ql_blpd', 'Gor_scf_bbl', 'WCT, %', 'Choke', 'Freq_Hz', 'WHP_psi', 'WHT_degF', 'Year', 'Month', 'Day']
X = df[features]
y = df['PDP_psi']

# Train-test split
# 80/20 split with a fixed seed for reproducibility.
# NOTE(review): no `shuffle` argument is passed, so rows are presumably split at
# random rather than by date — confirm whether a chronological split is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
# The scaler is fitted on the training rows only and then applied to the test
# rows, so test-set statistics do not influence the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 3. Modeling
We'll train multiple regression models to predict downhole pressure.

In [None]:
def evaluate_model(name, model, X_tr, X_te, y_tr, y_te):
    """Fit ``model`` on the training data and report its test-set metrics.

    Prints one line with MAE, RMSE and R2 labelled with ``name`` and returns
    the predictions made on ``X_te``. The model passed in is fitted in place.
    """
    model.fit(X_tr, y_tr)
    y_hat = model.predict(X_te)

    scores = {
        'mae': mean_absolute_error(y_te, y_hat),
        'rmse': np.sqrt(mean_squared_error(y_te, y_hat)),
        'r2': r2_score(y_te, y_hat),
    }
    print("{} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}".format(name, **scores))
    return y_hat


In [None]:
# Linear Regression
# Baseline model: evaluate_model fits it on the scaled training features,
# prints test metrics and returns the test-set predictions kept in lr_preds.
lr_preds = evaluate_model('Linear Regression', LinearRegression(), X_train_scaled, X_test_scaled, y_train, y_test)

In [None]:
# Random Forest
# Default-parameter forest with a fixed seed; the test-set predictions are kept
# in rf_preds for the comparison table and plots later on.
rf_preds = evaluate_model('Random Forest', RandomForestRegressor(random_state=42), X_train_scaled, X_test_scaled, y_train, y_test)

In [None]:
# XGBoost
# Untuned gradient-boosted regressor with squared-error objective and a fixed
# seed; xgb_model is fitted in place by evaluate_model.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_preds = evaluate_model('XGBoost', xgb_model, X_train_scaled, X_test_scaled, y_train, y_test)

### 3.4 Artificial Neural Network (ANN)
Train a simple feedforward network with two hidden layers to predict downhole pressure.

In [None]:
# Build ANN model
# Feedforward network: two ReLU hidden layers (64 and 32 units) and a single
# linear output unit for the regression target. The input width is taken from
# the number of columns in the scaled training matrix.
ann = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)  # output
])
# Optimizes mean squared error and additionally tracks mean absolute error.
ann.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train
# validation_split reserves a share of the training data for the val_* curves
# plotted later from `history`; verbose=0 suppresses per-epoch output.
# NOTE(review): no random seed is set in this cell, so results presumably vary
# between runs — confirm whether reproducibility is required.
history = ann.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    verbose=0
)

# Evaluate
# predict() output is flattened to 1-D so it lines up with y_test in the
# metric functions and in later residual calculations.
ann_preds = ann.predict(X_test_scaled).flatten()
mae_ann  = mean_absolute_error(y_test, ann_preds)
rmse_ann = np.sqrt(mean_squared_error(y_test, ann_preds))
r2_ann   = r2_score(y_test, ann_preds)
print(f"ANN - MAE: {mae_ann:.2f}, RMSE: {rmse_ann:.2f}, R2: {r2_ann:.3f}")

### 3.5 Hyperparameter Tuning & Advanced Algorithms
We use GridSearchCV for RF and XGBoost, and demonstrate SVR and LightGBM as advanced models.

In [None]:
# Grid search for Random Forest
# 2 x 3 parameter combinations, 3-fold cross-validation, scored by negative MSE
# (higher is better), using all available cores.
param_grid_rf = {'n_estimators':[50,100],'max_depth':[None,10,20]}
gs_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf,
                     cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
gs_rf.fit(X_train_scaled, y_train)
print('Best RF params:', gs_rf.best_params_)
# evaluate_model calls fit() again on the selected estimator with the same
# training data before predicting on the test split.
rf_tuned_preds = evaluate_model('RF Tuned', gs_rf.best_estimator_, X_train_scaled, X_test_scaled, y_train, y_test)

# Grid search for XGBoost
# 2 x 2 x 2 parameter combinations with the same CV and scoring setup as above.
param_grid_xgb = {'n_estimators':[50,100],'max_depth':[3,6],'learning_rate':[0.01,0.1]}
gs_xgb = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
                      param_grid_xgb, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
gs_xgb.fit(X_train_scaled, y_train)
print('Best XGB params:', gs_xgb.best_params_)
xgb_tuned_preds = evaluate_model('XGB Tuned', gs_xgb.best_estimator_, X_train_scaled, X_test_scaled, y_train, y_test)

# Support Vector Regressor
# RBF-kernel SVR with otherwise default settings, trained on the scaled features.
svr_preds = evaluate_model('SVR', SVR(kernel='rbf'), X_train_scaled, X_test_scaled, y_train, y_test)

# LightGBM Regressor
# The fitted estimator is not kept here (only its predictions are), which is
# why a separate LightGBM model is fitted later for feature importance.
lgb_preds = evaluate_model('LightGBM', lgb.LGBMRegressor(random_state=42), X_train_scaled, X_test_scaled, y_train, y_test)

### 3.6 Additional Feature Engineering Ideas

For enhanced model performance, consider these additional features:

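- **Pressure gradient**: Difference between WHP_psi and PDP_psi (for analysis only — PDP_psi is the prediction target, so this quantity cannot be used as a model input without leaking the answer)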
- **Rate-of-change features**: Flow and pressure derivatives over time  
- **ESP power data**: Motor current and voltage measurements
- **Fluid properties**: Oil density, viscosity, API gravity
- **Reservoir parameters**: Porosity, permeability, skin factor
- **Well intervention**: Time since last maintenance or workover
- **Seasonal effects**: Cyclical patterns in production
- **Well age**: Days/months since well completion

### 3.7 Model Performance Summary & Feature Importance

Let's create a comprehensive comparison table and analyze which features are most important for predictions.

In [None]:
# Create performance comparison table
def create_performance_table():
    """Build a model comparison table from the stored test-set predictions.

    Reads the prediction arrays and ``y_test`` defined in earlier cells and
    returns a DataFrame with one row per model (columns Model, MAE, RMSE, R²),
    sorted so the highest R² comes first.
    """
    predictions_by_model = {
        'Linear Regression': lr_preds,
        'Random Forest': rf_preds,
        'XGBoost': xgb_preds,
        'RF Tuned': rf_tuned_preds,
        'XGB Tuned': xgb_tuned_preds,
        'SVR': svr_preds,
        'LightGBM': lgb_preds,
        'ANN': ann_preds
    }

    def _score_row(label, y_hat):
        # One table row: the three test-set metrics for a single model.
        return {
            'Model': label,
            'MAE': mean_absolute_error(y_test, y_hat),
            'RMSE': np.sqrt(mean_squared_error(y_test, y_hat)),
            'R²': r2_score(y_test, y_hat),
        }

    rows = [_score_row(label, y_hat) for label, y_hat in predictions_by_model.items()]
    return pd.DataFrame(rows).sort_values('R²', ascending=False)

# Display performance table
perf_table = create_performance_table()
print("🏆 Model Performance Ranking (by R²):")
print("=" * 50)
display(perf_table.round(3))


def _as_share(importances):
    """Rescale an importance vector to sum to 1 (unchanged if it sums to 0)."""
    values = np.asarray(importances, dtype=float)
    total = values.sum()
    return values / total if total > 0 else values


# Feature importance for tree-based models
# The three per-model panels show each library's raw importances; the fourth
# panel averages them after putting all three on a common scale.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Feature Importance Analysis', fontsize=16)

# Random Forest
rf_importance = pd.DataFrame({
    'feature': features,
    'importance': gs_rf.best_estimator_.feature_importances_
}).sort_values('importance', ascending=True)

axes[0,0].barh(rf_importance['feature'], rf_importance['importance'])
axes[0,0].set_title('Random Forest Feature Importance')
axes[0,0].set_xlabel('Importance')

# XGBoost
xgb_importance = pd.DataFrame({
    'feature': features,
    'importance': gs_xgb.best_estimator_.feature_importances_
}).sort_values('importance', ascending=True)

axes[0,1].barh(xgb_importance['feature'], xgb_importance['importance'])
axes[0,1].set_title('XGBoost Feature Importance')
axes[0,1].set_xlabel('Importance')

# LightGBM
# evaluate_model did not keep the fitted LightGBM estimator, so fit one here.
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(X_train_scaled, y_train)
lgb_importance = pd.DataFrame({
    'feature': features,
    'importance': lgb_model.feature_importances_
}).sort_values('importance', ascending=True)

axes[1,0].barh(lgb_importance['feature'], lgb_importance['importance'])
axes[1,0].set_title('LightGBM Feature Importance')
axes[1,0].set_xlabel('Importance')

# Combined importance (average)
# LightGBM's default importances are split counts, not fractions, so averaging
# the raw vectors would let LightGBM dominate. Rescale every model's
# importances to shares that sum to 1 before averaging.
combined_importance = pd.DataFrame({
    'feature': features,
    'rf_imp': _as_share(gs_rf.best_estimator_.feature_importances_),
    'xgb_imp': _as_share(gs_xgb.best_estimator_.feature_importances_),
    'lgb_imp': _as_share(lgb_model.feature_importances_)
})
combined_importance['avg_importance'] = combined_importance[['rf_imp', 'xgb_imp', 'lgb_imp']].mean(axis=1)
combined_importance = combined_importance.sort_values('avg_importance', ascending=True)

axes[1,1].barh(combined_importance['feature'], combined_importance['avg_importance'])
axes[1,1].set_title('Average Feature Importance')
axes[1,1].set_xlabel('Average Importance')

plt.tight_layout()
plt.show()

print("🔍 Top 5 Most Important Features:")
top_features = combined_importance.nlargest(5, 'avg_importance')[['feature', 'avg_importance']]
for idx, row in top_features.iterrows():
    print(f"  {row['feature']}: {row['avg_importance']:.3f}")

## 4. Results and Visualization
Compare actual vs predicted values.

In [None]:
# 4.1 Actual vs Predicted for all models
# One scatter panel per model on a 3x3 grid (eight models, last slot unused).
plt.figure(figsize=(16,12))
models_preds = {
    'Linear Regression': lr_preds,
    'Random Forest': rf_preds,
    'XGBoost': xgb_preds,
    'RF Tuned': rf_tuned_preds,
    'XGB Tuned': xgb_tuned_preds,
    'SVR': svr_preds,
    'LightGBM': lgb_preds,
    'ANN': ann_preds
}

for i, (name, preds) in enumerate(models_preds.items(), 1):
    plt.subplot(3, 3, i)
    plt.scatter(y_test, preds, alpha=0.6)
    # Dashed red diagonal marks perfect agreement (predicted == actual).
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.title(f'{name}')
    plt.xlabel('Actual PDP_psi')
    plt.ylabel('Predicted PDP_psi')
plt.tight_layout()
plt.show()

# 4.2 ANN Training History
# Left panel: training vs validation loss (MSE); right panel: training vs
# validation MAE, both per epoch as recorded in `history`.
plt.figure(figsize=(12,4))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('ANN Training Loss')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['mae'], label='Train MAE')
plt.plot(history.history['val_mae'], label='Val MAE')
plt.title('ANN Training MAE')
plt.xlabel('Epoch')
plt.ylabel('MAE')
plt.legend()
plt.tight_layout()
plt.show()

# 4.3 Model Serialization for Production
# Writes the tuned XGBoost model, the fitted scaler and the ANN to the current
# working directory.
# NOTE(review): the LabelEncoder `le` used to build 'Well Name Encoded' is not
# saved here; inference on raw well names would presumably need it — confirm.
print("💾 Saving trained models...")
joblib.dump(gs_xgb.best_estimator_, 'best_xgb_model.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')
ann.save('ann_model.h5')

print('✅ Models saved successfully:')
print('  📁 best_xgb_model.pkl (XGBoost)')
print('  📁 feature_scaler.pkl (StandardScaler)')
print('  📁 ann_model.h5 (Neural Network)')

# Download models in Google Colab
# Outside Colab the google.colab import fails and the files simply stay on disk.
try:
    import google.colab
    from google.colab import files
    import os

    print('\n📥 Preparing models for download...')

    # Check if files exist before downloading
    model_files = ['best_xgb_model.pkl', 'feature_scaler.pkl', 'ann_model.h5']
    for file in model_files:
        if os.path.exists(file):
            print(f'📦 Downloading {file}...')
            files.download(file)
        else:
            print(f'⚠️ {file} not found')

    # Printed after the loop regardless of whether any file was missing.
    print('✅ Download complete! Check your Downloads folder.')

except ImportError:
    print('💻 Running locally - models saved to current directory')
except Exception as e:
    # Any other download failure is reported without interrupting the notebook.
    print(f'❌ Download error: {e}')
    print('💡 You can manually download files from the Colab file browser')

### 4.4 Model Diagnostics & Residual Analysis
Analyze model performance patterns and identify potential issues.

In [None]:
# Residual Analysis for Best Models (XGB Tuned and ANN)
# These two models are hard-coded here; they are not selected from the
# performance table.
best_models = {
    'XGB Tuned': xgb_tuned_preds,
    'ANN': ann_preds
}

# One row of three diagnostic panels per model.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Model Diagnostics: Residual Analysis', fontsize=16)

for i, (name, preds) in enumerate(best_models.items()):
    # Residual = actual - predicted, so positive values are under-predictions.
    residuals = y_test - preds

    # Residual vs Predicted
    axes[i, 0].scatter(preds, residuals, alpha=0.6)
    axes[i, 0].axhline(y=0, color='red', linestyle='--')
    axes[i, 0].set_xlabel('Predicted Values')
    axes[i, 0].set_ylabel('Residuals')
    axes[i, 0].set_title(f'{name}: Residuals vs Predicted')

    # Residual Distribution
    axes[i, 1].hist(residuals, bins=30, alpha=0.7, edgecolor='black')
    axes[i, 1].set_xlabel('Residuals')
    axes[i, 1].set_ylabel('Frequency')
    axes[i, 1].set_title(f'{name}: Residual Distribution')
    axes[i, 1].axvline(x=0, color='red', linestyle='--')

    # Q-Q Plot
    # Compares the residuals against a normal distribution.
    from scipy import stats
    stats.probplot(residuals, dist="norm", plot=axes[i, 2])
    axes[i, 2].set_title(f'{name}: Q-Q Plot')

plt.tight_layout()
plt.show()

# Error analysis by well
print("📊 Error Analysis by Well:")
print("=" * 40)

# Add well information to test set for analysis
# X_test keeps the original row labels of df, which lets the well names be
# looked up for exactly the rows in the test split.
test_indices = X_test.index
wells_test = df.loc[test_indices, 'Well Name'].values
xgb_residuals = y_test - xgb_tuned_preds

error_by_well = pd.DataFrame({
    'Well': wells_test,
    'Actual': y_test.values,
    'Predicted': xgb_tuned_preds,
    'Residual': xgb_residuals,
    'Abs_Error': np.abs(xgb_residuals)
})

# Per-well mean/std/count of absolute error and mean/std of signed residual.
well_stats = error_by_well.groupby('Well').agg({
    'Abs_Error': ['mean', 'std', 'count'],
    'Residual': ['mean', 'std']
}).round(2)

print(well_stats)

# Prediction intervals (for demonstration with XGBoost)
# NOTE(review): the ±1.96·std band presumably assumes roughly normal, zero-mean
# residuals — check against the Q-Q plot above before relying on it.
print(f"\n🎯 XGBoost Model Performance Summary:")
print(f"Mean Absolute Error: {np.mean(np.abs(xgb_residuals)):.2f} psi")
print(f"Standard Deviation: {np.std(xgb_residuals):.2f} psi")
print(f"95% Prediction Interval: ± {1.96 * np.std(xgb_residuals):.2f} psi")

## 5. Production Deployment Notes

### Model Serialization & Loading
```python
# Loading saved models for production
import joblib
from tensorflow.keras.models import load_model

# Load models
xgb_model = joblib.load('best_xgb_model.pkl')
scaler = joblib.load('feature_scaler.pkl')
ann_model = load_model('ann_model.h5')

# Make predictions on new data
def predict_pressure(new_data):
    scaled_data = scaler.transform(new_data)
    xgb_pred = xgb_model.predict(scaled_data)
    ann_pred = ann_model.predict(scaled_data)
    return xgb_pred, ann_pred
```

### API Deployment Options
- **Flask/FastAPI**: Create REST API endpoints
- **Docker**: Containerize for cloud deployment  
- **AWS SageMaker**: Managed ML inference
- **Azure ML**: Enterprise ML platform
- **Real-time monitoring**: Track model drift and performance

## 6. Exercises & Next Steps

### 🎓 Student Exercises (Choose 2-3 based on your interest)

#### **Beginner Level:**
1. **Feature Engineering**: Implement pressure gradient (WHP_psi - PDP_psi) as a new column, and explain why it must not be used as a model input (it is computed from the target PDP_psi)
2. **Data Visualization**: Create box plots showing PDP_psi distribution by well
3. **Model Comparison**: Add Decision Tree Regressor and compare performance

#### **Intermediate Level:**
4. **Cross-Validation**: Implement TimeSeriesSplit for proper temporal validation
5. **Ensemble Methods**: Create a VotingRegressor combining top 3 models
6. **Hyperparameter Optimization**: Use RandomizedSearchCV on more parameters
7. **Well-Specific Analysis**: Train separate models for each well and compare

#### **Advanced Level:**
8. **Custom Features**: Create rolling averages for pressure and flow rate
9. **Anomaly Detection**: Identify unusual pressure readings using IsolationForest
10. **Production API**: Build a Flask/FastAPI endpoint for real-time predictions
11. **Model Monitoring**: Implement drift detection using Evidently AI

### 🚀 Advanced Research Topics

#### **Physics-Informed Machine Learning:**
- Incorporate ESP performance curves and fluid dynamics
- Add thermodynamic constraints to neural networks
- Use physics-based loss functions

#### **Time Series & Sequential Modeling:**
- LSTM networks for temporal pressure prediction
- Prophet for seasonal trend analysis
- Kalman filters for real-time state estimation

#### **Uncertainty Quantification:**
- Bayesian neural networks for prediction intervals
- Monte Carlo dropout for uncertainty estimation
- Conformal prediction for reliable intervals

#### **Transfer Learning & Domain Adaptation:**
- Apply models trained on one field to another
- Few-shot learning for new wells
- Domain adversarial training for robust models

### 🏭 Real-World Implementation Challenges

#### **Data Quality & Engineering:**
- Handle sensor failures and missing data
- Real-time data streaming and processing
- Data validation and quality checks

#### **Production Deployment:**
- Model versioning and A/B testing
- Monitoring model performance and drift
- Automated retraining pipelines
- Edge computing for downhole sensors

#### **Business Integration:**
- Cost-benefit analysis of predictions
- Integration with existing SCADA systems
- Regulatory compliance and documentation
- Safety-critical system requirements

#### **Maintenance & Operations:**
- Model interpretability for domain experts
- Alert systems for abnormal conditions
- Predictive maintenance scheduling
- Human-in-the-loop validation

### 📚 Recommended Reading
- "Hands-On Machine Learning" by Aurélien Géron
- "Pattern Recognition and Machine Learning" by Christopher Bishop
- "The Elements of Statistical Learning" by Hastie, Tibshirani, Friedman
- "Applied Predictive Modeling" by Kuhn & Johnson

## 7. Bonus: Quick Implementation Examples

### 🎯 Ready-to-Use Code Snippets for Common Extensions

In [None]:
# BONUS 1: Create Ensemble Model
from sklearn.ensemble import VotingRegressor

# Create ensemble of top 3 models
# Only the ensemble object is constructed here; it is never fitted or used for
# prediction in this cell.
# NOTE(review): `ann` is a Keras model, not a scikit-learn estimator, so
# fitting this ensemble as-is would presumably fail until the ANN is wrapped —
# confirm before extending this example.
ensemble = VotingRegressor([
    ('xgb', gs_xgb.best_estimator_),
    ('lgb', lgb.LGBMRegressor(random_state=42)),
    ('ann', ann)  # Note: sklearn wrapper needed for ANN
])

# For demo purposes (ANN needs wrapper for sklearn ensemble)
print("💡 Ensemble model structure created")
print("   - XGBoost (tuned)")
print("   - LightGBM")
print("   - Neural Network")

# BONUS 2: Simple Time Series Cross-Validation
from sklearn.model_selection import TimeSeriesSplit

# train_test_split shuffles rows by default, so X_train is not in time order.
# TimeSeriesSplit only yields "train on the past, validate on the future" folds
# when rows are chronological, so re-order the training rows by Date first.
# A stable sort keeps the existing relative order of rows sharing a date.
chrono_index = df.loc[X_train.index, 'Date'].sort_values(kind='mergesort').index
X_train_chrono = X_train.loc[chrono_index]
y_train_chrono = y_train.loc[chrono_index]

tscv = TimeSeriesSplit(n_splits=3)
cv_scores = []

for train_idx, val_idx in tscv.split(X_train_chrono):
    X_tr, X_val = X_train_chrono.iloc[train_idx], X_train_chrono.iloc[val_idx]
    y_tr, y_val = y_train_chrono.iloc[train_idx], y_train_chrono.iloc[val_idx]

    # Scale data
    # A fresh scaler per fold, fitted on that fold's training rows only, so the
    # validation rows do not influence the scaling.
    scaler_cv = StandardScaler()
    X_tr_scaled = scaler_cv.fit_transform(X_tr)
    X_val_scaled = scaler_cv.transform(X_val)

    # Train and evaluate
    model_cv = xgb.XGBRegressor(random_state=42)
    model_cv.fit(X_tr_scaled, y_tr)
    pred_cv = model_cv.predict(X_val_scaled)
    score_cv = r2_score(y_val, pred_cv)
    cv_scores.append(score_cv)

print(f"\n⏰ Time Series CV Results (XGBoost):")
print(f"   CV Scores: {[f'{s:.3f}' for s in cv_scores]}")
print(f"   Mean R²: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}")

# BONUS 3: Feature Engineering Example
def create_engineered_features(df):
    """Return a copy of ``df`` with additional engineered columns.

    The input frame is not modified. Columns added:

    - ``Pressure_Gradient``: ``WHP_psi - PDP_psi``. Added only when the frame
      contains ``PDP_psi``. Because ``PDP_psi`` is the prediction target, this
      column is for analysis only and must not be fed to a model as an input.
    - ``Production_Intensity``: ``Ql_blpd / PDHG_Dep_ft``; NaN where the depth
      is zero.
    - ``Power_Proxy``: ``Freq_Hz * WHP_psi``.
    - ``WCT_Ratio``: ``WCT, % / (100 - WCT, %)`` with a small epsilon in the
      denominator.

    Args:
        df: DataFrame with the columns named above (``PDP_psi`` optional).

    Returns:
        A new DataFrame containing the original and the engineered columns.
    """
    df_eng = df.copy()

    # Pressure gradient
    # Skipped when the target is unavailable (e.g. inference-time data), so the
    # remaining features can still be computed.
    if 'PDP_psi' in df_eng.columns:
        df_eng['Pressure_Gradient'] = df_eng['WHP_psi'] - df_eng['PDP_psi']

    # Production intensity
    # Mask zero depths to NaN so the division cannot produce +/-inf.
    depth = df_eng['PDHG_Dep_ft']
    df_eng['Production_Intensity'] = df_eng['Ql_blpd'] / depth.where(depth != 0)

    # Power proxy (frequency * pressure)
    df_eng['Power_Proxy'] = df_eng['Freq_Hz'] * df_eng['WHP_psi']

    # Water cut ratio
    # The epsilon keeps the denominator non-zero at 100% water cut.
    df_eng['WCT_Ratio'] = df_eng['WCT, %'] / (100 - df_eng['WCT, %'] + 1e-6)

    return df_eng

print(f"\n🔧 Feature Engineering Function Created:")
print("   - Pressure_Gradient = WHP_psi - PDP_psi")
print("   - Production_Intensity = Ql_blpd / PDHG_Dep_ft")
print("   - Power_Proxy = Freq_Hz * WHP_psi")
print("   - WCT_Ratio = WCT / (100 - WCT)")

# BONUS 4: Simple API Prediction Function
def predict_downhole_pressure(well_data, model_path='best_xgb_model.pkl', scaler_path='feature_scaler.pkl',
                              feature_order=None):
    """
    Predict downhole pressure for new well data

    Values are placed in ``feature_order`` before scaling, so the result does
    not depend on the order in which keys were inserted into ``well_data``.
    Keys in ``well_data`` that are not in ``feature_order`` are ignored.

    Args:
        well_data: dict with keys matching feature names
        model_path: path to saved model
        scaler_path: path to saved scaler
        feature_order: column order the scaler and model were trained with;
            defaults to the feature list used in this notebook

    Returns:
        On success: {'predicted_pressure_psi': float, 'status': 'success'}.
        On any failure (missing features, unreadable files, prediction error):
        {'error': message, 'status': 'failed'}. No exception is raised.
    """
    if feature_order is None:
        feature_order = (
            'Well Name Encoded', 'PDHG_Dep_ft', 'Ql_blpd', 'Gor_scf_bbl', 'WCT, %', 'Choke',
            'Freq_Hz', 'WHP_psi', 'WHT_degF', 'Year', 'Month', 'Day',
        )

    try:
        # Validate before touching the disk so the caller gets a precise
        # message instead of a shape error from the scaler.
        missing = [name for name in feature_order if name not in well_data]
        if missing:
            raise ValueError(f"Missing required feature(s): {missing}")

        # Load model and scaler
        # NOTE: joblib files are pickles; only load artifacts from a trusted source.
        model = joblib.load(model_path)
        scaler = joblib.load(scaler_path)

        # Prepare data
        # A one-row DataFrame carries the column names along with the values.
        input_frame = pd.DataFrame(
            [[well_data[name] for name in feature_order]],
            columns=list(feature_order),
        )
        input_scaled = scaler.transform(input_frame)

        # Predict
        prediction = model.predict(input_scaled)[0]

        return {
            # Convert to a built-in float so the result is JSON-serializable.
            'predicted_pressure_psi': round(float(prediction), 2),
            'status': 'success'
        }
    except Exception as e:
        return {
            'error': str(e),
            'status': 'failed'
        }

# Example usage
# Sample payload whose keys follow the training feature list. It is only
# defined here; this cell does not call predict_downhole_pressure with it.
example_data = {
    'Well Name Encoded': 1,
    'PDHG_Dep_ft': 8000,
    'Ql_blpd': 5000,
    'Gor_scf_bbl': 100,
    'WCT, %': 15,
    'Choke': 35,
    'Freq_Hz': 55,
    'WHP_psi': 800,
    'WHT_degF': 140,
    'Year': 2025,
    'Month': 8,
    'Day': 24
}

print(f"\n🎯 API Prediction Function Created:")
print("   Usage: predict_downhole_pressure(well_data)")
print("   Returns: {'predicted_pressure_psi': value, 'status': 'success'}")

# Closing summary of the four bonus snippets defined in this cell.
print(f"\n✨ Bonus implementations completed!")
print("   🔗 Ensemble modeling approach")
print("   ⏰ Time series cross-validation")
print("   🔧 Feature engineering pipeline")
print("   🌐 Production API function")