In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

## 🛢️ Project Overview: Oil Region Selection with Predictive Modeling

In this project, I work for the fictional OilyGiant mining company to determine the best region for developing a new oil well. The goal is to maximize profit and minimize risk using data-driven methods.

### 🔍 Project Steps:
1. **Data Preparation**  
   Load and explore geological data from three regions (`geo_data_0`, `geo_data_1`, and `geo_data_2`).

2. **Modeling**  
   Train a linear regression model for each region to predict oil reserve volumes based on available features.

3. **Evaluation**  
   Assess each model's performance using the RMSE metric and predicted reserve volumes.

4. **Profit Simulation**  
   Use model predictions to estimate profit from the 200 most promising wells in each region.

5. **Risk Analysis**  
   Apply the bootstrapping technique to simulate profit distributions and evaluate the risk of financial loss.

6. **Final Decision**  
   Select the most profitable and least risky region for oil well development.


In [31]:
data_0 = pd.read_csv('geo_data_0.csv')
data_1 = pd.read_csv('geo_data_1.csv')
data_2 = pd.read_csv('geo_data_2.csv')

def prepare_data(data):
    features = data.drop(['product', 'id'], axis=1)
    target = data['product']
    return features, target

features_0, target_0 = prepare_data(data_0)
features_1, target_1 = prepare_data(data_1)
features_2, target_2 = prepare_data(data_2)


### Preliminary look at thte data

In [32]:
data_0.describe()
data_1.describe()
data_2.describe()


Unnamed: 0,f0,f1,f2,product
count,100000.0,100000.0,100000.0,100000.0
mean,0.002023,-0.002081,2.495128,95.0
std,1.732045,1.730417,3.473445,44.749921
min,-8.760004,-7.08402,-11.970335,0.0
25%,-1.162288,-1.17482,0.130359,59.450441
50%,0.009424,-0.009482,2.484236,94.925613
75%,1.158535,1.163678,4.858794,130.595027
max,7.238262,7.844801,16.739402,190.029838


### Function to train each region using linear regression and RMSE to calculate loss

In [33]:
def train_and_evaluate_model(features, target):
    features_train, features_valid, target_train, target_valid = train_test_split(
        features, target, test_size=0.25, random_state=42)
    
    model = LinearRegression()
    model.fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    
    rmse = mean_squared_error(target_valid, predictions_valid, squared=False)
    print("Average predicted reserves:", predictions_valid.mean())
    print("RMSE:", rmse)
    
    return predictions_valid, target_valid.reset_index(drop=True)

### Predictions and loss metrics

In [34]:
print("Region 0:")
pred_0, target_valid_0 = train_and_evaluate_model(features_0, target_0)

print("\nRegion 1:")
pred_1, target_valid_1 = train_and_evaluate_model(features_1, target_1)

print("\nRegion 2:")
pred_2, target_valid_2 = train_and_evaluate_model(features_2, target_2)

Region 0:
Average predicted reserves: 92.39879990657768
RMSE: 37.75660035026169

Region 1:
Average predicted reserves: 68.71287803913762
RMSE: 0.8902801001028837

Region 2:
Average predicted reserves: 94.77102387765939
RMSE: 40.145872311342174




### Profit Parameters

In [35]:
BUDGET = 100_000_000
WELLS_TO_SELECT = 200
REVENUE_PER_1000_BARRELS = 4500
WELL_COST = BUDGET / WELLS_TO_SELECT

min_reserve_required = WELL_COST / REVENUE_PER_1000_BARRELS
print(f"Minimum reserves to avoid loss: {min_reserve_required:.2f} thousand barrels")

print("\nAverage actual reserves in validation sets:")
print("Region 0:", target_valid_0.mean())
print("Region 1:", target_valid_1.mean())
print("Region 2:", target_valid_2.mean())

Minimum reserves to avoid loss: 111.11 thousand barrels

Average actual reserves in validation sets:
Region 0: 92.32595637084387
Region 1: 68.72538074722745
Region 2: 95.15099907171961


### Calculating profit

In [36]:
def calculate_profit(target, predictions):
    selected = predictions.sort_values(ascending=False)[:WELLS_TO_SELECT]
    total_volume = target[selected.index].sum()
    revenue = total_volume * REVENUE_PER_1000_BARRELS
    return revenue - BUDGET

In [37]:
print("\nEstimated profits using top 200 predicted wells:")
print("Region 0:", calculate_profit(target_valid_0, pd.Series(pred_0)))
print("Region 1:", calculate_profit(target_valid_1, pd.Series(pred_1)))
print("Region 2:", calculate_profit(target_valid_2, pd.Series(pred_2)))


Estimated profits using top 200 predicted wells:
Region 0: 33591411.14462179
Region 1: 24150866.966815114
Region 2: 25985717.59374112


### Bootstrapping for accuracy

In [38]:
def bootstrap_profit(target, predictions, n_bootstrap=1000):
    profits = []
    state = np.random.RandomState(42)
    
    for _ in range(n_bootstrap):
        sample_indices = state.choice(predictions.index, 500, replace=True)
        sample_preds = predictions.loc[sample_indices]
        sample_target = target.loc[sample_indices]
        profit = calculate_profit(sample_target, sample_preds)
        profits.append(profit)
        
    profits = pd.Series(profits)
    mean_profit = profits.mean()
    ci = profits.quantile([0.025, 0.975])
    risk = (profits < 0).mean()
    
    return mean_profit, ci, risk, profits

In [39]:
pred_0_s = pd.Series(pred_0).reset_index(drop=True)
pred_1_s = pd.Series(pred_1).reset_index(drop=True)
pred_2_s = pd.Series(pred_2).reset_index(drop=True)

print("\nBootstrapping....")
mean_0, ci_0, risk_0, dist_0 = bootstrap_profit(target_valid_0, pred_0_s)
mean_1, ci_1, risk_1, dist_1 = bootstrap_profit(target_valid_1, pred_1_s)
mean_2, ci_2, risk_2, dist_2 = bootstrap_profit(target_valid_2, pred_2_s)


Bootstrapping....


### Calculate results and confidence

In [40]:
def print_results(region_name, mean_profit, confidence_interval, risk):
    lower_bound = confidence_interval[0.025]
    upper_bound = confidence_interval[0.975]
    
    print(f"\n{region_name}")
    print(f"  Average Profit:       ${mean_profit:,.0f}")
    print(f"  95% Confidence Range: ${lower_bound:,.0f} — ${upper_bound:,.0f}")
    print(f"  Risk of Loss:          {risk:.2%}")

print_results("Region 0", mean_0, ci_0, risk_0)
print_results("Region 1", mean_1, ci_1, risk_1)
print_results("Region 2", mean_2, ci_2, risk_2)


Region 0
  Average Profit:       $6,061,226
  95% Confidence Range: $100,894 — $12,463,710
  Risk of Loss:          2.50%

Region 1
  Average Profit:       $6,651,177
  95% Confidence Range: $1,808,516 — $12,057,105
  Risk of Loss:          0.20%

Region 2
  Average Profit:       $5,851,036
  95% Confidence Range: $-8,369 — $12,120,509
  Risk of Loss:          2.60%


In [41]:
results = {
    "Region 0": {"mean": mean_0, "risk": risk_0},
    "Region 1": {"mean": mean_1, "risk": risk_1},
    "Region 2": {"mean": mean_2, "risk": risk_2}
}

filtered = {k: v for k, v in results.items() if v["risk"] < 0.025}
best_region = max(filtered.items(), key=lambda x: x[1]["mean"])

print(f"✅ Best region for development: {best_region[0]}")
print(f"Expected profit: ${best_region[1]['mean']:,.0f}")
print(f"Risk of loss: {best_region[1]['risk']:.2%}")

✅ Best region for development: Region 1
Expected profit: $6,651,177
Risk of loss: 0.20%


In [42]:
results_df = pd.DataFrame({
    "Region": ["Region 0", "Region 1", "Region 2"],
    "Average Profit ($)": [mean_0, mean_1, mean_2],
    "95% CI Lower ($)": [ci_0[0.025], ci_1[0.025], ci_2[0.025]],
    "95% CI Upper ($)": [ci_0[0.975], ci_1[0.975], ci_2[0.975]],
    "Risk of Loss (%)": [risk_0 * 100, risk_1 * 100, risk_2 * 100]
})


results_df = results_df.round(2)
print("\nSummary Table:\n")
display(results_df)


Summary Table:



Unnamed: 0,Region,Average Profit ($),95% CI Lower ($),95% CI Upper ($),Risk of Loss (%)
0,Region 0,6061226.32,100894.12,12463709.81,2.5
1,Region 1,6651176.54,1808515.85,12057104.61,0.2
2,Region 2,5851036.38,-8369.42,12120508.98,2.6


## ✅ Final Summary: Recommendation for Oil Well Development

After analyzing data from all three regions, the following insights were obtained:

- **Linear regression models** were trained and validated for each region.
- **Region 1** consistently showed the highest predicted profits and the lowest risk of loss.
- Bootstrapping confirmed that **Region 1 has an average profit exceeding the break-even point**, with a risk of loss well below 2.5%.

### 🏆 Recommendation:
Develop the new oil wells in **Region 1**, as it offers the best balance of profitability and low financial risk.
