# Oily Giant Mining; Geological Exploration, by Deborah Thomas

<H2>This data mining will answer the question: "What is the best geographical site for a new well?"</H2>

- I will find this answer by studying existing oil quality and volume of reserves. 
- I will build a predictive model that will help predict the geographic location that has the potential for highest profit.
- I will predict the volume of reserves in this new well.

## <div style="color: red; border: 2px solid yellow; display: inline-block;">Import libraries</div>

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from IPython.display import display, HTML

import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.io as pio

from termcolor import colored

## <div style="color: red; border: 2px solid yellow; display: inline-block;">Load in 3 datasets:</div>

### ( 1 dataset for each region )

In [None]:
try:
    region1 = pd.read_csv('/datasets/geo_data_0.csv')  # Attempt to read from the server path
except FileNotFoundError:
    region1 = pd.read_csv('../datasets/geo_data_0.csv')  # Fallback to the local path


display(region1.head(10))

In [None]:
try:
    region2 = pd.read_csv('/datasets/geo_data_1.csv')  # Attempt to read from the server path
except FileNotFoundError:
    region2 = pd.read_csv('../datasets/geo_data_1.csv')  # Fallback to the local path


display(region2.head(10))

In [None]:
try:
    region3 = pd.read_csv('/datasets/geo_data_2.csv')  # Attempt to read from the server path
except FileNotFoundError:
    region3 = pd.read_csv('../datasets/geo_data_2.csv')  # Fallback to the local path


display(region3.head(10))

### Explanation of the columns :
- id: unique oil well identifier
- f0, f1, f2: three features of points
- product: volume of reserves in the oil well (in thousands of barrels).

### Explanation of value of each unit:
- 1 unit = 1,000 barrels
- 1 barrel's worth = \$4.50 USD.
- 1 unit's worth = **\$4.50** times 1,000 = **\$4,500 USD**

In [None]:
print("Region 1 dataset:")
print(region1.shape)

In [None]:
print("Region 2 dataset:")
print(region2.shape)

In [None]:
print("Region 3 dataset:")
print(region3.shape)

#### All 3 datasets have the same number of rows and columns.

## <div style="color: red; border: 2px solid yellow; display: inline-block;">Clean the Data</div>

In [None]:
region1.info()

### Rename columns:

#### Change 'id' to 'well_id' to make it clear that each row represents one well.  Change 'product' to 'vol_reserves' to make it clear that this column is the total volume of reserves.

In [None]:
region1.rename(columns={
    'id': 'well_id',
    'product': 'vol_reserves'
}, inplace=True)

In [None]:
region1.columns

In [None]:
region2.rename(columns={
    'id': 'well_id',
    'product': 'vol_reserves'
}, inplace=True)

In [None]:
region2.columns

In [None]:
region3.rename(columns={
    'id': 'well_id',
    'product': 'vol_reserves'
}, inplace=True)

In [None]:
region3.columns

#### Datatypes, on all three datasets, look good.

#### There is no missing data, on any of the three datasets.

### Drop duplicates

In [None]:
region1 = region1.drop_duplicates(subset='well_id')

In [None]:
region1.shape

In [None]:
region2 = region2.drop_duplicates(subset='well_id')

In [None]:
region2.shape

In [None]:
region3 = region3.drop_duplicates(subset='well_id')

In [None]:
region3.shape

#### Data is clean

#### ... but, region1 now has 6 fewer rows than region2 and region3.

## <div style="color: red; border: 2px solid yellow; display: inline-block;">Exploratory data analysis:</div>

## <span style="color:blue;">Region 1:</span>

In [None]:
region1.describe()

In [None]:
# Generate the summary statistics
summary_stats_reg1 = region1.describe()

In [None]:
# Extract the mean of the 'vol_reserves' column (originally named 'product'), rounded to 2 decimals
mean_reserves_reg1 = round(summary_stats_reg1.loc['mean', 'vol_reserves'], 2)
print(mean_reserves_reg1)

In [None]:
# Extract the mean value for the features
mean_f0_reg1 = round(summary_stats_reg1.loc['mean', 'f0'], 2)
mean_f1_reg1 = round(summary_stats_reg1.loc['mean', 'f1'], 2)
mean_f2_reg1 = round(summary_stats_reg1.loc['mean', 'f2'], 2)

In [None]:
# HTML styled string
formatted_string_reg1_mean = f'<span style="color:blue; font-size:16px;">The average amount of oil reserves, in region 1 is: &nbsp; {mean_reserves_reg1:.2f}</span>'
formatted_string_reg1_mean_features = f'<span style="color:blue; font-size:16px;">The mean values of features, in region 1 are: &nbsp; f0: &nbsp; {mean_f0_reg1:.2f}, &nbsp;f1: &nbsp; {mean_f1_reg1:.2f}, &nbsp;f2: &nbsp;{mean_f2_reg1:.2f}</span>'

# Display the formatted string
display(HTML(formatted_string_reg1_mean))
display(HTML(formatted_string_reg1_mean_features))

In [None]:
# Data for plotting region1 mean values
features = ['f0', 'f1', 'f2', 'vol_reserves']
values = [mean_f0_reg1, mean_f1_reg1, mean_f2_reg1, mean_reserves_reg1]

In [None]:
pio.renderers.default = 'png'

# Create a bar plot
fig1 = go.Figure(data=[
    go.Bar(
        x=features, 
        y=values, 
        marker_color=['lightblue', 'lightblue', 'lightblue', 'blue'],
        text=values,  # hover text uses 'text' attribute
        hoverinfo='text',  # display text when hovering
    )
])

# Add titles and labels
fig1.update_layout(
    title={
        'text': 'Average of Features and Volume Reserves in Region 1',
        'x': 0.5,  # Center the title
        'xanchor': 'center',
    },
    xaxis_title='Features',
    yaxis_title='Mean Values',
    font=dict(size=14),
)

fig1.show()

## <span style="color:limegreen;">Region 2:</span>

In [None]:
region2.describe()

In [None]:
# Generate the summary statistics
summary_stats_reg2 = region2.describe()

In [None]:
# Extract the mean of the 'vol_reserves' column (originally named 'product'), rounded to 2 decimals
mean_reserves_reg2 = round(summary_stats_reg2.loc['mean', 'vol_reserves'], 2)
print(mean_reserves_reg2)

In [None]:
# Extract the mean value for the features
mean_f0_reg2 = round(summary_stats_reg2.loc['mean', 'f0'], 2)
mean_f1_reg2 = round(summary_stats_reg2.loc['mean', 'f1'], 2)
mean_f2_reg2 = round(summary_stats_reg2.loc['mean', 'f2'], 2)

In [None]:
# HTML styled string
formatted_string_reg2_mean_reserves = f'<span style="color:limegreen; font-size:16px;">The average amount of oil reserves, in region 2 is: &nbsp;  {mean_reserves_reg2:.2f}</span>'
formatted_string_reg2_mean_features = f'<span style="color:limegreen; font-size:16px;">The mean values of features, in region 2 are: &nbsp; f0:&nbsp; {mean_f0_reg2:.2f}, &nbsp;f1:&nbsp; {mean_f1_reg2:.2f}, &nbsp; f2: &nbsp;{mean_f2_reg2:.2f}</span>'

# Display the formatted string
display(HTML(formatted_string_reg2_mean_reserves))
display(HTML(formatted_string_reg2_mean_features))

In [None]:
# Data for plotting region2 mean values
features2 = ['f0', 'f1', 'f2', 'vol_reserves']
values2 = [mean_f0_reg2, mean_f1_reg2, mean_f2_reg2, mean_reserves_reg2]

In [None]:
# Bar chart of the Region 2 means: the three features plus the reserve volume
fig2 = go.Figure(data=[
    go.Bar(
        x=features2,
        y=values2,
        marker_color=['lightgreen', 'lightgreen', 'lightgreen', 'limegreen'],
        text=values2,  # label each bar with its own Region 2 mean
        hoverinfo='text',  # display text when hovering
    )
])

# Add titles and labels
fig2.update_layout(
    title={
        'text': 'Average of Features and Volume Reserves in Region 2',
        'x': 0.5,  # Center the title
        'xanchor': 'center',
    },
    xaxis_title='Features',
    yaxis_title='Mean Values',
    font=dict(size=14),
)

fig2.show()

#### Interesting... f1 mean is below zero.

## <span style="color:orange;">Region 3:</span>

In [None]:
region3.describe()

In [None]:
# Generate the summary statistics
summary_stats_reg3 = region3.describe()

In [None]:
# Extract the mean of the 'vol_reserves' column (originally named 'product'), rounded to 2 decimals
mean_reserves_reg3 = round(summary_stats_reg3.loc['mean', 'vol_reserves'], 2)
print(mean_reserves_reg3)

In [None]:
# Extract the mean value for the features
mean_f0_reg3 = round(summary_stats_reg3.loc['mean', 'f0'], 2)
mean_f1_reg3 = round(summary_stats_reg3.loc['mean', 'f1'], 2)
mean_f2_reg3 = round(summary_stats_reg3.loc['mean', 'f2'], 2)

In [None]:
# HTML styled string
formatted_string_reg3_mean_reserves = f'<span style="color:orange; font-size:16px;">The average amount of oil reserves, in region 3 is: &nbsp; {mean_reserves_reg3:.2f}</span>'
formatted_string_reg3_mean_features = f'<span style="color:orange; font-size:16px;">The mean values of features, in region 3 are: &nbsp; f0: &nbsp; {mean_f0_reg3:.2f}, &nbsp; f1: &nbsp; {mean_f1_reg3:.2f}, &nbsp; f2: &nbsp; {mean_f2_reg3:.2f}</span>'

# Display the formatted string
display(HTML(formatted_string_reg3_mean_reserves))
display(HTML(formatted_string_reg3_mean_features))

In [None]:
# Data for plotting region3 mean values
features3 = ['f0', 'f1', 'f2', 'vol_reserves']
values3 = [mean_f0_reg3, mean_f1_reg3, mean_f2_reg3, mean_reserves_reg3]

In [None]:
# Bar chart of the Region 3 means: the three features plus the reserve volume
fig3 = go.Figure(data=[
    go.Bar(
        x=features3,
        y=values3,
        marker_color=['coral', 'coral', 'coral', 'orange'],
        text=values3,  # label each bar with its own Region 3 mean
        hoverinfo='text',  # display text when hovering
    )
])

# Add titles and labels
fig3.update_layout(
    title={
        'text': 'Average of Features and Volume Reserves in Region 3',
        'x': 0.5,  # Center the title
        'xanchor': 'center',
    },
    xaxis_title='Features',
    yaxis_title='Mean Values',
    font=dict(size=14),
)

fig3.show()

#### f0 and f1 mean are both very close to zero, so nothing shows on the graph for those means.

<div style="color: red; padding: 10px;">
    <h3>Volume of Reserves for all 3 regions:</h3>
</div>

In [None]:
# Data for plotting
regions = ['Region 1', 'Region 2', 'Region 3']
mean_vol_reserves = [mean_reserves_reg1, mean_reserves_reg2, mean_reserves_reg3]
colors = ['blue', 'limegreen', 'orange']

In [None]:
# Create a bar plot
fig = go.Figure(data=[
    go.Bar(
        x=regions, 
        y=mean_vol_reserves, 
        marker_color=colors,
        text=mean_vol_reserves,  # hover text uses 'text' attribute
        hoverinfo='text',  # display text when hovering
    )
])

# Add titles and labels
fig.update_layout(
    title={
        'text': 'Average Volume Reserves by Region',
        'x': 0.5,  # Center the title
        'xanchor': 'center',
    },
    xaxis_title='Regions',
    yaxis_title='Mean Volume Reserves',
    font=dict(size=14),
)

fig.show()

#### Region 3 has the highest volume of reserves. Region 2 has the lowest.

## <div style="color: red; border: 2px solid yellow; display: inline-block;">Predictions for best location for new well:</div>

<h2><b>This section will:</b></h2>
<ul>
    <li>Split the data into training and validation sets.</li>
    <li>Scale the data</li>
    <li>Train a linear regression model for each region.</li>
    <li>Evaluate the model's performance using RMSE (Root Mean Squared Error).</li>
</ul>

<div style="color: red; padding: 10px;">
    <h3>Split the 3 datasets:</h3>
</div>

In [None]:
def split_data(df, feature_cols, target_col, test_size=0.25, random_state=12345):
    """Divide one region's dataframe into training and validation parts.

    Parameters
    ----------
    df : DataFrame holding both the feature columns and the target column.
    feature_cols : column labels selected as model inputs.
    target_col : column label selected as the value to predict.
    test_size : forwarded to ``train_test_split`` (default 0.25).
    random_state : seed forwarded to ``train_test_split`` so the split is
        reproducible.

    Returns
    -------
    tuple
        ``(features_train, features_valid, target_train, target_valid)``.
    """
    inputs = df[feature_cols]
    outputs = df[target_col]

    # train_test_split yields [X_train, X_valid, y_train, y_valid]; hand the
    # four pieces back as a tuple in that same order.
    return tuple(
        train_test_split(inputs, outputs, test_size=test_size, random_state=random_state)
    )

In [None]:
# region1, region2 and region3 share the same column names, so the feature and
# target column lists are derived once (from region1) and reused for every region.
target_column = 'vol_reserves'
non_feature_columns = ('well_id', target_column)
feature_columns = [name for name in region1.columns if name not in non_feature_columns]

print(feature_columns)
print(target_column)

In [None]:
# Call the split_data function

In [None]:
features_train1, features_valid1, target_train1, target_valid1 = split_data(region1, feature_columns, target_column)
features_train2, features_valid2, target_train2, target_valid2 = split_data(region2, feature_columns, target_column)
features_train3, features_valid3, target_train3, target_valid3 = split_data(region3, feature_columns, target_column)

In [None]:
len(features_train1)

In [None]:
len(features_valid1)

In [None]:
len(target_train1)

In [None]:
len(target_valid1)

- <b>Features: variables (input data) used to make predictions.</b>
<br>
- <b>Target:  variable (output data) that I am trying to predict.</b>

<H3>Purpose of Each Split</H3>
<br>
1. <b>features_train:</b> 75% of the feature data.
<br>
Purpose: Used to train the machine learning model. The model learns the patterns and relationships within this data.
<br>
<br>
2. <b>features_valid:</b> 25% of the feature data.
<br>
Purpose: Used to validate the model’s performance. This data is not seen by the model during training, providing an unbiased evaluation of the model’s predictive performance.

<br>   
<br>
3. <b>target_train:</b> 75% of the target data.
<br>
Purpose: Represents the actual target values corresponding to features_train. The model uses this data during training to learn the mapping from features to target values.
<br>
<br>
4. <b>target_valid:</b> 25% of the target data.<br>
Purpose: Represents the actual target values corresponding to features_valid. Used to evaluate the model’s performance by comparing the model's predictions to these actual values.

<div style="color: red; padding: 10px;">
    <h3>Scale the features:</h3>
</div>

In [None]:
def scale_data(features_train, features_valid):
    """Standardize both feature sets with a scaler fitted on the training set.

    The ``StandardScaler`` is fitted on ``features_train`` only and then
    applied to both sets, so the validation data never influences the
    scaling parameters.

    Returns
    -------
    tuple
        ``(features_train_scaled, features_valid_scaled)``.
    """
    standardizer = StandardScaler().fit(features_train)
    return standardizer.transform(features_train), standardizer.transform(features_valid)

In [None]:
# Scale data for each region
features_train1_scaled, features_valid1_scaled = scale_data(features_train1, features_valid1)
features_train2_scaled, features_valid2_scaled = scale_data(features_train2, features_valid2)
features_train3_scaled, features_valid3_scaled = scale_data(features_train3, features_valid3)

- Only the features are scaled.
- Scaling the target is generally not needed, especially for models like linear regression.

<div style="color: red; padding: 10px;">
    <h3>Create / Fit / Evaluate the Models:</h3>
</div>

In [None]:
def train_and_evaluate(features_train, features_valid, target_train, target_valid):
    """Fit a linear regression and score it on the validation set.

    Parameters
    ----------
    features_train, target_train : data the model is fitted on.
    features_valid, target_valid : data the fitted model is evaluated on.

    Returns
    -------
    tuple
        ``(model, rmse, avg_reserves, predictions)``: the fitted
        ``LinearRegression``, the root mean squared error on the validation
        set, the mean of the validation predictions, and the validation
        predictions themselves.
    """
    regressor = LinearRegression().fit(features_train, target_train)
    predicted = regressor.predict(features_valid)

    # RMSE is the square root of the mean squared error between truth and prediction.
    validation_rmse = np.sqrt(mean_squared_error(target_valid, predicted))

    return regressor, validation_rmse, np.mean(predicted), predicted

<div style="color: red; padding: 10px;">
    <h3>Call the Train / Evaluate function:</h3>
</div>

In [None]:
# Train and evaluate region 1
model1, rmse1, avg_reserves1, predictions1 = train_and_evaluate(features_train1_scaled, features_valid1_scaled, target_train1, target_valid1)
print(f"\033[94mRegion 1 RMSE: {rmse1:.2f}\nAverage Predicted Reserves: {avg_reserves1:.2f}\033[0m")

In [None]:
region1.vol_reserves.max()

In [None]:
# Train and evaluate region 2
model2, rmse2, avg_reserves2, predictions2  = train_and_evaluate(features_train2_scaled, features_valid2_scaled, target_train2, target_valid2)
print(colored(f"Region 2 RMSE: {rmse2:.2f}\nAverage Predicted Reserves: {avg_reserves2:.2f}", 'green'))

In [None]:
region2.vol_reserves.max()

In [None]:
# Train and evaluate region 3
model3, rmse3, avg_reserves3, predictions3 = train_and_evaluate(features_train3_scaled, features_valid3_scaled, target_train3, target_valid3)
print(f"\033[38;5;208mRegion 3 RMSE: {rmse3:.2f}\nAverage Predicted Reserves: {avg_reserves3:.2f}\033[0m")

In [None]:
region3.vol_reserves.max()

### RMSE outcomes (considering that the scale of the vol_reserves data is fairly small):

- region 1's RMSE score, of 37.85 is poor. There is high error in the model.
- region 2's RMSE score, of 0.89 is great. There is a very small error in the predictions.
- region 3's RMSE score, of 40 is poor. There is high error in the model.

### Predicted oil reserves outcome:

- Region 1 and Region 3 have the highest levels of predicted oil reserves.
- Region 2 had the lowest predicted level of oil reserves.

<div style="color: red; padding: 10px;">
    <h3>Pick the top 200 wells, from each region, with the highest values of predictions:</h3>
</div>

In [None]:
# Select the highest predicted values (200 by default) from one region's model output.
def get_top_n_predictions(predictions, top_n=200):
    """Return the ``top_n`` largest predictions as a list, largest first.

    ``predictions`` may be any iterable of comparable values. If it holds
    fewer than ``top_n`` items, all of them are returned.
    """
    ranked = list(predictions)
    ranked.sort(reverse=True)  # descending, so the best predictions come first
    return ranked[:top_n]

In [None]:
# Call the get_top_n_predictions function for each region 
top_n = 200
top_wells_high_val_pred1 = get_top_n_predictions(predictions1, top_n)
top_wells_high_val_pred2 = get_top_n_predictions(predictions2, top_n)
top_wells_high_val_pred3 = get_top_n_predictions(predictions3, top_n)

In [None]:
print(top_wells_high_val_pred1[:20])

In [None]:
print(min(top_wells_high_val_pred1))

In [None]:
print(top_wells_high_val_pred2[:20])

In [None]:
print(min(top_wells_high_val_pred2))

In [None]:
print(top_wells_high_val_pred3[:20])

In [None]:
print(min(top_wells_high_val_pred3))

<div style="color: red; padding: 10px;">
    <h3>Calculate the volume of reserves sufficient for developing 200 new wells without losses:</h3>
</div>

In [None]:
# Total development budget for the whole project, in USD
budget_total = 100000000

# Number of wells that the budget has to cover
number_of_wells = 200

# Each well's share of the budget, derived from number_of_wells so the two cannot drift apart
budget_per_oil_well = budget_total / number_of_wells
print("Budget per oil well: " + str(budget_per_oil_well))

# One unit of vol_reserves is 1,000 barrels, so this figure is revenue per unit
# of reserves, not per well. The name `rev_per_oil_well` is kept because the
# break-even cell below reads it.
rev_per_barrel = 4.5
rev_per_oil_well = rev_per_barrel * 1000
print("Revenue per unit of reserves (1,000 barrels): " + str(rev_per_oil_well))

In [None]:
# Break-even volume per well, in units of 1,000 barrels:
# total budget / revenue per unit = total units needed to recover the budget,
# and dividing that by the number of wells gives the required average per well.
min_reserve_vol = budget_total / rev_per_oil_well / number_of_wells

print(f"Minimum average volume of reserves for development without losses: {min_reserve_vol:,.2f} (per project oil well)")

#### In all 3 regions, the top 200 predicted oil wells meet the required minimum (an average of 111.11 units of reserves per well).

<div style="color: red; padding: 10px;">
    <h3>Profit Calculation:</h3>
</div>

In [None]:
pred_reserves = [avg_reserves1, avg_reserves2, avg_reserves3]

In [None]:
targets = [target_valid1, target_valid2, target_valid3]

In [None]:
def profit(predictions, target, budget=100_000_000, total_wells=500, top_wells=200, price_per_barrel=4.5):
    """Profit from developing the wells with the highest predicted reserves.

    The ``top_wells`` wells with the largest predicted values are chosen, and
    the profit is computed from the *actual* reserves of those wells.

    Parameters
    ----------
    predictions : 1-D sequence of predicted reserve volumes.
    target : 1-D sequence of actual reserve volumes, aligned with
        ``predictions`` by position. It may be a pandas Series with any
        index; only the order of the values is used.
    budget : total development cost in USD, subtracted from the revenue.
    total_wells : currently unused; kept so existing callers that pass it
        keep working.
    top_wells : number of best-predicted wells to develop (must be >= 1).
    price_per_barrel : revenue per barrel in USD. Volumes are in thousands
        of barrels, so revenue is ``volume * price_per_barrel * 1000``.

    Returns
    -------
    Revenue from the selected wells minus ``budget`` (a NumPy float).

    Raises
    ------
    ValueError
        If ``predictions`` and ``target`` differ in length, or if
        ``top_wells`` is less than 1.
    """
    predicted = np.asarray(predictions)
    # Work on the raw values: argsort yields positions, while indexing a
    # Series with integers looks them up as labels, and the labels are
    # shuffled after a train/validation split.
    actual = np.asarray(target)

    if len(predicted) != len(actual):
        raise ValueError(
            f"predictions and target must have the same length "
            f"(got {len(predicted)} and {len(actual)})"
        )
    if top_wells < 1:
        # A slice of [-0:] would silently select every well.
        raise ValueError("top_wells must be at least 1")

    # Positions of the `top_wells` largest predictions
    top_positions = np.argsort(predicted)[-top_wells:]

    # Actual reserves of the selected wells, in thousands of barrels
    total_volume = actual[top_positions].sum()

    revenue = total_volume * price_per_barrel * 1000  # thousands of barrels -> barrels
    return revenue - budget

<div style="color: red; padding: 10px;">
    <h3>Call the Profit function:</h3>
</div>

In [None]:
profit_region1 = profit(predictions1, target_valid1)