In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Load datasets
data_0 = pd.read_csv('/datasets/geo_data_0.csv')
data_1 = pd.read_csv('/datasets/geo_data_1.csv')
data_2 = pd.read_csv('/datasets/geo_data_2.csv')

In [3]:
# Analyzing datasets
data_0

Unnamed: 0,id,f0,f1,f2,product
0,txEyH,0.705745,-0.497823,1.221170,105.280062
1,2acmU,1.334711,-0.340164,4.365080,73.037750
2,409Wp,1.022732,0.151990,1.419926,85.265647
3,iJLyR,-0.032172,0.139033,2.978566,168.620776
4,Xdl7t,1.988431,0.155413,4.751769,154.036647
...,...,...,...,...,...
99995,DLsed,0.971957,0.370953,6.075346,110.744026
99996,QKivN,1.392429,-0.382606,1.273912,122.346843
99997,3rnvd,1.029585,0.018787,-1.348308,64.375443
99998,7kl59,0.998163,-0.528582,1.583869,74.040764


In [4]:
data_1

Unnamed: 0,id,f0,f1,f2,product
0,kBEdx,-15.001348,-8.276000,-0.005876,3.179103
1,62mP7,14.272088,-3.475083,0.999183,26.953261
2,vyE1P,6.263187,-5.948386,5.001160,134.766305
3,KcrkZ,-13.081196,-11.506057,4.999415,137.945408
4,AHL4O,12.702195,-8.147433,5.004363,134.766305
...,...,...,...,...,...
99995,QywKC,9.535637,-6.878139,1.998296,53.906522
99996,ptvty,-10.160631,-12.558096,5.005581,137.945408
99997,09gWa,-7.378891,-3.084104,4.998651,137.945408
99998,rqwUm,0.665714,-6.152593,1.000146,30.132364


In [5]:
data_2

Unnamed: 0,id,f0,f1,f2,product
0,fwXo0,-1.146987,0.963328,-0.828965,27.758673
1,WJtFt,0.262778,0.269839,-2.530187,56.069697
2,ovLUW,0.194587,0.289035,-5.586433,62.871910
3,q6cA6,2.236060,-0.553760,0.930038,114.572842
4,WPMUX,-0.515993,1.716266,5.899011,149.600746
...,...,...,...,...,...
99995,4GxBu,-1.777037,1.125220,6.263374,172.327046
99996,YKFjq,-1.261523,-0.894828,2.524545,138.748846
99997,tKPY3,-1.199934,-2.957637,5.219411,157.080080
99998,nmxp2,-2.419896,2.417221,-5.548444,51.795253


In [6]:
# Check for nulls
print(data_0.isnull().sum())
print(data_1.isnull().sum())
print(data_2.isnull().sum())

id         0
f0         0
f1         0
f2         0
product    0
dtype: int64
id         0
f0         0
f1         0
f2         0
product    0
dtype: int64
id         0
f0         0
f1         0
f2         0
product    0
dtype: int64


- There are no missing vaues here


In [7]:
# Split the data into a training set and validation set at a ratio of 75:25 and 
# Train the model and make predictions for the validation set
def train_model(data):
    X = data.drop(['product', 'id'], axis=1)
    y = data['product']
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = pd.Series(model.predict(X_valid), index=y_valid.index)
    
    rmse = mean_squared_error(y_valid, predictions, squared=False)
    return model, predictions, y_valid, rmse


In [8]:
# Profit calculation
def calculate_profit(predictions, targets, count=200, price_per_barrel=4500, budget=100_000_000):
    selected_indices = predictions.sort_values(ascending=False).index[:count]
    selected_targets = targets[selected_indices]
    revenue = selected_targets.sum() * price_per_barrel
    return revenue - budget


In [9]:
def bootstrap_profit(predictions, target, n_samples=1000):
    values = []
    for _ in range(n_samples):
        sample_indices = np.random.choice(predictions.index, size=500, replace=True)
        sample_predictions = predictions[sample_indices]
        sample_target = target[sample_indices]
        profit = calculate_profit(sample_predictions, sample_target)
        values.append(profit)
    return pd.Series(values)


In [17]:
#Risks and profit calculation for each region
def evaluate_region(data):
    model, predictions, y_valid, rmse = train_model(data)
    
    print("Predicted mean:", predictions.mean())
    print("RMSE:", rmse)
    
    profits = bootstrap_profit(predictions, y_valid)
    avg_profit = profits.mean()
    lower = profits.quantile(0.025)
    risk = (profits < 0).mean()
    
    print("Average profit:", avg_profit)
    print("95% CI lower bound:", lower)
    print("Risk of loss: {:.2f}%".format(risk * 100))
    
    return avg_profit, risk


In [18]:
# Evaluate all three regions
print("📍 Region 0:")
avg_0, risk_0 = evaluate_region(data_0)
print("\n📍 Region 1:")
avg_1, risk_1 = evaluate_region(data_1)
print("\n📍 Region 2:")
avg_2, risk_2 = evaluate_region(data_2)


📍 Region 0:
Predicted mean: 92.39879990657768
RMSE: 37.75660035026169
Average profit: 6055104.054067389
95% CI lower bound: 125992.34531071821
Risk of loss: 2.20%

📍 Region 1:
Predicted mean: 68.71287803913762
RMSE: 0.890280100102884
Average profit: 6403972.008644161
95% CI lower bound: 1708980.5669013709
Risk of loss: 0.60%

📍 Region 2:
Predicted mean: 94.77102387765939
RMSE: 40.145872311342174
Average profit: 5890645.760350293
95% CI lower bound: -465300.9088062378
Risk of loss: 3.00%


In [19]:
# Final decision
results = {
    'Region 0': {'avg_profit': avg_0, 'risk': risk_0},
    'Region 1': {'avg_profit': avg_1, 'risk': risk_1},
    'Region 2': {'avg_profit': avg_2, 'risk': risk_2}
}

# Filter by risk threshold
filtered = {k: v for k, v in results.items() if v['risk'] < 0.025}
recommended = max(filtered, key=lambda x: filtered[x]['avg_profit'])

print(f"\n✅ Recommended Region: {recommended}")



✅ Recommended Region: Region 1
