<a href="https://colab.research.google.com/github/Xrenya/SaturationMapping/blob/main/Baseline_1D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

RANDOM_STATE = 66

In [2]:
files = {"porosity": "/content/por.txt", 
         "resistivity": "/content/res.txt",
         "saturation": "/content/swat.txt"}
data_dict = {}
for file in files:
  with open(files[file], "r") as f:
    data_dict[file] = f.read().splitlines()

In [3]:
df = pd.DataFrame(data_dict)
df.sample(5)
df["porosity"] = pd.to_numeric(df["porosity"], errors="coerce")
df["resistivity"] = pd.to_numeric(df["resistivity"], errors="coerce")
df["saturation"] = pd.to_numeric(df["saturation"], errors="coerce")

In [4]:
X = df.drop(columns=["saturation"])
y = df["saturation"]
print(X.shape, y.shape)
X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                  test_size=0.2,
                                                  random_state=RANDOM_STATE)

(180000, 2) (180000,)


In [None]:
def model_pipeline(model, X_train, y_train, X_val, y_val):
  pipeline = Pipeline([
                     ("scaler", StandardScaler()),
                     ("model", model)
  ])
  pipeline.fit(X_train, y_train)
  preds = pipeline.predict(X_val)
  model_score = mean_squared_error(y_val, preds)
  return model_score

In [None]:
KNN = KNeighborsRegressor(
    n_neighbors=5, 
    weights="uniform",
    algorithm="auto",
)

In [None]:
model_pipeline(KNN, X_train, y_train, X_val, y_val)

0.05850681338900937

In [None]:
rf = RandomForestRegressor(n_estimators=100,
                           criterion='mse',
                           random_state=RANDOM_STATE)

model_pipeline(rf, X_train, y_train, X_val, y_val)

0.015908294510223867

In [None]:
def model_pipeline(model, X_train, y_train, X_val, y_val):
  pipeline = Pipeline([
                     ("scaler", StandardScaler()),
                     ("model", model)
  ])
  pipeline.fit(X_train, y_train)
  preds = pipeline.predict(X_val)
  model_score = mean_squared_error(y_val, preds)
  return model_score

In [None]:
rf = RandomForestRegressor(n_estimators=100,
                           criterion='mse',
                           max_features = 'auto',
                           min_samples_split=3,
                           bootstrap=True,
                           random_state=RANDOM_STATE)

model_pipeline(rf, X_train, y_train, X_val, y_val)

0.016067949367058375

In [None]:
rf = RandomForestRegressor(n_estimators=1000,
                           criterion='mse',
                           random_state=RANDOM_STATE)

model_pipeline(rf, X_train, y_train, X_val, y_val)

0.01569666275706869

In [10]:
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [50, 100, 200, 300, 1000]
max_features = ['auto', 'sqrt']
max_depth = [2, 4, 6]
max_depth.append(None)
min_samples_split = [2, 4, 6]
min_samples_leaf = [2, 4, 6]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf, 
                               param_distributions=random_grid, 
                               n_iter=50, 
                               cv=5,
                               verbose=2,
                               random_state=RANDOM_STATE,
                               n_jobs = -1)

{'n_estimators': [50, 100, 200, 300, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 4, 6, None], 'min_samples_split': [2, 4, 6], 'min_samples_leaf': [2, 4, 6], 'bootstrap': [True, False]}


In [11]:
def model_pipeline_rf(model, X_train, y_train, X_val, y_val):
  pipeline = Pipeline([
                     ("scaler", StandardScaler()),
                     ("model", model)
  ])
  pipeline.fit(X_train, y_train)
  print(model.best_params_)
  best_random = model.best_estimator_
  preds = best_random.predict(X_val)
  model_score = mean_squared_error(y_val, preds)
  return model_score

In [12]:
model_pipeline_rf(rf_random, X_train, y_train, X_val, y_val)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 73.1min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 130.4min finished


{'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'bootstrap': True}


0.12972336352496333

In [None]:
#!pip install catboost
from catboost import CatBoostRegressor

model = CatBoostRegressor(iterations=100,
                          learning_rate=0.1,
                          depth=9)

model.fit(X_train, y_train)

preds = model.predict(X_val)

score = mean_squared_error(y_val, preds)
print(score)

0:	learn: 0.2881066	total: 25.7ms	remaining: 2.55s
1:	learn: 0.2845907	total: 49.2ms	remaining: 2.41s
2:	learn: 0.2815621	total: 74.2ms	remaining: 2.4s
3:	learn: 0.2791340	total: 98.2ms	remaining: 2.36s
4:	learn: 0.2770590	total: 122ms	remaining: 2.31s
5:	learn: 0.2753956	total: 146ms	remaining: 2.29s
6:	learn: 0.2739544	total: 171ms	remaining: 2.27s
7:	learn: 0.2728017	total: 195ms	remaining: 2.25s
8:	learn: 0.2718405	total: 220ms	remaining: 2.22s
9:	learn: 0.2710545	total: 249ms	remaining: 2.24s
10:	learn: 0.2703740	total: 282ms	remaining: 2.29s
11:	learn: 0.2697642	total: 307ms	remaining: 2.25s
12:	learn: 0.2692276	total: 332ms	remaining: 2.22s
13:	learn: 0.2687868	total: 356ms	remaining: 2.19s
14:	learn: 0.2684080	total: 380ms	remaining: 2.15s
15:	learn: 0.2681155	total: 403ms	remaining: 2.12s
16:	learn: 0.2678314	total: 426ms	remaining: 2.08s
17:	learn: 0.2675723	total: 453ms	remaining: 2.06s
18:	learn: 0.2673122	total: 477ms	remaining: 2.03s
19:	learn: 0.2670659	total: 500ms	rema

In [None]:
import xgboost as xgb

model = xgb.XGBRegressor(objective="reg:squarederror",
                         colsample_bytree=0.5,
                         learning_rate=0.2,
                         max_depth=30,
                         alpha=10,
                         n_estimators=300)

model.fit(X_train, y_train)

preds = model.predict(X_val)

score = mean_squared_error(y_val, preds)
print(score)

0.021484273822412397


In [None]:
import lightgbm as lgb

params = {
    "boositing_type": "gbdt",
    "objective": "regression",
    "metrics": "mse",
    "max_depth": 12,
    "learning_rate": 0.01,
    "early_stoping_round": 40
}
n_estimators = 100

d_train = lgb.Dataset(X_train, label=y_train)

model = lgb.train(params, 
                  d_train,
                  n_estimators,
                  verbose_eval=1)

preds = model.predict(X_val)

score = mean_squared_error(y_val, preds)
print(score)

0.06540129321137311
