# Machine Learning

## Módulos a emplear

In [31]:
# Importamos las librerias a utilizar
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures, scale, StandardScaler
from sklearn.decomposition import PCA
import sys
import os

# Make the project's local helper modules importable.
# The src/ directory is resolved relative to the current working directory,
# which is taken here as the notebook's directory.
notebook_dir = os.getcwd()
scripts_dir = os.path.join(notebook_dir, "..", "src")
# Re-running this cell would otherwise append the same directory to sys.path
# on every execution, so only add it when it is not already present.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)
from data_transform.print_unique_values import print_unique_values

# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Carga y selección de la Información

### Seleccionamos la información del pozo clave y la editamos

In [32]:
# Path to the processed dataset, built relative to the notebook directory
file_processed = "data_tokal_eval.csv"
path_data_processed = os.path.join(
    notebook_dir,
    "..",
    "data",
    "processed",
    file_processed,
)

# Read the processed data into a DataFrame
tokal_df = pd.read_csv(filepath_or_buffer=path_data_processed)

# List the wells present in the DataFrame
print_unique_values(tokal_df, "wellname")

Valores únicos en la columna 'wellname': TOKAL-1, TOKAL-2, TOKAL-3


In [33]:
# Well(s) whose data is used to train the model
key_wells_lst = ["TOKAL-1"]

# Columns of interest: zone/top indicator columns, input curves and the curve to predict
key_well_tops = ["arena_3", "arena_4", "c. inferior", "c. superior"]
features = ["gr", "rp", "rhob", "nphi", "dtco", "dtsm", "vsh", "phie", "phit"]
target = ["uwater"]

# Independent copy of the rows that belong to the selected well(s)
is_key_well = tokal_df["wellname"].isin(key_wells_lst)
key_well = tokal_df[is_key_well].copy()

# Keep only the modelling columns and discard every row with a missing value
ml_columns = features + target + key_well_tops
key_well_df_ml = key_well[ml_columns].dropna()

## Modelo de Machine Learning

In [34]:
# Predictors (input curves + zone/top indicator columns) and the column to predict
X = key_well_df_ml[features + key_well_tops]
y = key_well_df_ml[target]

# 80/20 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=32,
)

# Flatten the single-column training target into a 1-D array before fitting
y_train = np.ravel(y_train.values)

# Random forest regressor: Poisson split criterion, bootstrap sampling,
# all available cores, fixed seed
rf_regressor = RandomForestRegressor(
    criterion="poisson",
    bootstrap=True,
    n_jobs=-1,
    random_state=50,
)

# Fit the model on the training set
rf_regressor.fit(X_train, y_train)

# Predict on both partitions to compare training and test error
y_train_pred = rf_regressor.predict(X_train)
y_test_pred = rf_regressor.predict(X_test)

# RMSE is the square root of the mean squared error of each partition
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rf_train = np.sqrt(mse_train)
rf_test = np.sqrt(mse_test)

print(f'Random forest train/test RMSE: {rf_train: .3f}/{rf_test:.3f}')


Random forest train/test RMSE:  0.001/0.003


## Aplicamos el modelo de Machine Learning

In [35]:
# Well for which the synthetic log is generated
well_target = "TOKAL-3"

# Independent copy of the target well's rows, reduced to the identification
# columns plus the model inputs, without rows that have missing values
is_target_well = tokal_df["wellname"] == well_target
predict_well = tokal_df[is_target_well].copy()
predict_columns = ["wellname", "md"] + features + key_well_tops
predict_well = predict_well[predict_columns]
predict_well = predict_well.dropna()

# Model input: same columns, in the same order, as the training predictors
predict_well_ml = predict_well[features + key_well_tops].copy()

# Run the trained model to obtain the synthetic log
y_pred_well_log = rf_regressor.predict(predict_well_ml)

# Attach the synthetic log as "<target>_syn"
syn_column = f"{target[0]}_syn"
predict_well[syn_column] = y_pred_well_log

In [36]:
# Store the synthetic curve in the target well's accumulated dataset.
# qe_df used to be created only by a commented-out line, so this cell raised
# NameError on a fresh kernel. Create it from predict_well on first use and
# reuse the existing frame otherwise.
# NOTE(review): the recorded output shows several "*_syn" columns, which
# suggests qe_df is meant to accumulate curves across runs with different
# targets - confirm.
if "qe_df" not in globals():
    qe_df = predict_well.copy()
# Assignment aligns on the index, so both frames must share the same row labels
qe_df[target[0] + "_syn"] = predict_well[target[0] + "_syn"]
qe_df

Unnamed: 0,wellname,md,gr,rp,rhob,nphi,dtco,dtsm,vsh,phie,...,arena_3,arena_4,c. inferior,c. superior,quartz_syn,illite_syn,bound water_syn,k-feldspar_syn,uoil_syn,uwater_syn
90465,TOKAL-3,3056.077393,51.541515,1.499222,2.372968,0.217665,77.020908,186.206454,0.469088,0.138145,...,False,False,False,True,0.394058,0.418625,0.051836,0.019461,0.000012,0.151937
90466,TOKAL-3,3056.229736,49.185925,1.627110,2.301361,0.266638,79.060778,186.995901,0.498102,0.130885,...,False,False,False,True,0.355328,0.444948,0.054198,0.021332,0.000009,0.147655
90467,TOKAL-3,3056.382080,47.912537,1.765783,2.296071,0.329390,83.052604,185.452339,0.528686,0.123385,...,False,False,False,True,0.276328,0.472327,0.058118,0.053587,0.000138,0.135308
90468,TOKAL-3,3056.534424,51.273327,1.765934,2.363587,0.374565,87.344230,181.003533,0.559989,0.116485,...,False,False,False,True,0.216819,0.499710,0.061429,0.078397,0.000360,0.131759
90469,TOKAL-3,3056.687012,50.268745,1.671849,2.440886,0.372892,90.195861,176.516123,0.590998,0.110114,...,False,False,False,True,0.205183,0.528197,0.065039,0.073206,0.000014,0.123781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97034,TOKAL-3,4057.192871,63.773464,1.760714,2.536641,0.293183,85.049125,184.487368,0.965696,0.017580,...,True,False,False,False,0.000997,0.854337,0.106336,0.000217,0.000001,0.040610
97035,TOKAL-3,4057.345215,65.971161,1.746591,2.519919,0.289012,84.470510,176.374028,0.965139,0.017659,...,True,False,False,False,0.001822,0.854927,0.106329,0.000124,0.000001,0.040005
97036,TOKAL-3,4057.497803,66.255798,1.714331,2.512301,0.286650,84.764831,181.163501,0.964485,0.017775,...,True,False,False,False,0.002067,0.854314,0.106247,0.000031,0.000001,0.040322
97037,TOKAL-3,4057.650146,62.815662,1.672484,2.515662,0.286839,85.441126,183.310718,0.963665,0.017885,...,True,False,False,False,0.002378,0.851985,0.105761,0.000074,0.000001,0.041325


In [39]:
# Write the accumulated synthetic curves of the target well to a CSV file
synthetic_file_name = f"{well_target}_qe_syn.csv"
path_to_save = os.path.join(
    notebook_dir, "..", "data", "processed", "tokal", synthetic_file_name
)

# Identification columns followed by the synthetic curves to export;
# all of these must already exist in qe_df
qe_export_columns = [
    "wellname",
    "md",
    "quartz_syn",
    "illite_syn",
    "bound water_syn",
    "k-feldspar_syn",
    "uoil_syn",
    "uwater_syn",
]
qe_df[qe_export_columns].to_csv(path_to_save, index=False)

## Guardamos la curva sintética del modelo de Machine Learning

In [6]:
# Write the synthetic curve of the current target to its own CSV file
synthetic_file_name = f"{well_target}_{target[0]}_syn.csv"
path_to_save = os.path.join(
    notebook_dir, "..", "data", "processed", "tokal", synthetic_file_name
)

# Export only the identification columns and the synthetic curve, without the index
syn_export = predict_well[["wellname", "md", target[0] + "_syn"]]
syn_export.to_csv(path_to_save, index=False)