# 1. Import Python Libraries

In [306]:
import pandas as pd
from base import ROOT_DIR
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
%config Completer.use_jedi = False

## 1.1. Set up the path

In [3]:
# set paths of different subdirectories
dir_ml = ROOT_DIR / 'gtx/machine_learning'

# 2. Load the training and validation datasets

## 2.1. Load the dataset used for training the ML algorithms

In [4]:
training_data = pd.read_csv(dir_ml / 'data_eaglebine.csv')
training_data.head()

Unnamed: 0,UWI,Depth sub-sea (feet),True Temperature (oF),X,Y
0,42013301410000,-350,70.33,2597996.0,-350331.932781
1,42013301410000,-300,70.33,2597996.0,-350331.932781
2,42013301410000,-250,70.33,2597996.0,-350331.932781
3,42013301410000,-200,71.49,2597996.0,-350331.932781
4,42013301410000,-150,72.66,2597996.0,-350331.932781


In [5]:
training_data.columns

Index(['UWI', 'Depth sub-sea (feet)', 'True Temperature   (oF)', 'X', 'Y'], dtype='object')

In [6]:
# Rename columns
training_data.rename(columns={'Depth sub-sea (feet)':'SSTVD', 'True Temperature   (oF)':'TRUE_TEMP'}, inplace=True)
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66389 entries, 0 to 66388
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   UWI        66389 non-null  int64  
 1   SSTVD      66389 non-null  int64  
 2   TRUE_TEMP  66389 non-null  float64
 3   X          66389 non-null  float64
 4   Y          66389 non-null  float64
dtypes: float64(3), int64(2)
memory usage: 2.5 MB


## 2.2 Load the Validation dataset

In [7]:
val_data = pd.read_csv(dir_ml / 'test_egb.csv')
val_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   UWI     79 non-null     int64  
 1   SSTVD   79 non-null     float64
 2   X       79 non-null     float64
 3   Y       79 non-null     float64
dtypes: float64(3), int64(1)
memory usage: 2.6 KB


# 3. Call the Machine Learning Algorithms for Regressions

In [8]:
ridge = Ridge()
lasso = Lasso()
elas_net = ElasticNet()

## 3.1. Define the conditions prior the training of algorithms

In [9]:
# Define the feature variables and target variable
X = training_data[['SSTVD']].values.reshape(-1,1)
y = training_data[['TRUE_TEMP']].values.reshape(-1,1)

In [10]:
# Split the data in training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [11]:
# Define the alpha parameters to tune
lasso_params = {'alpha': [0.02, 0.024, 0.025, 0.026, 0.03]}
ridge_params = {'alpha': [0.1, 1.0, 10.0]}
elas_net_params = {'alpha': [0.01, 0.1, 0, 1, 10, 100]}

# 4. Train the Algorithms using Hiperparameters Tuning and Cross-Validation

In [12]:
# Function to train each algorithm

def training_model(model, param_grid, cv):
    model = GridSearchCV(model, param_grid=param_grid, cv=cv)
    model.fit(X_train, y_train)
    return model

## 4.1. Lasso Regression

In [13]:
# Lasso Regression
laso_model = training_model(lasso, lasso_params, 5)
laso_model.best_params_

{'alpha': 0.03}

In [14]:
laso_model_final = laso_model.best_estimator_.fit(X_train, y_train)

In [15]:
# Prediction on test set
y_pred_lasso = laso_model_final.predict(X_test)

In [16]:
# R2 of model
print(laso_model_final.score(X_train, y_train))
print(laso_model_final.score(X_test, y_test))

0.992153033347863
0.9922760869531096


In [17]:
# Mean Abosulte Error
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
print(mae_lasso)

5.2681136308521435


## 4.2. Ridge Regression

In [18]:
ridge_model = training_model(ridge, ridge_params, 5)
ridge_model.best_params_

{'alpha': 1.0}

In [19]:
# Train with the best model
ridge_model_final = ridge_model.best_estimator_.fit(X_train, y_train)

In [20]:
# Prediction on test set
y_pred_ridge = ridge_model_final.predict(X_test)

In [21]:
# R2 of model
print(ridge_model_final.score(X_train, y_train))
print(ridge_model_final.score(X_test, y_test))

0.9921530333478721
0.9922760868598206


In [22]:
# Mean Abosulte Error
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
print(mae_ridge)

5.268113857721729


## 4.3. Elastic-Net Regression

In [23]:
elast_net_model = training_model(elas_net, elas_net_params, 5)
elast_net_model.best_params_

  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'alpha': 1}

In [24]:
# Train with the best model
elasnet_model_final = elast_net_model.best_estimator_.fit(X_train, y_train)

In [25]:
# Prediction on test set
y_pred_elasnet = elasnet_model_final.predict(X_test)

In [26]:
# R2 of model
print(elasnet_model_final.score(X_train, y_train))
print(elasnet_model_final.score(X_test, y_test))

0.9921530333452299
0.9922760884388981


In [27]:
# Mean Abosulte Error
mae_elasnet = mean_absolute_error(y_test, y_pred_elasnet)
print(mae_elasnet)

5.2681100115111335


# 5. Select Best Model

Regarding this results, It was chosen the Elastic-Net Regression algorithm to make the predictions in the validation dataset

In [28]:
mae = [mae_lasso, mae_ridge, mae_elasnet]
algorithms = ['Lasso', 'Ridge', 'Elastic-Net']

model_selec = pd.DataFrame({'Models': algorithms, 'MAE': mae})
model_selec

Unnamed: 0,Models,MAE
0,Lasso,5.268114
1,Ridge,5.268114
2,Elastic-Net,5.26811


# 6. Predictions of True Temperature in validation dataset

In [29]:
tru_temp_col = 'TRUE_TEMP'
val_data[tru_temp_col] = elasnet_model_final.predict(val_data['SSTVD'].values.reshape(-1,1))

In [30]:
val_data.head()

Unnamed: 0,UWI,SSTVD,X,Y,TRUE_TEMP
0,42021301990000,6604.0,3014593.0,139785.362353,200.562562
1,42289309440000,9501.3,3399516.0,585955.11173,250.375604
2,42177314700000,7443.5,2937857.0,-29223.831082,214.996017
3,42127311560000,7767.21,2264634.0,-384375.035838,220.561537
4,42127322920000,5503.0,2101962.0,-436304.200645,181.63316


In [31]:
val_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79 entries, 0 to 78
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   UWI        79 non-null     int64  
 1   SSTVD      79 non-null     float64
 2   X          79 non-null     float64
 3   Y          79 non-null     float64
 4   TRUE_TEMP  79 non-null     float64
dtypes: float64(4), int64(1)
memory usage: 3.2 KB


## 6.1. Merge dataset used for training with validation dataset

In [32]:
# Group by well the dataset used for training
egb_data = training_data.groupby('UWI')['TRUE_TEMP'].mean().reset_index()

In [33]:
# call the validation dataset 
val_data = val_data[['UWI', 'TRUE_TEMP']]

In [34]:
# Concat both datasets
egb_final = egb_data.append(val_data)

In [35]:
display(egb_final.head())
egb_final.info()

Unnamed: 0,UWI,TRUE_TEMP
0,42013301410000,206.171133
1,42013301930000,218.8496
2,42013302760000,179.589149
3,42013305480000,192.718864
4,42013310190000,219.685061


<class 'pandas.core.frame.DataFrame'>
Int64Index: 322 entries, 0 to 78
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   UWI        322 non-null    int64  
 1   TRUE_TEMP  322 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 7.5 KB


In [36]:
len(egb_final.UWI.unique())

322

In [38]:
file_name = 'ttpredict_egb.csv'
val_data.to_csv(ROOT_DIR / 'gtx/machine_learning' / file_name, index=False)