# 1. Import Python Libraries

In [3]:
import pandas as pd
from base import ROOT_DIR
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from pandas_profiling import ProfileReport
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from zipfile import ZipFile

In [4]:
%config Completer.use_jedi = False

## 1.1. Set up the path

In [5]:
# set paths of different subdirectories
dir_ml = ROOT_DIR / 'gtx/ml_basins'

# 2. Load the training and validation datasets

## 2.1. Load the dataset used for training the ML algorithms

In [6]:
training_data = pd.read_csv(dir_ml / 'dataset_training.csv')
training_data.head()

Unnamed: 0,UWI,BHT,TrueTemp,SSTVD(m),Field,X,Y,Set
0,42013301410000,61.666667,141.555556,3725.418,Eaglebine,2597996.0,-350331.932781,Training
1,42013301930000,107.222222,132.778,3168.7008,Eaglebine,2681234.0,-316108.676424,Training
2,42013302760000,65.555556,89.161111,1747.4184,Eaglebine,2611186.0,-239228.351288,Training
3,42013305480000,76.666667,107.961111,2379.2688,Eaglebine,2651948.0,-249935.737703,Training
4,42013310190000,98.888889,133.722222,3203.289504,Eaglebine,2697926.0,-301994.813739,Training


In [7]:
training_data.drop(columns=['Field', 'Set'], axis=1, inplace=True)

In [8]:
training_data.columns

Index(['UWI', 'BHT', 'TrueTemp', 'SSTVD(m)', 'X', 'Y'], dtype='object')

In [9]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   UWI       615 non-null    object 
 1   BHT       615 non-null    float64
 2   TrueTemp  615 non-null    float64
 3   SSTVD(m)  615 non-null    float64
 4   X         615 non-null    float64
 5   Y         615 non-null    float64
dtypes: float64(5), object(1)
memory usage: 29.0+ KB


## 2.2 Load the Validation dataset

In [10]:
val_data = pd.read_csv(dir_ml / 'dataset_val.csv')
val_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   UWI       206 non-null    object 
 1   BHT       206 non-null    float64
 2   SSTVD(m)  206 non-null    float64
 3   Field     206 non-null    object 
 4   X         206 non-null    float64
 5   Y         206 non-null    float64
 6   Set       206 non-null    object 
dtypes: float64(4), object(3)
memory usage: 11.4+ KB


In [11]:
val_data.drop(columns=['Field', 'Set'], axis=1, inplace=True)
val_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   UWI       206 non-null    object 
 1   BHT       206 non-null    float64
 2   SSTVD(m)  206 non-null    float64
 3   X         206 non-null    float64
 4   Y         206 non-null    float64
dtypes: float64(4), object(1)
memory usage: 8.2+ KB


## 2.3 EDA using Pandas Profiling

In [12]:
pr = ProfileReport(training_data, explorative=True)
pr

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=19.0), HTML(value='')))






HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))



# 3. Call the Machine Learning Algorithms for Regressions

In [13]:
ridge = Ridge()
lasso = Lasso()
elas_net = ElasticNet()

## 3.1. Define the conditions prior the training of algorithms

In [14]:
# Define the feature variables and target variable
X = training_data[['SSTVD(m)', 'BHT']].values
y = training_data[['TrueTemp']].values

In [15]:
# Split the data in training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [16]:
# Define the alpha parameters to tune
lasso_params = {'alpha': [0.02, 0.024, 0.025, 0.026, 0.03]}
ridge_params = {'alpha': [0.1, 1.0, 10.0]}
elas_net_params = {'alpha': [0.01, 0.1, 0, 1, 10, 100]}

# 4. Train the Algorithms using Hiperparameters Tuning and Cross-Validation

In [17]:
# Function to train each algorithm

def training_model(model, param_grid, cv):
    model = GridSearchCV(model, param_grid=param_grid, cv=cv)
    model.fit(X_train, y_train)
    return model

## 4.1. Lasso Regression

In [18]:
# Lasso Regression
laso_model = training_model(lasso, lasso_params, 5)
laso_model.best_params_

{'alpha': 0.02}

In [19]:
laso_model_final = laso_model.best_estimator_.fit(X_train, y_train)

In [20]:
# Prediction on test set
y_pred_lasso = laso_model_final.predict(X_test)

In [21]:
# R2 of model
print(laso_model_final.score(X_train, y_train))
print(laso_model_final.score(X_test, y_test))

0.8990885374104806
0.9155912085402121


In [22]:
# Mean Abosulte Error
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
print(mae_lasso)

5.237706757095569


## 4.2. Ridge Regression

In [23]:
ridge_model = training_model(ridge, ridge_params, 5)
ridge_model.best_params_

{'alpha': 10.0}

In [24]:
# Train with the best model
ridge_model_final = ridge_model.best_estimator_.fit(X_train, y_train)

In [25]:
# Prediction on test set
y_pred_ridge = ridge_model_final.predict(X_test)

In [26]:
# R2 of model
print(ridge_model_final.score(X_train, y_train))
print(ridge_model_final.score(X_test, y_test))

0.8990885404789228
0.9155884689483224


In [27]:
# Mean Abosulte Error
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
print(mae_ridge)

5.237863607848301


## 4.3. Elastic-Net Regression

In [28]:
elast_net_model = training_model(elas_net, elas_net_params, 5)
elast_net_model.best_params_

  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  self.best_estimator_.fit(X, y, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'alpha': 0}

In [29]:
# Train with the best model
elasnet_model_final = elast_net_model.best_estimator_.fit(X_train, y_train)

  elasnet_model_final = elast_net_model.best_estimator_.fit(X_train, y_train)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [30]:
# Prediction on test set
y_pred_elasnet = elasnet_model_final.predict(X_test)

In [31]:
# R2 of model
print(elasnet_model_final.score(X_train, y_train))
print(elasnet_model_final.score(X_test, y_test))

0.8990885405193002
0.9155881106540239


In [32]:
# Mean Abosulte Error
mae_elasnet = mean_absolute_error(y_test, y_pred_elasnet)
print(mae_elasnet)

5.237883546291775


# 5. Select Best Model

Regarding this results, It was chosen the Elastic-Net Regression algorithm to make the predictions in the validation dataset

In [33]:
mae = [mae_lasso, mae_ridge, mae_elasnet]
algorithms = ['Lasso', 'Ridge', 'Elastic-Net']

model_selec = pd.DataFrame({'Models': algorithms, 'MAE': mae})
model_selec

Unnamed: 0,Models,MAE
0,Lasso,5.237707
1,Ridge,5.237864
2,Elastic-Net,5.237884


# 6. Predictions of True Temperature in validation dataset

In [34]:
tru_temp_col = 'TrueTemp'
val_data[tru_temp_col] = elasnet_model_final.predict(val_data[['SSTVD(m)', 'BHT']])

In [35]:
val_data.head()

Unnamed: 0,UWI,BHT,SSTVD(m),X,Y,TrueTemp
0,42013339770000,113.888889,3211.068,2685272.0,-310794.173099,132.383073
1,42013340740000,110.0,3151.3272,2697188.0,-303523.525076,130.489564
2,42013342130000,77.222222,2168.652,2544150.0,-288509.453453,102.695063
3,42021301990000,73.333333,2008.0224,3014593.0,139785.362353,98.309595
4,42041303170000,90.0,2553.09624,3275049.0,464826.10715,113.563984


In [36]:
val_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   UWI       206 non-null    object 
 1   BHT       206 non-null    float64
 2   SSTVD(m)  206 non-null    float64
 3   X         206 non-null    float64
 4   Y         206 non-null    float64
 5   TrueTemp  206 non-null    float64
dtypes: float64(5), object(1)
memory usage: 9.8+ KB


## 6.1. Merge dataset used for training with validation dataset

In [37]:
# call the validation dataset 
val_data = val_data[['UWI', 'TrueTemp']]

In [38]:
# Concat both datasets
data_final = training_data[['UWI', 'TrueTemp']].append(val_data)

In [39]:
display(data_final.head())
data_final.info()

Unnamed: 0,UWI,TrueTemp
0,42013301410000,141.555556
1,42013301930000,132.778
2,42013302760000,89.161111
3,42013305480000,107.961111
4,42013310190000,133.722222


<class 'pandas.core.frame.DataFrame'>
Int64Index: 821 entries, 0 to 205
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   UWI       821 non-null    object 
 1   TrueTemp  821 non-null    float64
dtypes: float64(1), object(1)
memory usage: 19.2+ KB


In [40]:
len(data_final.UWI.unique())

821

In [95]:
file_name = 'prediction_tt.csv'
val_data.to_csv(ROOT_DIR / 'gtx'/ file_name, index=False)

In [44]:
#ZipFile(ROOT_DIR / "gtx/predictionstt.zip", mode="w").write(r"C:\Users\fredd\PycharmProjects\gtx-2021\gtx\prediction_tt.csv")