This notebook contains the code for making predictions on a test file. <br>
 - *This notebook was made in Google Colab, and requires GPU runtime.* <br>
 - *No training or submission files need to be added to the notebook; the data is downloaded automatically by the notebook itself.* <br>


 

# Pre-requisites
Necessary imports, preprocessed data downloads and initialization

##### Install libraries

In [1]:
%%capture
!pip install catboost
!pip install lasio

##### Import

In [2]:
import numpy as np 
import pandas as pd 

import os
import lasio 
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import xgboost as xg
from catboost import CatBoostRegressor
import random

from sklearn.metrics import mean_squared_error as MSE

import time
import warnings
warnings.filterwarnings('ignore')

In [3]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global RNG with ``seed`` and
    stores the same value in the ``PYTHONHASHSEED`` environment variable.

    Parameters
    ----------
    seed : int
        Value passed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed used for the whole notebook run.
seed = 42
seed_everything(seed=seed)

##### Alias Dictionary

In [4]:
# False dict
alias_dict_F = dict()

alias_dict_F['li_DEC_D'] = ['DPHZ', 'DPHI_LS', 'DPHI', 'DPOR_LS', 'DPO_LS', 'DPHZ_LS'] # from density
alias_dict_F['li_DEC_N'] = ['NPOR_LS', 'ENPH_LS', 'NPHI_LS', 'APLC_LS', 'NPHI_LS_1', 'NPHI', 'SNP'] # from Neutron
alias_dict_F['li_DEC_T'] = ['TNPH_LS_1', 'TNPH_LS', 'TPHI_LS', 'TNPH', 'CNPOR_LS', 'TNPH_LIM'] # thermal
alias_dict_F['li_DEC_S'] = ['SPHI', 'SPHI_LS'] # sonic
alias_dict_F['li_DEC_NOR'] = ['PORS_LS', 'PXND', 'PORZ_LS'] # normal

alias_dict_F['li_USF_S'] = ['DTS', 'DT4ST', 'DT4SR', 'DT1', 'DT4S', 'DTSH', 'DT1T', 'DTTS', 'DTRS', 'DT1R', 'DT2R', 'DTST', 'DT2'] # shear
alias_dict_F['li_USF_P'] = ['DTCO','DT4PR', 'DT4PT', 'DTCO_1', 'DT4P', 'DTTP'] # compressional
alias_dict_F['li_USF_LO'] = ['DTC', 'DT', 'DTLN', 'DTLF', 'DTL'] # long spacing 

alias_dict_F['li_GAPI_D'] = ['GRD1', 'CGRD', 'GRD', 'HCGRD', 'HSGRD'] # density
alias_dict_F['li_GAPI_R'] = ['HCGRR', 'CGRR', 'GRR', 'GRR_R', 'GRR_R2', 'GRR_R1'] # Resistivity
alias_dict_F['li_GAPI_S'] = ['GRS1', 'CGRS', 'ECGRS', 'HSGRS', 'HCGRS'] # Sonic
alias_dict_F['li_GAPI_N'] = ['HGRT', 'GRC', 'GR_EDTC', 'GR', 'ECGREDTC', 'GRT'] # Normal
alias_dict_F['li_GAPI_SPEC'] = ['SGRR', 'HSGR', 'SGRD', 'SGRDD', 'SGR', 'SGRS'] # spectroscopy

alias_dict_F['li_IN_D'] = ['LCALD', 'CALD', 'CALI', 'LCAL', 'LCAL_1'] # density
alias_dict_F['li_IN_R'] = ['CALSR', 'CAL1R', 'CAL1R_R', 'CALSR_R', 'CALR'] # resistivity
alias_dict_F['li_IN_HILT'] = ['HCALS', 'HCAL', 'HCAL_2', 'HCALD', 'HCALR', 'HCAL_1'] # hilt
alias_dict_F['li_IN_HD'] = ['HD2', 'HD1', 'HD', 'HD1_PPC1', 'HDS', 'HD2_PPC2', 'HD2_PPC1'] # hd
alias_dict_F['li_IN_N'] = ['CALX', 'C1', 'C2', 'DCAL'] # normal

alias_dict_F['li_SH'] = ['AE10','AF10','AHT10','AT10','HLLS','HMIN','HMNO','LLS','LLS_R','LLS_R1','LLS_R2','MSFL','MSFL_R','RXO8','SFLU']
alias_dict_F['li_DP'] =['AHT90', 'AF90', 'ILD', 'AT90', 'HRID', 'AE90', 'LLD_R1', 'LLD_R2', 'RT_HRLT', 'LLD', 'HLLD', 'LLD_R']
alias_dict_F['li_MD'] = ['AE20','AE30','AF20','AHT20','AHT30','AT20','AT30','AO30','ILM','TBIT20','ILM','ILM_1','IMBC','IMPH','RXOZ','RXOZ_R','RXO_HRLT']
alias_dict_F['li_RLA'] = ['RLA1','RLA2','RLA3','RLA4','RLA5']

alias_dict_F['li_BE'] = ['PEFL', 'PEFZ', 'PE', 'PEF']

alias_dict_F['li_MV'] = ['SPR']

alias_dict_F['li_LBF_D'] = ['TEND_1', 'TEND1', 'TEND'] # density
alias_dict_F['li_LBF_R'] = ['TENR_R', 'TENR_R', 'TENR_R2', 'TENR', 'TENR_R1'] # resistivity
alias_dict_F['li_LBF_S'] = ['TENS1', 'LTEN', 'TENS'] # sonic
alias_dict_F['li_LBF_N'] = ['TENT', 'TEN'] # normal


alias_F = {"DTSM" :['DTSM'],
         "DEC_D"  : alias_dict_F['li_DEC_D'],
         "DEC_N"  : alias_dict_F['li_DEC_N'],
         "DEC_T"  : alias_dict_F['li_DEC_T'],
         "DEC_S"  : alias_dict_F['li_DEC_S'],
         "DEC_NOR"  : alias_dict_F['li_DEC_NOR'],

         "USF_S"  : alias_dict_F['li_USF_S'],
         "USF_P"  : alias_dict_F['li_USF_P'],
         "USF_LO"  : alias_dict_F['li_USF_LO'],

         "GAPI_D" : alias_dict_F['li_GAPI_D'],
         "GAPI_R" : alias_dict_F['li_GAPI_R'],
         "GAPI_S" : alias_dict_F['li_GAPI_S'],
         "GAPI_N" : alias_dict_F['li_GAPI_N'],
         "GAPI_SPEC" : alias_dict_F['li_GAPI_SPEC'],

         "IN_D"   : alias_dict_F['li_IN_D'],
         "IN_R"   : alias_dict_F['li_IN_R'],
         "IN_HILT"   : alias_dict_F['li_IN_HILT'],
         "IN_N"   : alias_dict_F['li_IN_N'],

         "MV"   : alias_dict_F['li_MV'],

         "BE"   : alias_dict_F['li_BE'],

         "LBF_D"  : alias_dict_F['li_LBF_D'],
         "LBF_R"  : alias_dict_F['li_LBF_R'],
         "LBF_S"  : alias_dict_F['li_LBF_S'],
         "LBF_N"  : alias_dict_F['li_LBF_N'],

         "SH"   : alias_dict_F['li_SH'],
         "DP"   : alias_dict_F['li_DP'], 
         "RL"   : alias_dict_F['li_RLA']
 }
#########################################################

# True dict 
alias_dict_T = dict()
alias_dict_T['li_SH'] = ['AT10', 'AE10', 'AHT10', 'LLS', 'SFL', 'AST10'] # shallow
alias_dict_T['li_MD'] = ['AE20', 'IMBC', 'AF20','AE30', 'ILM_1', 'AST30', 'IMPH', 'AHT30','ILM', 'AHT20', 'TBIT20', 'TBIT30'] # Medium
alias_dict_T['li_DP'] = ['ILD1', 'ILD_1', 'AT90', 'AE90', 'AST90', 'LLD', 'ILD', 'AHT90', 'IDPH'] # deep
alias_dict_T['li_SFL'] = ['SFLA', 'SFLU', 'SFLU', 'SFLU_1', 'RILD']
alias_dict_T['li_RLA'] = ['RLA3', 'RLA1', 'RLA5']

alias_dict_T['li_DEC_NP'] = ['NPHI', 'NPOR', 'NPHI_LS', 'NPOR_LS', 'NPHS', 'NPHI1', 'CNC'] # Neutron
alias_dict_T['li_DEC_DP'] = ['DPHZ2', 'DPHZ_LS', 'DPHI_LS', 'DPHI', 'DPOR', 'DPHI_SLDT', 'DPHZ', 'PORZ', 'DPO_LS', 'DPO'] # Density
alias_dict_T['li_DEC_ST'] = ['SPHI_SS', 'SPHI', 'TNPH_LS', 'SPHI_LS'] # Sonic

alias_dict_T['li_USF_S'] = ['DT1', 'DTSM_SLOW', 'DTSM_FAST', 'DT4S', 'DTRS', 'DTC', 'DTTS', 'DTS', 'DTMD', 'DTST', 'DT', 'DTOT', 'DTM'] # Shear
alias_dict_T['li_USF_P'] = ['DT4P', 'DTRP', 'DTTP', 'DTC', 'DTCO_1', 'DTCO', 'DT'] # Compressional

alias_dict_T['li_GAPI_D'] = ['GRD', 'SGRD', 'ECGRD', 'SGR'] # from Density
alias_dict_T['li_GAPI_R']  = ['ECGRR', 'GRR'] # from resistivity
alias_dict_T['li_GAPI_S'] = ['GRN', 'ECGRS'] # from sonic
alias_dict_T['li_GAPI_N'] = ['GR_EDTC', 'GR_STGC', 'GR']  # normal Gamma API

alias_dict_T['li_IN_D'] = ['C1', 'LCAL', 'CALI', 'CALD', 'LCALD'] # from density
alias_dict_T['li_IN_R'] = ['CALI_SPCS', 'CALR'] # from resistivity
alias_dict_T['li_IN_HILT'] =  ['HCAL2R', 'HCAL_1', 'HCAL_2', 'HCALD', 'HCALR', 'HCALS', 'HCAL'] # from HILT
 
alias_dict_T['li_LB_D'] = ['TEND'] # from density
alias_dict_T['li_LB_R'] = ['TENR'] # from Resistivity
alias_dict_T['li_LB_S'] = ['TENS'] # from Sonic

alias_dict_T['li_BE'] = ['PEF_SLDT', 'PE', 'PEFZ', 'PEF', 'PEFS', 'PEFL']

alias_dict_T['li_MV'] = ['SPR']


alias_T = {"DTSM" :['DTSM'],
           
         "DEC_NP"  : alias_dict_T['li_DEC_NP'],
         "DEC_DP"  : alias_dict_T['li_DEC_DP'],
         "DEC_ST"  : alias_dict_T['li_DEC_ST'],

         "USF_S"  : alias_dict_T['li_USF_S'],
         "USF_P"  : alias_dict_T['li_USF_P'],

         "GAPI_D" : alias_dict_T['li_GAPI_D'],
         "GAPI_R" : alias_dict_T['li_GAPI_R'],
         "GAPI_S" : alias_dict_T['li_GAPI_S'],
         "GAPI_N" : alias_dict_T['li_GAPI_N'],

         "IN_D"   : alias_dict_T['li_IN_D'],
         "IN_R"   : alias_dict_T['li_IN_R'],
         "IN_HILT"  : alias_dict_T['li_IN_HILT'],

         "MV"   : alias_dict_T['li_MV'],

         "BE"   : alias_dict_T['li_BE'],

          "SH"  : alias_dict_T['li_SH'],
          "DP"  : alias_dict_T['li_DP'],
         "SFL"  : alias_dict_T['li_SFL'],

         "LB_D"  : alias_dict_T['li_LB_D'],
         "LB_R"  : alias_dict_T['li_LB_R'],
         "LB_S"  : alias_dict_T['li_LB_S']
 }

# Both region dictionaries in one container, indexed by the integer that
# WellLog.id returns: 0 -> region 'T', 1 -> region 'F'.
alias_dict = np.array( [alias_T, alias_F] )

##### Training Data Download

In [5]:
if not os.path.exists('data'):
  print("Downloading Data...")
  !gdown --id 1syUxq6XDKYLQerZ1rx5DVEvm03ugKW_w
  !unzip -qq 'data.zip' -d 'data'
  !rm 'data.zip'

data = np.array([
                  pd.read_csv('data/dataT.csv'), 
                   pd.read_csv('data/dataF.csv')   ])

Downloading Data...
Downloading...
From: https://drive.google.com/uc?id=1syUxq6XDKYLQerZ1rx5DVEvm03ugKW_w
To: /content/data.zip
63.8MB [00:02, 30.1MB/s]


# Workflow - Main Class
Workflow wrapped in a modular class

In [7]:
class WellLog:
  """DTSM prediction workflow for a single LAS well-log file.

  On construction the file is assigned to region 'T' or 'F' from its
  SLAT / SLON header values, and its curves are mapped onto the alias columns
  of that region. `train` fits XGBoost and/or CatBoost on the same columns of
  the region's training table and predicts DTSM for the file; `predictions`
  returns the result and can save it as an .xlsx file.

  Parameters
  ----------
  filepath : str
      Path of the LAS file.
  alias_dict : sequence of dict
      Alias dictionaries indexed by `id(region)` (0 -> 'T', 1 -> 'F'). Each
      maps an alias column name to candidate mnemonics in priority order.
  data : sequence of pandas.DataFrame
      Training tables, indexed the same way; each must contain a 'DTSM'
      column plus the alias columns.
  valid : bool, default False
      When truthy, 20% of the training rows are held out and a validation
      RMSE is printed; the models are then fitted on the remaining 80%.
  drop_col : list of str, optional
      Alias columns to drop from the test frame after loading.
  thresh : int, default 1
      Added to half the feature count to form the minimum number of non-null
      values a training row must have (see `train`).
  """

  # Centres used to pick the region from the SLAT / SLON header values:
  # first entry -> 'T', second -> 'F'.
  # NOTE(review): presumably fitted on the training wells' coordinates - confirm.
  CLUSTER_CENTERS = ((5.80127095, 3.52103194),
                     (2.27100945, 7.48227453))

  # Ensemble weights applied when both models are trained.
  XGB_WEIGHT = 1
  CAT_WEIGHT = 4

  def __init__(self, filepath, alias_dict, data, valid=False, drop_col=None, thresh=1):

    self.filepath = filepath
    self.test = []
    self.region = 'U'          # placeholder until cluster_region() runs
    self.alias_dict = alias_dict
    self.data = data
    self.valid = valid
    # None instead of a shared mutable default list.
    self.drop_col = [] if drop_col is None else drop_col
    self.preds = []
    self.deploy = False        # True when the file carries its own DTSM curve
    self.ydep = []
    self.depth = []
    self.thresh = thresh

    self.cluster_region()
    self.test_util()

  def cluster_region(self):
    """Set `self.region` to the region whose centre is nearest to the well.

    Reads SLAT and SLON from the LAS well header and compares squared
    Euclidean distances to `CLUSTER_CENTERS`; ties go to 'F'.
    """
    las = lasio.read(self.filepath)
    lat = las.well['SLAT'].value
    lon = las.well['SLON'].value
    dist = [(lat - c_lat) ** 2 + (lon - c_lon) ** 2
            for c_lat, c_lon in self.CLUSTER_CENTERS]
    self.region = 'T' if dist[0] < dist[1] else 'F'
    print('Region : {}'.format(self.region))

  def id(self, x):
    """Return the container index for region `x`: 0 for 'T', 1 otherwise."""
    if x == 'T':
      return 0
    else:
      return 1

  def wellydf(self, df, alias_dict):
    """Build a frame with one column per alias found in `df`.

    For every alias, the first listed mnemonic that is a column of `df` is
    copied under the alias name; aliases with no match are omitted.
    """
    df2 = pd.DataFrame()
    for alias, columns in alias_dict.items():
      for column in columns:
        if column in df.columns.values:
          df2[alias] = df[column]
          break
    return df2

  def test_util(self):
    """Load the LAS curves and prepare the test frame (see `_prepare_test`)."""
    self._prepare_test(lasio.read(self.filepath).df())

  def _prepare_test(self, frame):
    """Map raw LAS curves in `frame` to alias columns and store them in `self.test`.

    Uses the alias dictionary passed to the constructor (not a global).
    If the file contains DTSM ("deploy" file), rows without DTSM are dropped,
    DTSM is moved to `self.ydep`, and `self.depth` is re-aligned to the rows
    that remain so predictions and depths always have the same length.
    """
    self.depth = frame.index
    test = self.wellydf(frame, self.alias_dict[self.id(self.region)])
    test = test.dropna(axis=1, how='all')

    if 'DTSM' in test.columns:
      print("\nDEPLOY FILE\n")
      test = test[test['DTSM'].notna()]
      self.deploy = True
      self.ydep = test['DTSM']
      test = test.drop(columns=['DTSM'])
      # Keep depth in step with the filtered rows.
      self.depth = test.index
    self.test = test

    # info() prints by itself and returns None; wrapping it in print() would
    # add a stray "None" line.
    self.test.info()
    if len(self.drop_col) > 0:
      print("Dropping Selected Columns...")
      self.test_clean(self.drop_col, apply=1)

  def test_clean(self, drop_col=None, apply=0):
    """Show the test frame without `drop_col`; keep the result when `apply` is truthy."""
    temp = self.test.drop(columns=[] if drop_col is None else drop_col)
    temp.info()
    if apply:
      self.test = temp

  def train(self, model='both'):
    """Fit the requested model(s) and store DTSM predictions in `self.preds`.

    Parameters
    ----------
    model : {'xgb', 'cat', 'both'}, default 'both'
        Which regressor(s) to train; matching is case-insensitive. With
        'both', predictions are blended XGB_WEIGHT : CAT_WEIGHT.

    Raises
    ------
    ValueError
        If `model` is not one of the supported names.
    """
    choice = str(model).lower()
    if choice not in ('xgb', 'cat', 'both'):
      raise ValueError(
          "model must be 'xgb', 'cat' or 'both', got {!r}".format(model))

    # Train on exactly the alias columns available in the test file.
    feature_cols = list(self.test.columns)
    xtrain = self.data[self.id(self.region)][feature_cols + ['DTSM']]
    xtrain = xtrain[xtrain['DTSM'].notna()]

    # Minimum non-null values per row (DTSM included, always non-null here).
    thresh = len(feature_cols) // 2 + self.thresh
    xtrain = xtrain.dropna(thresh=thresh)
    ytrain = xtrain['DTSM']
    xtrain = xtrain.drop(columns=['DTSM'])

    print('Training data : {}'.format(len(xtrain)))

    pred = []

    if choice in ('xgb', 'both'):
      print("\nTraining XGB...")
      xgb_r = xg.XGBRegressor(objective='reg:squarederror',
                              random_state=1,
                              tree_method='gpu_hist',
                              n_estimators=1000,
                              max_depth=8)
      pred.append(self.train_util(xgb_r, xtrain, ytrain))

    if choice in ('cat', 'both'):
      print("\nTraining CAT...")
      cat = CatBoostRegressor(verbose=0, learning_rate=0.01,
                              task_type='GPU',
                              random_state=1,
                              iterations=1500, depth=8)
      pred.append(self.train_util(cat, xtrain, ytrain))

    if len(pred) > 1:
      print("\nEnsembling...")
      total = self.XGB_WEIGHT + self.CAT_WEIGHT
      blended = (self.XGB_WEIGHT * pred[0] + self.CAT_WEIGHT * pred[1]) / total
      # The ensemble score only exists when both models were trained.
      if self.deploy:
        print('Deploy Weighted({}:{}) RMSE :{}'.format(
            self.XGB_WEIGHT, self.CAT_WEIGHT,
            np.sqrt(MSE(self.ydep.values, blended))))
        print(blended)
      self.preds = blended
    else:
      self.preds = pred[0]

  def train_util(self, model, xtrain, ytrain):
    """Fit `model`, print timings/scores, and return its predictions for `self.test`.

    With `self.valid` set, a 20% hold-out (random_state 42) is scored first
    and the model is fitted on the remaining rows only. For deploy files the
    RMSE against the file's own DTSM is printed as well.
    """
    if self.valid:
      xtrain, test_x, ytrain, test_y = train_test_split(xtrain, ytrain,
                                                        test_size=0.2,
                                                        random_state=42)
    started = time.time()
    model.fit(xtrain, ytrain)
    print('Train time :{}'.format(time.time() - started))

    if self.valid:
      started = time.time()
      valid_pred = model.predict(test_x)
      print('Valid time :{}'.format(time.time() - started))
      print('Valid RMSE :{}'.format(np.sqrt(MSE(test_y, valid_pred))))

    pred = model.predict(self.test)

    if self.deploy:
      print('Deploy RMSE :{}'.format(np.sqrt(MSE(self.ydep.values, pred))))
      print(pred)

    return pred

  def predictions(self, filename=''):
    """Return the stored predictions, optionally saving them first.

    With a non-empty `filename`, a Depth / DTSM table is printed (head and
    tail) and written to '<filename>.xlsx'.
    """
    if len(filename) > 0:
      x = pd.DataFrame()
      x['Depth'] = self.depth
      x['DTSM'] = self.preds
      print(x.head())
      print(x.tail())
      x.to_excel(filename + '.xlsx', index=False)

    return self.preds

# Test a Single File
Use the class above to test a single file. <br>
*Add the file to be tested to this notebook first.*

In [None]:
# Create a WellLog object for one LAS file.
#
# Parameters:
#   filepath   : path of the LAS file
#   alias_dict : alias dictionaries, initialized in the pre-requisites section
#   data       : preprocessed training dataframes, initialized in the
#                pre-requisites section
#   valid      : boolean, True to hold out 20% of the training rows and print
#                a validation RMSE, default False
#   drop_col   : list of aliases to drop from the test file, default empty list
#   thresh     : int added to half the feature count to set the minimum number
#                of non-null values a training row must have, default 1

test = WellLog( 'folder-name/0a7822c59487_TGS.las', alias_dict, data  )

In [None]:
# Train the model(s) and predict DTSM for the file.
#
# Parameters:
#   model : 'cat' / 'xgb' / 'both' (pass the name in lowercase), default 'both'

test.train()

In [None]:
'''
Returns predictions

Parameters :
filename : filename for xlsx file, saving the predictions (optional)
'''

# With a non-empty filename, the Depth / DTSM table is written to
# '<filename>.xlsx' before the prediction array is returned.
test.predictions( 'file-name')

# Submission Test Files
Producing results on submission test files

In [8]:
!gdown --id 1-43M2dHo-p7sBMQv0DK2RK8ZC1cNpVUC
!unzip -qq final-data.zip -d 'test-data'
os.remove("final-data.zip")

Downloading...
From: https://drive.google.com/uc?id=1-43M2dHo-p7sBMQv0DK2RK8ZC1cNpVUC
To: /content/final-data.zip
6.83MB [00:00, 25.9MB/s]


In [9]:

# Submission wells predicted with every alias column found in the file.
x = """
63250f7d463b_TGS.las
638f2cc65681_TGS.las
fca03aa6acde_TGS.las
3369b6f8fb6f_TGS.las
8e37531ba266_TGS.las
eed1e9537976_TGS.las
113412eec2a6_TGS.las
ff7845ea074d_TGS.las
34a80ab7a5fa_TGS.las
84c5fb9cc880_TGS.las
7595ba9fb314_TGS.las
302460e3021a_TGS.las
2f96a5f92418_TGS.las
1684cc35f399_TGS.las
""".split()

# For each well: load it, train both models, and save '<well>.xlsx'.
for filename in tqdm(x):
    print('___________________________________________________________________',
          filename, sep='\n')
    test = WellLog('test-data/{}'.format(filename), alias_dict, data, valid=0)
    test.train()
    # Strip the '.las' extension to get the output file stem.
    test.predictions(filename[:-len('.las')])

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

___________________________________________________________________
63250f7d463b_TGS.las
Region : F
<class 'pandas.core.frame.DataFrame'>
Float64Index: 13883 entries, 5241.0 to 12182.0
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   DEC_D   13882 non-null  float64
 1   DEC_N   13871 non-null  float64
 2   USF_S   13882 non-null  float64
 3   USF_P   13882 non-null  float64
 4   IN_D    13882 non-null  float64
 5   MV      13883 non-null  float64
 6   BE      13433 non-null  float64
 7   SH      13883 non-null  float64
 8   DP      13794 non-null  float64
dtypes: float64(9)
memory usage: 1.1 MB
None
Training data : 515065

Training XGB...
Train time :13.865854263305664

Training CAT...
Train time :69.99051809310913

Ensembling...
    Depth        DTSM
0  5241.0  113.033036
1  5241.5  123.132374
2  5242.0  116.541283
3  5242.5  115.188701
4  5243.0  121.831093
         Depth        DTSM
13878  12180.0  138.883628
13879  12180

In [10]:
# Submission wells that need specific alias columns dropped before training:
# file name -> list of aliases to drop (an empty list keeps every column).
temp = {
    '00d02be79f49_TGS.las': [],
    'ae16a9f64878_TGS.las': ['IN_D', 'DP', 'SFL'],
    '94c1f5cae85c_TGS.las': ['BE'],
    'ed48bda2217f_TGS.las': [],
    '20372701d5e2_TGS.las': ['DEC_DP', 'IN_HILT', 'BE', 'DP'],
    '0a7822c59487_TGS.las': ['DEC_T', 'IN_HILT', 'BE', 'SH', 'DP'],
}

# Files are processed in the order they are listed in `temp`.
x = list(temp)

# For each well: load it, drop its listed columns, train, save '<well>.xlsx'.
for filename in tqdm(x):
    print('___________________________________________________________________',
          filename, sep='\n')
    test = WellLog('test-data/{}'.format(filename), alias_dict, data,
                   valid=0, drop_col=temp[filename])
    test.train()
    # Strip the '.las' extension to get the output file stem.
    test.predictions(filename[:-len('.las')])

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

___________________________________________________________________
00d02be79f49_TGS.las
Region : T
<class 'pandas.core.frame.DataFrame'>
Float64Index: 1307 entries, 9912.0 to 10565.0
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   DEC_NP  1293 non-null   float64
 1   DEC_DP  1307 non-null   float64
 2   USF_P   575 non-null    float64
 3   GAPI_R  1275 non-null   float64
 4   IN_D    1307 non-null   float64
 5   MV      1307 non-null   float64
 6   DP      1307 non-null   float64
dtypes: float64(7)
memory usage: 81.7 KB
None
Training data : 631810

Training XGB...
Train time :14.036465406417847

Training CAT...
Train time :11.312457084655762

Ensembling...
    Depth        DTSM
0  9912.0  100.066396
1  9912.5   99.665523
2  9913.0  101.298541
3  9913.5  102.906530
4  9914.0  101.961048
        Depth        DTSM
1302  10563.0  112.102998
1303  10563.5  112.222685
1304  10564.0  112.064954
1305  10564.5  111.724287
1306  105

In [11]:
import glob
# Every .xlsx file in the working directory, which is where
# WellLog.predictions writes its '<filename>.xlsx' outputs.
pred_files = glob.glob('*.xlsx')

In [12]:
import shutil

# Create the output folder from Python so re-running the cell does not fail
# when the folder already exists (which `!mkdir` would).
os.makedirs('predictions2', exist_ok=True)

# Move each prediction workbook into the folder, then zip the folder.
for pred_file in pred_files:
  shutil.move(pred_file, os.path.join('predictions2', pred_file))

# Last expression: displays the path of the created archive.
shutil.make_archive('predictions2', 'zip', 'predictions2')

'/content/predictions2.zip'