Import the libraries used for the model.
Using Python 3.6

In [None]:
import lasio  # ver 0.28
import os     
import numpy as np #ver 1.17.0
import pandas as pd # ver 1.0.5
import matplotlib.pyplot as plt #ver 3.2.2
from math import isnan

Import all las files into an object.

In [None]:
# Read every .las file in the training-data folder into one dictionary,
# keyed by file name, so later cells can look wells up by name.
path = 'D:/New folder/ML Challenge Data'  # path to the training data goes here
lasAll = {
    las_name: lasio.read(os.path.join(path, las_name))
    for las_name in os.listdir(path)
    if las_name.endswith(".las")
}

Count the number of times each feature appears in the dataset.

In [None]:
# Tally, for every curve mnemonic, the number of las files it appears in.
FeatureCount = {}
for las in lasAll.values():
    for mnemonic, _curve in las.curves.iteritems():
        # Start at zero the first time a mnemonic is seen, then add one.
        FeatureCount[mnemonic] = FeatureCount.get(mnemonic, 0) + 1

Create a bar graph for the features that appear more than 10 times in the dataset.

In [None]:
# Keep only the mnemonics that occur in more than 10 files and chart
# how many files each of them appears in.
features = [name for name, count in FeatureCount.items() if count > 10]
values = [FeatureCount[name] for name in features]

plt.figure(figsize=(20, 5))
plt.bar(features, values)
plt.xticks(rotation=90)  # vertical labels so the mnemonics stay readable
plt.show()

print(features)  # mnemonics appearing more than 10 times in the dataset

Next, plot histograms of the features from a sample las file to see their distributions. Note that some features are bimodal whereas others are unimodal, and some of the unimodal distributions are skewed. This suggests that the median is a better measure of central tendency here than the mean, so the median will be used later to replace missing values in the feature columns.
The Pearson correlation coefficient is computed among all features. Note that 'DTCO', 'AFCO' and 'NPHI' were found to be the most correlated with 'DTSM'.

In [None]:
# Inspect a single well: per-curve histograms plus the pairwise
# Pearson correlation matrix of its curves.
sample_name = '0f7a4609731a_TGS.las'  # arbitrarily chosen las file
las = lasAll[sample_name]
logs = las.df()

# One histogram per curve to show the shape of each distribution.
logs.hist(figsize=(10, 10))

# Pearson correlation coefficient between every pair of curves.
r = logs.corr(method="pearson")
print(r)

Create x and y data arrays, with x containing the frequently occurring predictor features determined above and y the parameter to be predicted.

In [None]:
# Build the model inputs (x) and target (y) from every well that has a DTSM curve.
xlabels = ['DEPT', 'DTCO', 'GRS', 'RXOZ', 'GRR', 'GRD', 'RHOZ', 'TNPH', 'DPHZ', 'HCAL', 'HDRA', 'PEFZ',
           'SGRDD', 'TNPH_LS', 'RLA3', 'CALD', 'DPHZ_LS', 'CALR', 'SPR', 'AT30', 'AT90', 'AT10', 'AT20', 'HCALR',
           'SPHI_LS', 'TENR', 'TENS', 'AT60', 'ILD', 'ILM', 'SFLU', 'CILD', 'NPHI_LS', 'HCALD', 'DTRP', 'DTRS',
           'RHOB', 'TEND', 'PEF', 'DRHO', 'NPHI', 'DPHI', 'DPHI_LS', 'GR', 'DT', 'DTST', 'LLD', 'LLS', 'DTL',
           'MSFL', 'RLA4', 'RLA5', 'GR_EDTC', 'HSGRD']  # 54 features that appear more than 10 times in the data

ylabels = ['DTSM']
x_parts = []  # one (rows, len(xlabels)) array per usable well
y_parts = []  # one (rows, len(ylabels)) array per usable well

for filename in lasAll:
    well_logs = lasAll[filename].df()  # build the DataFrame once per file
    # Keep the first occurrence of any repeated column name so that the
    # label-based selection below is unambiguous.
    well_logs = well_logs.loc[:, ~well_logs.columns.duplicated()]

    # Wells without the target curve cannot contribute training samples.
    if not all(label in well_logs.columns for label in ylabels):
        continue

    # Select the predictors by name; curves missing from this well become
    # all-NaN columns.
    # NOTE(review): 'DEPT' is requested as a feature, but if lasio's df() uses
    # the depth curve as the index it is not a column and stays NaN here
    # (as it did with the previous row-by-row lookup) - confirm whether depth
    # should be added explicitly.
    predictors = well_logs.reindex(columns=xlabels)

    # Replace missing predictor values with the per-well column median.
    # fillna returns a new frame, so the result has to be kept; previously it
    # was discarded and no imputation took place. Only predictors are imputed:
    # the target must stay untouched so rows without a measured DTSM can be
    # dropped. Columns that are entirely NaN keep their NaNs.
    predictors = predictors.fillna(predictors.median())

    # Keep only the depths where every target value was actually measured.
    has_target = well_logs[ylabels].notna().all(axis=1)
    x_parts.append(predictors.loc[has_target].to_numpy(dtype=float))
    y_parts.append(well_logs.loc[has_target, ylabels].to_numpy(dtype=float))

# Stack the per-well pieces; fall back to empty arrays when no well qualifies.
x = np.vstack(x_parts) if x_parts else np.empty((0, len(xlabels)))
y = np.vstack(y_parts) if y_parts else np.empty((0, len(ylabels)))
print(np.shape(y))  # print shape of the arrays to check dimensions
print(np.shape(x))

In [None]:
from sklearn.tree import DecisionTreeRegressor # scikit-learn ver 0.23.1
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

# Rank the candidate features with a decision-tree regressor.
# Hold out 15% of the samples as a random test set.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=.15)

# Replace any remaining NaNs (e.g. curves absent from a well) with 0.
train_x = np.nan_to_num(train_x)
train_y = np.nan_to_num(train_y)
test_x = np.nan_to_num(test_x)
test_y = np.nan_to_num(test_y)

# Rescale every feature to [0, 1] using the minimum and maximum seen in fit.
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)  # learn the range from the training data only
# Apply the training range to the test data. Re-fitting on the test set
# would scale it differently from the data the model is trained on.
test_x = scaler.transform(test_x)

# random_state fixes the randomness of the estimator.
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(train_x, np.ravel(train_y))  # train the model using the training data

pred = regressor.predict(test_x)  # make predictions on the test data
rmse = np.sqrt(MSE(np.ravel(test_y), pred))  # evaluate the performance on the test data
print(rmse)

# Importance of each input column as reported by the fitted tree.
importance = regressor.feature_importances_
imp = np.array(importance)
sort_index = np.argsort(imp)  # column indices ordered from least to most important
print(sort_index)

# summarize feature importance
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))

In [None]:
# Each file header carries the surface latitude and longitude of its well.
# Plotting them keeps the relative distances between wells, so the scatter
# plot shows how the well positions cluster.
wells = list(lasAll.values())
Latitude = [well.well['SLAT'].value for well in wells]
Longitude = [well.well['SLON'].value for well in wells]

plt.scatter(x=Longitude, y=Latitude)
plt.show()

The top ten most important features found from the decision tree regressor were: 'DTCO','GRS','HCALR','TNPH','TNPH_LS','SPR','HSGRD','AT10','HCALD','DT'

The top three correlated features to DTSM were: DTCO, AFCO and NPHI

Taking the union of these features gives a total of 12 features ('DTCO' appears in both lists). Create another variable x with only these selected features plus latitude and longitude.

In [None]:
# Top 10 important features from the decision tree, united with the 3 features
# most correlated with DTSM = 12 features. The well latitude and longitude are
# appended as two extra columns.
ylabels = ['DTSM']
xlabels = ['DTCO', 'GRS', 'HCALR', 'TNPH', 'TNPH_LS', 'SPR', 'HSGRD', 'AT10', 'HCALD', 'DT', 'AFCO', 'NPHI']
x_parts = []  # one (rows, len(xlabels)) array per usable well
y_parts = []  # one (rows, len(ylabels)) array per usable well
Lat = []      # well latitude, repeated once per kept row
Lon = []      # well longitude, repeated once per kept row

for filename in lasAll:
    well = lasAll[filename]
    well_logs = well.df()  # build the DataFrame once per file
    # Keep the first occurrence of any repeated column name so that the
    # label-based selection below is unambiguous.
    well_logs = well_logs.loc[:, ~well_logs.columns.duplicated()]

    # Wells without the target curve cannot contribute training samples.
    if not all(label in well_logs.columns for label in ylabels):
        continue

    # Select the predictors by name; curves missing from this well become
    # all-NaN columns.
    predictors = well_logs.reindex(columns=xlabels)

    # Replace missing predictor values with the per-well column median.
    # fillna returns a new frame, so the result has to be kept; previously it
    # was discarded and no imputation took place. Only predictors are imputed:
    # the target must stay untouched so rows without a measured DTSM can be
    # dropped. Columns that are entirely NaN keep their NaNs.
    predictors = predictors.fillna(predictors.median())

    # Keep only the depths where every target value was actually measured.
    has_target = well_logs[ylabels].notna().all(axis=1)
    kept_rows = int(has_target.sum())

    x_parts.append(predictors.loc[has_target].to_numpy(dtype=float))  # 12 features
    y_parts.append(well_logs.loc[has_target, ylabels].to_numpy(dtype=float))  # DTSM
    # The position comes from the file header and is the same for every row.
    Lat.extend([well.well['SLAT'].value] * kept_rows)
    Lon.extend([well.well['SLON'].value] * kept_rows)

# Stack the per-well pieces; fall back to empty arrays when no well qualifies.
x = np.vstack(x_parts) if x_parts else np.empty((0, len(xlabels)))
y = np.vstack(y_parts) if y_parts else np.empty((0, len(ylabels)))
x = np.c_[x, np.ravel(Lat), np.ravel(Lon)]  # concatenate latitude and longitude to data array
print(np.shape(y))
print(np.shape(x))

Create training and test data from the subset of features

In [None]:
from sklearn import preprocessing  # scikit-learn ver 0.23.1
from sklearn.model_selection import train_test_split

# Split into random train (70%) and test (30%) sets; random_state makes the
# split repeatable.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)

# Replace any remaining NaNs (e.g. curves absent from a well) with 0.
train_x = np.nan_to_num(train_x)
test_x = np.nan_to_num(test_x)

# Standardize the features: remove the mean and scale to unit variance.
scaler = preprocessing.StandardScaler()
train_x = scaler.fit_transform(train_x)  # learn mean/variance from the training data only
# Apply the training statistics to the test data. Re-fitting on the test set
# would scale it differently from the data the models are trained on.
test_x = scaler.transform(test_x)

Train model#1 XGBoost regressor using the training data.

In [None]:
from xgboost import plot_importance # ver 1.2.0
import xgboost as xg
from matplotlib import pyplot # ver 3.2.2
# Model #1: gradient-boosted ensemble with 100 estimators.
# NOTE(review): feature_selector is documented by XGBoost as a linear-booster
# option - confirm it has any effect with the default booster used here.
xgb_regressor = xg.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.5,
    importance_type='gain',
    feature_selector='shuffle',
)

# Fit on the training split, then predict the held-out split.
xgb_regressor.fit(train_x, train_y)
pred = xgb_regressor.predict(test_x)

# Root-mean-square error on the test split.
test_mse = MSE(test_y, pred)
rmse1 = np.sqrt(test_mse)
print(rmse1)

# Overlay measured and predicted DTSM, sample by sample.
x_ax = range(len(test_y))
for series, series_label in ((test_y, "original"), (pred, "predicted")):
    plt.plot(x_ax, series, label=series_label)
plt.title("DTSM test and predicted data")
plt.legend()
plt.show()

# R-squared of the fitted model on the training split.
score = xgb_regressor.score(train_x, train_y)
print("R-squared:", score)

Train Model#2 Random Forest Regressor using the training data.

In [None]:
from sklearn.ensemble import RandomForestRegressor # scikit-learn ver 0.23.1

# Model #2: random forest with 100 trees.
# Bootstrapping and the squared-error split criterion are the estimator's
# defaults, so they are not passed explicitly. The criterion string 'mse' was
# renamed to 'squared_error' in scikit-learn 1.0 and later removed, so
# relying on the default keeps this cell working on old and new releases.
rf_regressor = RandomForestRegressor(n_estimators=100)

rf_regressor.fit(train_x, np.ravel(train_y))  # train rf regressor using training data
ypred = rf_regressor.predict(test_x)  # make predictions on the test data

score = rf_regressor.score(train_x, train_y)  # R-squared on the training data
print("R-squared:", score)

rmse2 = np.sqrt(MSE(np.ravel(test_y), ypred))  # RMSE on the test data
print("RMSE= ",rmse2)

Train the stacked model using Model#1, Model#2, and Lasso regressor.

In [None]:
from sklearn.ensemble import StackingRegressor # scikit-learn ver 0.23.1
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

# Third base learner: cross-validated lasso regression.
lasso = LassoCV(
    n_alphas=10,
    eps=1e-3,
    max_iter=100,
    precompute=True,
)

# Base learners whose predictions feed the final estimator.
estimators = [
    ('Random Forest', rf_regressor),
    ('Lasso', lasso),
    ('Gradient Boosting', xgb_regressor),
]

# Cross-validated ridge regression combines the base learners' predictions.
meta_learner = RidgeCV()
stacking_regressor = StackingRegressor(estimators=estimators,
                                       final_estimator=meta_learner)
flat_train_y = np.ravel(train_y)
stacking_regressor.fit(train_x, flat_train_y)  # train the stacking regressor

# Predict the held-out split and report the root-mean-square error.
ypred3 = stacking_regressor.predict(test_x)
flat_test_y = np.ravel(test_y)
rmse3 = np.sqrt(MSE(flat_test_y, ypred3))
print("RMSE= ",rmse3)

Make predictions for the Leaderboard files and save them as Excel files

In [None]:
from sklearn import preprocessing

# Predict DTSM for every Leaderboard well and save one Excel file per well.
path = 'D:/New folder/Leaderboard'  # Path containing Leaderboard files
lasDict = {}   # Leaderboard las files keyed by file name

for filename in os.listdir(path):
    if filename.endswith(".las"):
        lasDict[filename] = lasio.read(os.path.join(path, filename))

# Same 12 predictors, in the same order, as used to train the models.
xlabels = ['DTCO', 'GRS', 'HCALR', 'TNPH', 'TNPH_LS', 'SPR', 'HSGRD', 'AT10', 'HCALD', 'DT', 'AFCO', 'NPHI']

# The inputs are standardized with `scaler` from the train/test preparation
# cell instead of fitting a new scaler on each well. A per-well fit maps every
# well to zero mean / unit variance regardless of its actual values, and turns
# the constant latitude/longitude columns into zeros.
# NOTE(review): `scaler` must hold the statistics of the training inputs -
# confirm the preparation cell does not re-fit it on other data.

for filename in lasDict:
    print(filename)
    well = lasDict[filename]
    df2 = well.df()  # build the DataFrame once per file
    print(df2.shape)

    # Keep the first occurrence of any repeated column name so that the
    # label-based selection below is unambiguous.
    unique_logs = df2.loc[:, ~df2.columns.duplicated()]
    # Select the predictors by name; curves missing from this well become
    # all-NaN columns.
    predictors = unique_logs.reindex(columns=xlabels)
    # Replace missing values with the per-well column median. fillna returns
    # a new frame, so the result has to be kept; previously it was discarded.
    predictors = predictors.fillna(predictors.median())
    x1 = predictors.to_numpy(dtype=float)

    # The position comes from the file header and is the same for every row.
    row_count = len(df2)
    Lat = [well.well['SLAT'].value] * row_count
    Lon = [well.well['SLON'].value] * row_count
    x1 = np.c_[x1, np.ravel(Lat), np.ravel(Lon)]  # concatenate latitude and longitude to data array
    print(np.shape(x1))

    x1 = np.nan_to_num(x1)  # replace remaining NaNs (all-NaN columns) with 0
    x1 = scaler.transform(x1)  # scale with the already-fitted scaler
    print(np.shape(x1))

    pred = stacking_regressor.predict(x1)  # apply the stacked model on the Leaderboard data
    print(np.shape(pred))

    # One DTSM value per row, indexed like the well's DataFrame.
    df = pd.DataFrame(pred, index=df2.index, columns=['DTSM'])
    # Output file: same base name as the las file with an .xlsx extension.
    # splitext keeps any dots inside the base name, and no longer overwrites
    # the notebook-level `x` array as the previous `name=x = ...` did.
    name = os.path.splitext(filename)[0] + '.xlsx'
    print(name)
    # index=True writes the DataFrame index next to the predictions;
    # header=True writes the 'DTSM' column name.
    df.to_excel(os.path.join(path, name), index=True, header=True)