In [None]:
#Build a data-driven model of Irreducible Water Saturation of Core Samples

In [4]:
#Task 1: Load Data

#Use Pandas inbuilt function to load the Core_Swi_SM.xlsx file.

import numpy as np
import pandas as pd
cord=pd.read_excel('Core_Swi_SM.xlsx',index_col= 0 )
cord.head()

Unnamed: 0_level_0,Porosity,TOC,Quartz,Calcite,Swirr
Depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2773.88,6.78,4.3,40.1,0.0,6.99958
2776.0,6.17,4.4,50.5,6.5,9.163555
2778.13,4.99,3.5,42.6,2.6,5.775995
2780.25,6.1,0.4,4.8,69.7,11.7735
2782.48,5.72,4.1,38.5,0.6,7.09608


In [6]:
# convert dataframe to array 
cor=cord.values
cor.shape

(167, 5)

In [8]:
# store features in X
X = cor[:,:4] 

In [10]:
# store target in y
y = cor[:,4]

In [13]:
# number of samples
X.shape[0] 

167

In [15]:
# number of features
X.shape[1]

4

In [17]:
#Perform train-test split. Use 80% of data as the training set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4, train_size=0.8)

In [19]:
## isolation forest does not need scaling. LOF and KDE need scaling.

from sklearn.ensemble import IsolationForest
out = IsolationForest(contamination = 0.03, max_samples = 0.8, max_features=1.0)
out.fit(X_train)
in_out=out.predict(X_train)
in_out

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [21]:
X_train_i = X_train[in_out==1] ## inlier training feature set
y_train_i = y_train[in_out==1] ## inlier training target set

In [23]:
X_test_i =X_test[out.predict(X_test)==1] ## inlier training target set
y_test_i =y_test[out.predict(X_test)==1] ## inlier testing target set

In [24]:
X_train[out.predict(X_train)==-1] ## outlier in training data

array([[ 1.42,  0.7 ,  5.4 , 79.5 ],
       [ 2.18,  0.7 ,  0.  , 82.5 ],
       [ 2.05,  0.5 ,  1.9 , 72.2 ],
       [ 6.1 ,  0.4 ,  4.8 , 69.7 ]])

In [25]:
## export outlier detection 
import pickle 

filename = 'Swirr_Outlier.sav'

pickle.dump(out, open(filename, 'wb'))

In [26]:
## train scaler 
from sklearn.preprocessing import StandardScaler
scl= StandardScaler()
scl.fit(X_train_i)
X_train_i_s=scl.transform(X_train_i)
X_test_i_s=scl.transform(X_test_i)

In [27]:
## export scaler

filename = 'Swirr_Scaler.sav'
pickle.dump(scl, open(filename, 'wb'))

In [28]:
#Training the Random Forest Regressor model

from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()

In [29]:
# instantiate the Kfold CrossValidation

from sklearn.model_selection import KFold
cv1 = KFold(n_splits=3, shuffle = True) 

In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
# perform hyperparameter optimization for the Regression Forest Regressor
param_grid1 = {'min_samples_leaf': [2, 5], 'max_depth':[4,7,9], 'max_features':[2,3,4], 'n_estimators':[50, 150,300]}

# what parameters to search depends on the expert knowledge
grid = GridSearchCV(rfr, param_grid=param_grid1, cv=cv1, verbose=3)  # estimator

grid.fit(X_train_i_s, y_train_i)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV 1/3] END max_depth=4, max_features=2, min_samples_leaf=2, n_estimators=50;, score=0.818 total time=   0.0s
[CV 2/3] END max_depth=4, max_features=2, min_samples_leaf=2, n_estimators=50;, score=0.779 total time=   0.0s
[CV 3/3] END max_depth=4, max_features=2, min_samples_leaf=2, n_estimators=50;, score=0.802 total time=   0.0s
[CV 1/3] END max_depth=4, max_features=2, min_samples_leaf=2, n_estimators=150;, score=0.835 total time=   0.1s
[CV 2/3] END max_depth=4, max_features=2, min_samples_leaf=2, n_estimators=150;, score=0.770 total time=   0.1s
[CV 3/3] END max_depth=4, max_features=2, min_samples_leaf=2, n_estimators=150;, score=0.838 total time=   0.1s
[CV 1/3] END max_depth=4, max_features=2, min_samples_leaf=2, n_estimators=300;, score=0.840 total time=   0.3s
[CV 2/3] END max_depth=4, max_features=2, min_samples_leaf=2, n_estimators=300;, score=0.783 total time=   0.3s
[CV 3/3] END max_depth=4, max_features=2, min

GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=True),
             estimator=RandomForestRegressor(),
             param_grid={'max_depth': [4, 7, 9], 'max_features': [2, 3, 4],
                         'min_samples_leaf': [2, 5],
                         'n_estimators': [50, 150, 300]},
             verbose=3)

In [33]:
## Optimal values of hyperparameters --min_samples_leaf, max_depth, max_features
print(grid.best_params_) Texas

{'max_depth': 9, 'max_features': 4, 'min_samples_leaf': 2, 'n_estimators': 300}


In [34]:
grid.best_estimator_ 

RandomForestRegressor(max_depth=9, max_features=4, min_samples_leaf=2,
                      n_estimators=300)

In [35]:
grid.best_score_

0.9152358358651602

In [36]:
# memorization and generalization performances of the optimal RF regressor model

print('memorization performance: ', grid.score(X_train_i_s,y_train_i)) 

print('generalization performance: ', grid.score(X_test_i_s,y_test_i))  

memorization performance:  0.9881828693647532
generalization performance:  0.9812176017372871


In [37]:
#Save the best trained model
import pickle 

filename = 'Swirr_RFR.sav'
pickle.dump(grid.best_estimator_, open(filename, 'wb'))