In [1]:
!pip install catboost
!pip install scikit-learn --upgrade

Collecting catboost
  Downloading catboost-0.26.1-cp37-none-manylinux1_x86_64.whl (67.4 MB)
[K     |████████████████████████████████| 67.4 MB 41 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26.1
Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.4 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.2 threadpoolctl-2.2.0


In [2]:
# If you have installation questions, please reach out

import pandas as pd # data storage
import numpy as np  # math and stuff

import catboost as cats # graident boosting 
from catboost import CatBoostRegressor, Pool

import seaborn as sns
import scipy.stats as stats

import datetime
import sklearn  
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score, KFold, train_test_split

from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import accuracy_score, max_error, mean_squared_error, median_absolute_error

import matplotlib.pyplot as plt # plotting utility

# dataframes

In [3]:
df = pd.read_csv('drive/My Drive/1_lewis_research/core_to_wl_merge/OS0_Merged_dataset_imputed_08_23_2021.csv')

df2 = pd.read_csv('drive/My Drive/1_lewis_research/core_to_wl_merge/OS1_Merged_dataset_imputed_08_23_2021.csv')

df3 = pd.read_csv('drive/My Drive/1_lewis_research/core_to_wl_merge/OS2_Merged_dataset_imputed_08_23_2021.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'LiveTime2','ScanTime2', 'LiveTime1','ScanTime1',
              'ref_num', 'API', 'well_name', 'sample_num' ], axis=1)

In [5]:
df = df[df.perm_klink_md >= 0]

df = df[df.USGS_ID != 'E997'] # removing E997

In [6]:
dataset = df[[
       'CAL', 'GR', 'DT', 'SP', 'DENS', 'PE',
       'RESD', 'PHIN', 'PHID', 
       'GR_smooth', 
       'PE_smooth',
       'perm_klink_md'
]]

dataset.replace('NaN',np.nan, regex=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [7]:
X = dataset[[ 'CAL', 'GR', 'DT', 'SP', 'DENS', 'PE',
        'RESD', 'PHIN', 'PHID', 
        'GR_smooth', 
        'PE_smooth']]

Y = dataset[['perm_klink_md']]

Y_array = np.array(Y.values)

In [8]:
seed = 42 # random seed is only used if you want to compare exact answers with friends 
test_size = 0.25 # how much data you want to withold, .15 - 0.3 is a good starting point

X_train, X_test, y_train, y_test = train_test_split(X.values, Y_array, test_size=test_size)

In [9]:
def catboost_perm(X_train, X_test, y_train, y_test, export_name='TEST.csv', max_iter = 200):
  model = CatBoostRegressor(objective='RMSE',
                            task_type='GPU',
                            iterations=max_iter)

  model.fit(X_train, y_train, verbose=max_iter )

  preds = model.predict(X_test)

  rmse = mean_squared_error(y_test, preds, squared=False)
  print("Root Mean Squared Error: %f" % (rmse))
  max = max_error(y_test, preds)
  print("Max Error: %f" % (max))
  MAE = median_absolute_error(y_test, preds)
  print("Median Abs Error: %f" % (MAE))

  grid = {'learning_rate': [ 0.05, 0.1, 0.2, 0.3],
        'depth': [ 4, 6, 8, 10],
        'l2_leaf_reg': [ 3, 4, 5, 6, 7, 8]}

  model_grid = CatBoostRegressor(objective='RMSE', 
                                 iterations=max_iter, 
                                 verbose=False)

  # Grid Search
  grid_search_result = model_grid.grid_search(grid, 
                                            X=X_train, 
                                            y=y_train, 
                                            cv=5,
                                            verbose=False)
  
  model2 = CatBoostRegressor(objective='RMSE',
                           depth=grid_search_result['params']['depth'],
                           l2_leaf_reg=grid_search_result['params']['l2_leaf_reg'],
                           learning_rate=grid_search_result['params']['learning_rate'],
                           iterations=max_iter)

  model2.fit(X_train, y_train, verbose=500 )

  preds2 = model2.predict(X_test)

  rmse2 = mean_squared_error(y_test, preds2, squared=False)
  print("Root Mean Squared Error: %f" % (rmse2))
  max2 = max_error(y_test, preds2)
  print("Max Error: %f" % (max2))
  MAE2= median_absolute_error(y_test, preds2)
  print("Median Abs Error: %f" % (MAE2))

  x = datetime.datetime.now()

  d = {'target': [Y.columns.values, Y.columns.values],
     'MSE': [rmse, rmse2],
     'MAE': [MAE, MAE2],
     'MaxError': [max, max2], 
     'iter':[max_iter, max_iter],
     'day': [x.day, x.day], 
     'month':[x.month, x.month], 
     'year':[x.year, x.year],
     'model':['catboost', 'catboost'],
     'version':[cats.__version__, cats.__version__ ]}
 
  filepath = 'drive/My Drive/1_lewis_research/analysis/experiments/catboost/catboost_results/'

  results = pd.DataFrame(data=d)

  results.to_csv(filepath+export_name)

  return results

# iterations

In [10]:
iter = 150

# No offset

In [11]:
catboost_perm(X_train, X_test, y_train, y_test,'OS0_perm_cat.csv', iter)

Learning rate set to 0.192011
0:	learn: 0.0346782	total: 4.98ms	remaining: 742ms
149:	learn: 0.0038070	total: 566ms	remaining: 0us
Root Mean Squared Error: 0.016058
Max Error: 0.080611
Median Abs Error: 0.005750

bestTest = 0.01949380084
bestIteration = 23


bestTest = 0.01891975091
bestIteration = 15


bestTest = 0.01796552668
bestIteration = 7


bestTest = 0.01908010584
bestIteration = 3


bestTest = 0.01838775654
bestIteration = 33


bestTest = 0.01902048316
bestIteration = 15


bestTest = 0.01899966643
bestIteration = 6


bestTest = 0.0194539329
bestIteration = 3


bestTest = 0.01872136888
bestIteration = 33


bestTest = 0.01843946067
bestIteration = 15


bestTest = 0.01780568003
bestIteration = 6


bestTest = 0.01937367741
bestIteration = 4


bestTest = 0.01863224727
bestIteration = 33


bestTest = 0.0180839583
bestIteration = 17


bestTest = 0.01833154908
bestIteration = 7


bestTest = 0.01910334733
bestIteration = 4


bestTest = 0.01856629272
bestIteration = 32


bestTest = 0.01

Unnamed: 0,target,MSE,MAE,MaxError,iter,day,month,year,model,version
0,[perm_klink_md],0.016058,0.00575,0.080611,150,23,8,2021,catboost,0.26.1
1,[perm_klink_md],0.017241,0.006482,0.08911,150,23,8,2021,catboost,0.26.1


# offset 1

In [12]:
df2 = df2.drop(['Unnamed: 0', 'Unnamed: 0.1', 'LiveTime2','ScanTime2', 'LiveTime1','ScanTime1',
              'ref_num', 'API', 'well_name', 'sample_num' ], axis=1)

df2 = df2[df2.perm_klink_md >= 0]
df2 = df2[df2.USGS_ID != 'E997'] # removing E997

dataset2 = df2[[
       'CAL', 'GR', 'DT', 'SP', 'DENS', 'PE',
       'RESD', 'PHIN', 'PHID', 
       'GR_smooth', 
       'PE_smooth',
       'perm_klink_md']]

# Features we will use for prediction
X2 = dataset2[['CAL', 'GR', 'DT', 'SP', 'DENS', 'PE',
        'RESD', 'PHIN', 'PHID', 
        'GR_smooth', 
        'PE_smooth']]

# What we are trying to predict
Y = dataset2[['perm_klink_md']]

Y_array2 = np.array(Y.values)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2.values, Y_array2, test_size=test_size)

In [13]:
catboost_perm(X_train2, X_test2, y_train2, y_test2,'OS1_perm_cat.csv', iter)

Learning rate set to 0.192011
0:	learn: 0.0347419	total: 6.35ms	remaining: 946ms
149:	learn: 0.0033915	total: 575ms	remaining: 0us
Root Mean Squared Error: 0.018993
Max Error: 0.085328
Median Abs Error: 0.006580

bestTest = 0.04455490378
bestIteration = 83


bestTest = 0.04452537751
bestIteration = 58


bestTest = 0.0446464301
bestIteration = 16


bestTest = 0.04445915409
bestIteration = 13


bestTest = 0.04451513646
bestIteration = 109


bestTest = 0.04469832477
bestIteration = 67


bestTest = 0.04488979924
bestIteration = 14


bestTest = 0.04455066504
bestIteration = 17


bestTest = 0.04484851072
bestIteration = 128


bestTest = 0.04510518283
bestIteration = 51


bestTest = 0.04475550717
bestIteration = 35


bestTest = 0.04470923188
bestIteration = 12


bestTest = 0.04477280116
bestIteration = 125


bestTest = 0.04484034237
bestIteration = 46


bestTest = 0.04465372877
bestIteration = 30


bestTest = 0.04478078671
bestIteration = 20


bestTest = 0.04488907708
bestIteration = 128


be

Unnamed: 0,target,MSE,MAE,MaxError,iter,day,month,year,model,version
0,[perm_klink_md],0.018993,0.00658,0.085328,150,23,8,2021,catboost,0.26.1
1,[perm_klink_md],0.043276,0.006092,0.275779,150,23,8,2021,catboost,0.26.1


# Offset 2

In [14]:
df3 = df3.drop(['Unnamed: 0', 'Unnamed: 0.1', 'LiveTime2','ScanTime2', 'LiveTime1','ScanTime1',
              'ref_num', 'API', 'well_name', 'sample_num' ], axis=1)

df3 = df3[df3.perm_klink_md >= 0]
df3 = df3[df3.USGS_ID != 'E997'] # removing E997

dataset3 = df3[[
       'CAL', 'GR', 'DT', 'SP', 'DENS', 'PE',
       'RESD', 'PHIN', 'PHID', 
       'GR_smooth', 
       'PE_smooth',
       'perm_klink_md']]

# Features we will use for prediction
X3 = dataset3[['CAL', 'GR', 'DT', 'SP', 'DENS', 'PE',
        'RESD', 'PHIN', 'PHID', 
        'GR_smooth', 
        'PE_smooth']]

# What we are trying to predict
Y3 = dataset3[['perm_klink_md']]

Y_array3 = np.array(Y3.values)

X_train3, X_test3, y_train3, y_test3 = train_test_split(X3.values, Y_array3, test_size=test_size)

In [15]:
catboost_perm(X_train3, X_test3, y_train3, y_test3,'OS2_perm_cat.csv', iter)

Learning rate set to 0.192011
0:	learn: 0.0298528	total: 4.69ms	remaining: 699ms
149:	learn: 0.0033587	total: 554ms	remaining: 0us
Root Mean Squared Error: 0.037968
Max Error: 0.331121
Median Abs Error: 0.006144

bestTest = 0.01682089323
bestIteration = 149


bestTest = 0.01661863343
bestIteration = 107


bestTest = 0.01675459669
bestIteration = 83


bestTest = 0.01609476829
bestIteration = 33


bestTest = 0.01687389013
bestIteration = 149


bestTest = 0.01630069945
bestIteration = 147


bestTest = 0.01713413701
bestIteration = 103


bestTest = 0.01733399284
bestIteration = 49


bestTest = 0.01681223547
bestIteration = 149


bestTest = 0.01666818474
bestIteration = 148


bestTest = 0.01638597165
bestIteration = 116


bestTest = 0.01709232762
bestIteration = 23


bestTest = 0.01689739514
bestIteration = 149


bestTest = 0.01642107772
bestIteration = 147


bestTest = 0.01627162146
bestIteration = 86


bestTest = 0.01681750443
bestIteration = 51


bestTest = 0.01733398797
bestIteration = 

Unnamed: 0,target,MSE,MAE,MaxError,iter,day,month,year,model,version
0,[perm_klink_md],0.037968,0.006144,0.331121,150,23,8,2021,catboost,0.26.1
1,[perm_klink_md],0.035838,0.007327,0.316022,150,23,8,2021,catboost,0.26.1
