# Model Building Pipe Line

In [1]:
import pandas as pd
import numpy as np
import warnings; warnings.simplefilter('ignore')

import time

In this notebook, we will be establishing the model pipe line that we want to use in order to create our final model to evaluate the problem for the 2024 Hackathon.

Following will be the path to the data that we want to use. The following are the assumptions about the data before submitting it to the model building:

1. Data has been cleaned and contains no NaNs
2. Data has been normalized per feature
3. Categorical data has been correctly dealt with (OHE)
4. Target variable is labeled as: "AVG PUMP DIFFERENCE"

In [2]:
# Put in the path to the data here
path_to_data = "Final_dataset.csv"

raw_df = pd.read_csv(path_to_data)
raw_df.head()

Unnamed: 0.1,Unnamed: 0,Well ID,Avg Pump Difference,Lateral Length,TVD,DELAYED,PARENT_CODEV_1050_WELL_COUNT,PARENT_IN_ZONE_MIN_HYPOT,PARENT_1050_WELL_COUNT,PARENT_3000_AVG_HYPOT_DIST,...,Fluid System_Campbell,Fluid System_Williams,Fluid System_Young,Development Strategy_Coke,Development Strategy_Dr Pepper,Development Strategy_Mountain Dew,Development Strategy_Orange Crush,Development Strategy_Pepsi,Development Strategy_Pibb Extra,Development Strategy_Sprite
0,0,1,-0.93,10300,10415.0,1,2,1195.286743,2,2071.0,...,1,0,0,1,0,0,0,0,0,0
1,1,2,-8.45,10300,10415.0,1,2,1195.286743,2,2071.0,...,1,0,0,1,0,0,0,0,0,0
2,2,3,8.7,11000,8470.0,0,6,2882.384033,0,2492.0,...,1,0,0,0,0,0,1,0,0,0
3,3,4,,11000,10555.0,1,1,1059.568848,0,2378.0,...,1,0,0,1,0,0,0,0,0,0
4,4,6,-4.34,12300,10355.0,1,0,2763.0,0,2464.0,...,1,0,0,1,0,0,0,0,0,0


Now we want to split the training from the testing

In [3]:
test_df = raw_df[raw_df["Avg Pump Difference"].isna()].reset_index(drop = True)
train_df = raw_df[~raw_df["Avg Pump Difference"].isna()].reset_index(drop = True)

## Imports

In [4]:
# Linear Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge, SGDRegressor

# Kernel Ridge
from sklearn.kernel_ridge import KernelRidge

# SVM
from sklearn.svm import SVR

# Nearest Neighbors
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor

# Bayes
from sklearn.naive_bayes import GaussianNB

# Decision Tree
from sklearn.tree import DecisionTreeRegressor

# Ensemble
from sklearn.ensemble import RandomForestRegressor


In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [6]:
all_cols = train_df.columns.tolist()

y = "Avg Pump Difference"

all_cols.remove(y)
x = all_cols

X_train = train_df[x]
Y_train = train_df[[y]]

In [7]:
def training_loop(model, X, Y, num_folds):
    kf = KFold(n_splits=num_folds, shuffle = True)
    
    total = []
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        train_X = X[train_index,:]
        train_Y = Y.iloc[train_index,:]
        
        test_X = X[test_index,:]
        test_Y = Y.iloc[test_index,:]
        
        model.fit(train_X, train_Y)
        
        pred_Y = model.predict(test_X)
        
        cur_accuracy = mean_squared_error(test_Y, pred_Y)
        total.append(cur_accuracy)
        
    return sum(total)/len(total)

In [8]:
models = [
    ("Linear", LinearRegression()),
    ("Ridge - 1", Ridge(solver="svd")),
    ("Ridge - 2 ", Ridge(solver="lsqr")),
    ("Ridge - 3", Ridge(alpha=1.5, solver="svd")),
    ("Ridge - 4", Ridge(alpha=1.5, solver="lsqr")),
    ("Ridge - 5", Ridge(alpha=2, solver="svd")),
    ("Ridge - 6", Ridge(alpha=2, solver="lsqr")),
    ("Ridge - 7", Ridge(alpha=2.5, solver="svd")),
    ("Ridge - 8", Ridge(alpha=2.5, solver="lsqr")),
    ("Lasso - 1", Lasso(warm_start=True)),
    ("Lasso - 2", Lasso(warm_start=False)),
    ("Lasso - 3", Lasso(alpha=1.5, warm_start=True)),
    ("Lasso - 4", Lasso(alpha=1.5, warm_start=False)),
    ("Lasso - 5", Lasso(alpha=2, warm_start=True)),
    ("Lasso - 6", Lasso(alpha=2, warm_start=False)),
    ("BayesianRidge", BayesianRidge()),
    ("Kernel Ridge", KernelRidge()),
    ("SVR - 1", SVR(degree=3)),
    ("SVR - 2", SVR(degree=4)),
    ("SVR - 3", SVR(degree=5)),
    ("SVR - 4", SVR(degree=10)),
    ("SVR - 5", SVR(degree=15)),
    ("SVR - 6", SVR(kernel="poly",degree=3)),
    ("SVR - 7", SVR(kernel="poly",degree=4)),
    ("SVR - 8", SVR(kernel="poly",degree=5)),
    ("SVR - 9", SVR(kernel="poly",degree=10)),
    ("SVR - 10", SVR(kernel="poly",degree=15)),
    ("SVR - 11", SVR(kernel="sigmoid",degree=3)),
    ("SVR - 12", SVR(kernel="sigmoid",degree=4)),
    ("SVR - 13", SVR(kernel="sigmoid",degree=5)),
    ("SVR - 14", SVR(kernel="sigmoid",degree=10)),
    ("SVR - 15", SVR(kernel="sigmoid",degree=15)),
    ("KNN - 1", KNeighborsRegressor(n_neighbors=5)),
    ("KNN - 2", KNeighborsRegressor(n_neighbors=10)),
    ("KNN - 3", KNeighborsRegressor(n_neighbors=15)),
    ("KNN - 4", KNeighborsRegressor(n_neighbors=20)),
    ("KNN - 5", KNeighborsRegressor(n_neighbors=25)),
    ("KNN - 6", KNeighborsRegressor(n_neighbors=30)),
    ("KNN - 7", KNeighborsRegressor(n_neighbors=35)),
    ("DT - 1", DecisionTreeRegressor(max_depth = None)),
    ("DT - 2", DecisionTreeRegressor(max_depth = 5)),
    ("DT - 3", DecisionTreeRegressor(max_depth = 10)),
    ("DT - 4", DecisionTreeRegressor(max_depth = 15)),
    ("DT - 5", DecisionTreeRegressor(max_depth = 20)),
    ("DT - 6", DecisionTreeRegressor(max_depth = 25)),
    ("DT - 7", DecisionTreeRegressor(max_depth = 30)),
    ("RF - 1", RandomForestRegressor(max_depth = None)),
    ("RF - 2", RandomForestRegressor(max_depth = 5)),
    ("RF - 3", RandomForestRegressor(max_depth = 10)),
    ("RF - 4", RandomForestRegressor(max_depth = 15)),
    ("RF - 5", RandomForestRegressor(max_depth = 20)),
    ("RF - 6", RandomForestRegressor(max_depth = 25)),
    ("RF - 7", RandomForestRegressor(max_depth = 30)),
    ("A - RF - 1", RandomForestRegressor(n_estimators=150, max_depth = None)),
    ("A - RF - 2", RandomForestRegressor(n_estimators=150, max_depth = 5)),
    ("A - RF - 3", RandomForestRegressor(n_estimators=150, max_depth = 10)),
    ("A - RF - 4", RandomForestRegressor(n_estimators=150, max_depth = 15)),
    ("A - RF - 5", RandomForestRegressor(n_estimators=150, max_depth = 20)),
    ("A - RF - 6", RandomForestRegressor(n_estimators=150, max_depth = 25)),
    ("A - RF - 7", RandomForestRegressor(n_estimators=150, max_depth = 30)),
    ("B - RF - 1", RandomForestRegressor(n_estimators=50, max_depth = None)),
    ("B - RF - 2", RandomForestRegressor(n_estimators=50, max_depth = 5)),
    ("B - RF - 3", RandomForestRegressor(n_estimators=50, max_depth = 10)),
    ("B - RF - 4", RandomForestRegressor(n_estimators=50, max_depth = 15)),
    ("B - RF - 5", RandomForestRegressor(n_estimators=50, max_depth = 20)),
    ("B - RF - 6", RandomForestRegressor(n_estimators=50, max_depth = 25)),
    ("B - RF - 7", RandomForestRegressor(n_estimators=50, max_depth = 30)),
]

actual_results = []

well_id = X_train["Well ID"]
actual_X = X_train.drop(["Well ID"], axis = 1)

scaler = StandardScaler()
pca = PCA(n_components=0.9)

scaled_X = scaler.fit_transform(actual_X)
pca_X = pca.fit_transform(scaled_X)

for name, m in models: 
    result = training_loop(m, pca_X, Y_train, 10)
    actual_results.append([name, result])
    print("Finished model: %s, RMSE: %.2f" % (name, result))

Finished model: Linear, RMSE: 513.73
Finished model: Ridge - 1, RMSE: 481.50
Finished model: Ridge - 2 , RMSE: 541.36
Finished model: Ridge - 3, RMSE: 440.57
Finished model: Ridge - 4, RMSE: 542.32
Finished model: Ridge - 5, RMSE: 526.62
Finished model: Ridge - 6, RMSE: 469.82
Finished model: Ridge - 7, RMSE: 469.06
Finished model: Ridge - 8, RMSE: 464.21
Finished model: Lasso - 1, RMSE: 372.81
Finished model: Lasso - 2, RMSE: 373.24
Finished model: Lasso - 3, RMSE: 382.07
Finished model: Lasso - 4, RMSE: 385.98
Finished model: Lasso - 5, RMSE: 394.69
Finished model: Lasso - 6, RMSE: 400.30
Finished model: BayesianRidge, RMSE: 352.59
Finished model: Kernel Ridge, RMSE: 4298.39
Finished model: SVR - 1, RMSE: 524.17
Finished model: SVR - 2, RMSE: 522.73
Finished model: SVR - 3, RMSE: 521.69
Finished model: SVR - 4, RMSE: 520.50
Finished model: SVR - 5, RMSE: 526.92
Finished model: SVR - 6, RMSE: 555.92
Finished model: SVR - 7, RMSE: 566.37
Finished model: SVR - 8, RMSE: 565.63
Finished m

## Uncertainty Model

In [10]:
results_df = pd.DataFrame()

results_df["Well ID"] = test_df["Well ID"]

start = time.time()
for i in range(100):
    random_sample = train_df.sample(n=train_df.shape[0], replace=True, random_state=569214)

    scaler = StandardScaler()
    pca = PCA(n_components=0.9)

    cur_y = random_sample[["Avg Pump Difference"]]
    data = random_sample.drop(["Avg Pump Difference", "Well ID"], axis = 1)

    cur_model = RandomForestRegressor(n_estimators=150, max_depth = 30)

    scaled_data = scaler.fit_transform(data)
    pca_data = pca.fit_transform(scaled_data)

    result = training_loop(cur_model, pca_data, cur_y, 10)

    well_id = test_df[["Well ID"]]
    test_data = test_df.drop(["Well ID", "Avg Pump Difference"], axis = 1)

    scaled_test = scaler.transform(test_data)
    pca_test = pca.transform(scaled_test)

    raw_pred = cur_model.predict(pca_test)

    results_df["R%d, GPM" % (i+1)] = raw_pred

    print("Completed", i)
 
end_time = time.time()

#print("diff", (end_time - start_time))

Completed 0
Completed 1
Completed 2
Completed 3
Completed 4
Completed 5
Completed 6
Completed 7
Completed 8
Completed 9
Completed 10
Completed 11
Completed 12
Completed 13
Completed 14
Completed 15
Completed 16
Completed 17
Completed 18
Completed 19
Completed 20
Completed 21
Completed 22
Completed 23
Completed 24
Completed 25
Completed 26
Completed 27
Completed 28
Completed 29
Completed 30
Completed 31
Completed 32
Completed 33
Completed 34
Completed 35
Completed 36
Completed 37
Completed 38
Completed 39
Completed 40
Completed 41
Completed 42
Completed 43
Completed 44
Completed 45
Completed 46
Completed 47
Completed 48
Completed 49
Completed 50
Completed 51
Completed 52
Completed 53
Completed 54
Completed 55
Completed 56
Completed 57
Completed 58
Completed 59
Completed 60
Completed 61
Completed 62
Completed 63
Completed 64
Completed 65
Completed 66
Completed 67
Completed 68
Completed 69
Completed 70
Completed 71
Completed 72
Completed 73
Completed 74
Completed 75
Completed 76
Completed

In [11]:
temp_df = results_df.iloc[:,1:]
results_df["Est Pump Difference, GPM"] = temp_df.mean(axis = 1)

In [12]:
cols = ["Well ID", "Est Pump Difference, GPM"] + results_df.columns.tolist()[1:-2]

results_df = results_df[cols]
results_df

Unnamed: 0,Well ID,"Est Pump Difference, GPM","R1, GPM","R2, GPM","R3, GPM","R4, GPM","R5, GPM","R6, GPM","R7, GPM","R8, GPM",...,"R90, GPM","R91, GPM","R92, GPM","R93, GPM","R94, GPM","R95, GPM","R96, GPM","R97, GPM","R98, GPM","R99, GPM"
0,4,29.459114,31.103733,25.824267,30.022733,32.485667,30.573867,31.666933,27.091133,29.022333,...,29.483267,37.651267,25.1232,30.028,26.396867,29.482333,29.606933,27.625933,32.420733,28.503
1,31,8.934846,7.340267,11.4334,8.965867,8.4698,13.0286,8.0832,10.349267,10.514133,...,10.5946,11.4762,8.4268,9.732133,8.332733,6.617467,9.696067,7.0278,21.481867,9.368
2,42,18.497933,19.101333,19.844,18.08,19.056933,18.2782,15.863,18.3148,21.557133,...,16.535267,20.261867,20.8192,17.994667,20.008333,18.3604,21.782867,16.6594,22.648933,25.1196
3,52,-0.002842,-0.0452,1.569133,-0.6536,0.663667,-0.714733,0.901467,-0.991133,0.178867,...,1.828733,-0.1136,2.278933,-0.7136,1.636267,1.7552,1.5132,-0.4464,0.544867,-0.072933
4,71,36.768454,38.367533,29.831067,42.333867,40.199467,38.482333,37.874867,38.591,36.273133,...,35.174067,41.211267,39.973333,39.0426,33.196333,37.083533,34.493267,40.929533,40.047067,34.608
5,76,8.292659,7.447533,6.474067,6.2276,9.706133,7.156,6.700267,7.862733,8.170067,...,8.429467,7.693867,8.264867,6.801667,15.109267,8.851533,8.247867,7.385467,17.849533,7.160133
6,96,15.392877,13.563267,15.898,13.633933,12.979067,17.4504,14.2404,13.761867,16.6708,...,14.911133,13.7632,14.6178,16.3668,19.082733,16.002333,16.539733,13.060467,15.097867,17.860933
7,131,16.254795,16.088,12.7276,14.481933,16.8064,14.196333,16.062667,13.552733,16.095733,...,16.5032,14.382333,14.3128,18.3468,17.144533,16.768733,15.909667,18.871333,15.5918,13.070667
8,137,5.366422,6.669133,4.941,5.007667,5.630067,6.281133,5.124533,5.7334,5.433067,...,4.4544,5.659333,4.285467,4.473067,6.906067,7.903,6.191,5.427,7.235667,5.617333
9,194,37.410677,43.458333,30.0516,44.688533,36.657467,33.238267,40.318,39.9478,39.917667,...,33.406267,42.952333,38.845067,42.2358,36.651467,38.410067,37.567,43.148133,39.093133,35.070333


In [13]:
results_df.to_csv("temp_solution.csv", index=False)