# Model Building Pipeline

In [None]:
import pandas as pd
import numpy as np
import warnings; warnings.simplefilter('ignore')

In this notebook, we establish the model pipeline that we will use to create our final model for the 2024 Hackathon problem.

The cell below sets the path to the data that we want to use. We make the following assumptions about the data before passing it to model building:

1. Data has been cleaned and the feature columns contain no NaNs (rows whose target value is NaN are kept on purpose: they form the test set)
2. Data has been normalized per feature
3. Categorical data has been correctly dealt with (OHE)
4. Target variable is labeled exactly as: "Avg Pump Difference" (the column name is case-sensitive)

In [None]:
# Location of the prepared dataset; edit this to point at your own copy
path_to_data = "Final_dataset.csv"

# Read the whole CSV into memory, then preview the first rows as the cell output
raw_df = pd.read_csv(filepath_or_buffer=path_to_data)
raw_df.head(n=5)

Now we split the data into training and test sets: rows with a missing target value form the test set, and all remaining rows form the training set.

In [None]:
# Rows without a target value are the ones we must predict (test set);
# every row with a known target is available for training.
target_is_missing = raw_df["Avg Pump Difference"].isna()

test_df = raw_df.loc[target_is_missing].reset_index(drop=True)
train_df = raw_df.loc[~target_is_missing].reset_index(drop=True)

## Imports

In [None]:
# Linear Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge, SGDRegressor

# Kernel Ridge
from sklearn.kernel_ridge import KernelRidge

# SVM
from sklearn.svm import SVR

# Nearest Neighbors
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor

# Bayes
from sklearn.naive_bayes import GaussianNB

# Decision Tree
from sklearn.tree import DecisionTreeRegressor

# Ensemble
from sklearn.ensemble import RandomForestRegressor


In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Name of the target column
y = "Avg Pump Difference"

# Every other column of the training frame is treated as a feature.
# `all_cols` and `x` intentionally refer to the same list.
all_cols = list(train_df.columns)
all_cols.remove(y)
x = all_cols

# Feature frame, and the target kept as a single-column DataFrame
X_train = train_df.loc[:, x]
Y_train = train_df.loc[:, [y]]

In [None]:
def training_loop(model, X, Y, num_folds, random_state=None):
    """Estimate a model's error with shuffled K-fold cross-validation.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        The estimator to evaluate. It is re-fit on every fold, so on return
        it is left fitted on the training portion of the *last* fold only.
    X : array-like of shape (n_samples, n_features)
        Feature matrix. NumPy arrays and pandas DataFrames are both accepted.
    Y : array-like of shape (n_samples,) or (n_samples, 1)
        Target values. A single-column 2-D target is flattened to 1-D before
        fitting, which avoids scikit-learn's column-vector conversion warning.
    num_folds : int
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the previous behaviour
        (a different split on every call); pass an int for reproducible scores.

    Returns
    -------
    float
        Mean of the per-fold mean squared errors (MSE, not RMSE).
    """
    # Positional indexing below needs plain arrays, whatever the caller passed.
    features = np.asarray(X)
    targets = np.asarray(Y)
    if targets.ndim == 2 and targets.shape[1] == 1:
        targets = targets.ravel()

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)

    fold_mse = []
    for train_index, test_index in kf.split(features):
        model.fit(features[train_index], targets[train_index])

        pred_Y = model.predict(features[test_index])

        fold_mse.append(mean_squared_error(targets[test_index], pred_Y))

    return sum(fold_mse) / len(fold_mse)

In [None]:
def _numbered(label, estimator_cls, param_grid):
    """Return [("<label> - <k>", estimator_cls(**params)), ...], with k counting from 1."""
    return [
        ("%s - %d" % (label, k), estimator_cls(**params))
        for k, params in enumerate(param_grid, start=1)
    ]


# Depth settings shared by the decision-tree and random-forest candidates
_tree_depths = (None, 5, 10, 15, 20, 25, 30)

# Candidate models as (display name, unfitted estimator) pairs.
# An empty dict in a grid means "leave that hyper-parameter at the library default".
models = (
    [("Linear", LinearRegression())]
    # Ridge: each regularisation strength with both solvers
    + _numbered("Ridge", Ridge, [
        dict(alpha_kw, solver=solver)
        for alpha_kw in ({}, {"alpha": 1.5}, {"alpha": 2}, {"alpha": 2.5})
        for solver in ("svd", "lsqr")
    ])
    # Lasso: each regularisation strength with and without warm start
    + _numbered("Lasso", Lasso, [
        dict(alpha_kw, warm_start=warm)
        for alpha_kw in ({}, {"alpha": 1.5}, {"alpha": 2})
        for warm in (True, False)
    ])
    + [
        ("BayesianRidge", BayesianRidge()),
        ("Kernel Ridge", KernelRidge()),
    ]
    # SVR: default kernel, then "poly", then "sigmoid", each at five degrees.
    # NOTE(review): per the scikit-learn docs `degree` is only used by the
    # "poly" kernel, so SVR 1-5 and SVR 11-15 are five repeats of one model
    # each. They are kept so existing result names stay stable.
    + _numbered("SVR", SVR, [
        dict(kernel_kw, degree=degree)
        for kernel_kw in ({}, {"kernel": "poly"}, {"kernel": "sigmoid"})
        for degree in (3, 4, 5, 10, 15)
    ])
    + _numbered("KNN", KNeighborsRegressor, [
        {"n_neighbors": k} for k in range(5, 36, 5)
    ])
    + _numbered("DT", DecisionTreeRegressor, [
        {"max_depth": depth} for depth in _tree_depths
    ])
    # Random forests: default tree count, then 150 trees ("A"), then 50 trees ("B")
    + _numbered("RF", RandomForestRegressor, [
        {"max_depth": depth} for depth in _tree_depths
    ])
    + _numbered("A - RF", RandomForestRegressor, [
        {"n_estimators": 150, "max_depth": depth} for depth in _tree_depths
    ])
    + _numbered("B - RF", RandomForestRegressor, [
        {"n_estimators": 50, "max_depth": depth} for depth in _tree_depths
    ])
)

# Every candidate needs a distinct label so results can be looked up by name.
assert len({name for name, _ in models}) == len(models)

# Collected as [model name, mean cross-validated MSE] rows
actual_results = []

# "Well ID" is an identifier, not a feature, so it is excluded from the model input
well_id = X_train["Well ID"]
actual_X = X_train.drop(["Well ID"], axis = 1)

scaler = StandardScaler()
pca = PCA(n_components=0.9)

# NOTE(review): the scaler and PCA are fit on all training rows *before*
# cross-validation, so each validation fold has already influenced the
# preprocessing. The scores below may therefore be slightly optimistic.
scaled_X = scaler.fit_transform(actual_X)
pca_X = pca.fit_transform(scaled_X)

for name, m in models:
    result = training_loop(m, pca_X, Y_train, 10)
    actual_results.append([name, result])
    # training_loop returns the mean squared error; the old message labelled
    # that number "RMSE". Report the MSE under its real name and show the
    # root of it alongside.
    print("Finished model: %s, MSE: %.2f, RMSE: %.2f" % (name, result, np.sqrt(result)))

## Uncertainty Model

In [None]:
# Bootstrap ensemble: train the chosen model on many resamples of the
# training data and record each round's test-set predictions, one column per
# round, so the spread across columns reflects the prediction uncertainty.
results_df = pd.DataFrame()

results_df["Well ID"] = test_df["Well ID"]

# The test features are the same in every round, so build them once.
test_data = test_df.drop(["Well ID", "Avg Pump Difference"], axis = 1)

base_seed = 569214

for i in range(100):
    # Use a different seed each round. With one fixed seed every "resample"
    # contains exactly the same rows and the ensemble carries no bootstrap
    # variation at all.
    round_seed = base_seed + i
    random_sample = train_df.sample(n=train_df.shape[0], replace=True, random_state=round_seed)

    scaler = StandardScaler()
    pca = PCA(n_components=0.9)

    # 1-D target, as scikit-learn regressors expect
    cur_y = random_sample["Avg Pump Difference"]
    data = random_sample.drop(["Avg Pump Difference", "Well ID"], axis = 1)

    cur_model = RandomForestRegressor(n_estimators=150, max_depth = 30, random_state=round_seed)

    scaled_data = scaler.fit_transform(data)
    pca_data = pca.fit_transform(scaled_data)

    # Fit on the full bootstrap sample. Previously the model was only fitted
    # as a side effect of a cross-validation run whose score was discarded,
    # which left it trained on the last fold's training split alone.
    cur_model.fit(pca_data, cur_y)

    # Apply the preprocessing fitted on this round's resample to the test rows
    scaled_test = scaler.transform(test_data)
    pca_test = pca.transform(scaled_test)

    raw_pred = cur_model.predict(pca_test)

    results_df["R%d, GPM" % (i+1)] = raw_pred

    print("Completed", i)
 

In [None]:
# Average the per-round prediction columns into a single point estimate.
# The columns are chosen by name rather than by position so that re-running
# this cell (or running it after the columns have been reordered) never
# averages the estimate column into itself.
non_run_cols = ("Well ID", "Est Pump Difference, GPM")
temp_df = results_df[[c for c in results_df.columns if c not in non_run_cols]]
results_df["Est Pump Difference, GPM"] = temp_df.mean(axis = 1)

In [None]:
# Put the identifier and the point estimate first, followed by every
# per-round prediction column in its existing order. Choosing the remaining
# columns by name keeps all of them (the old positional slice `[1:-2]`
# silently dropped the last round) and makes the cell safe to re-run.
lead_cols = ["Well ID", "Est Pump Difference, GPM"]
cols = lead_cols + [c for c in results_df.columns if c not in lead_cols]

results_df = results_df[cols]
results_df

In [None]:
# Persist the final table (without the row index) for submission
results_df.to_csv(path_or_buf="temp_solution.csv", index=False)