# Irvin EDA
Irvin's notebook with EDA and data processing.

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score
import plotly.express as px

In [33]:
SEED = 42

## Combine Well Data

Combining all the well data, properly adding NaNs, and adding positional information to the WELL_MASTER.csv

In [3]:
well_log_dir = r"/work/Carpe-Datum-2022/Well_Log/"

In [4]:
well_positional_df = pd.read_csv("/work/Carpe-Datum-2022/Well_Head_and_Completion.csv").set_index("WELL")

In [5]:
def process_data(filename: str) -> pd.DataFrame:
    df = pd.read_csv(well_log_dir + filename)
    well_id = filename.split(".")[0]
    df["well_id"] = well_id
    df["X"] = well_positional_df.loc[well_id, "X(ft)"]
    df["Y"] = well_positional_df.loc[well_id, "Y(ft)"]
    df["Completion Zone"] = well_positional_df.loc[well_id, "Completion"]
    return df

In [7]:
# Grab all WP files
files = os.listdir(well_log_dir)
master = pd.DataFrame()

# Read in files into a list of dataframes, excluding ourselves
wells = [process_data(file) for file in files if "WELL_MASTER" not in file]

master = pd.concat(wells)

In [8]:
# Hard-coded because I'm too lazy to figure out how to do this correctly.
master = master.replace([-999, -999.0000000000005, -999.0000000000002, -998.9999999999998], np.NaN)
master.describe()

Unnamed: 0,MD(ft),PORO(v/v),Permeability(mD),RHOB(g/cm3),DTS(us/ft),DT(us/ft),PEF(B/E),RD(OHMM),RS(OHMM),ROP,DENC(g/cm3),NPHI(v/v),X,Y
count,1963.0,455.0,221.0,1918.0,1004.0,1920.0,1689.0,1724.0,1724.0,1726.0,1684.0,1963.0,1963.0,1963.0
mean,4673.224657,0.13114,10.497034,2.472862,128.273096,80.322722,5.351575,181.844254,514.513168,20.167304,0.047522,0.200203,124875.694213,87554.249042
std,165.718326,0.062887,39.248645,0.207148,35.222315,17.827921,2.47529,184.443838,3246.082233,28.413135,0.02744,0.107541,9796.471857,3209.167562
min,4228.005683,0.019566,1e-06,1.991035,-471.675434,1.138524,0.019155,10.197411,2.270108,-490.300267,-0.050168,-0.0019,109560.26,81747.12
25%,4546.547858,0.072468,0.0195,2.326909,116.81309,69.493418,4.930555,70.345749,71.606283,16.81594,0.03083,0.122878,115987.29,84476.47
50%,4666.802978,0.129662,0.094813,2.469486,131.851152,78.929504,5.782597,130.884864,137.398654,20.011714,0.050912,0.186431,124094.8,88549.74
75%,4796.413228,0.185373,1.948104,2.57889,142.295954,87.86383,6.477832,213.021581,259.946834,27.341557,0.063165,0.244933,134042.86,89987.0
max,5096.702794,0.249241,352.697773,3.044163,186.253964,179.133001,11.490936,2508.770579,62290.8,46.8954,0.161357,1.021738,142095.82,93691.55


In [9]:
master.to_csv(well_log_dir + "WELL_MASTER.csv", index=False)

In [10]:
prod_df = pd.read_csv("/work/Carpe-Datum-2022/Production_History_Field.csv", index_col="Date")

In [11]:
prod_df["Total"] = prod_df.sum(axis=1)

In [12]:
prod_df.head()

Unnamed: 0_level_0,WOPR:WP0(bbl/day),WOPR:WP1(bbl/day),WOPR:WP10(bbl/day),WOPR:WP11(bbl/day),WOPR:WP12(bbl/day),WOPR:WP13(bbl/day),WOPR:WP14(bbl/day),WOPR:WP15(bbl/day),WOPR:WP16(bbl/day),WOPR:WP17(bbl/day),...,WOPR:WP46(bbl/day),WOPR:WP47(bbl/day),WOPR:WP48(bbl/day),WOPR:WP49(bbl/day),WOPR:WP5(bbl/day),WOPR:WP6(bbl/day),WOPR:WP7(bbl/day),WOPR:WP8(bbl/day),WOPR:WP9(bbl/day),Total
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-Jan-12,8.945508,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,381.691952
1-Feb-12,8.854879,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,344.603663
1-Mar-12,8.845839,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,340.549643
1-Apr-12,8.839578,0.0,0.0,0,0.0,0.0,0.0,3483.552017,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3821.070871
1-May-12,8.834659,0.0,0.0,0,0.0,0.0,0.0,3374.525497,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.598606,3717.077014


## Production Plot

Plot of time series data for each well's production, as well as the total production for the collection, as a whole.

In [13]:

px.line(prod_df, x=prod_df.index, y=prod_df.columns)

In [14]:
well_time_series = [prod_df[prod_df[column] != 0][column].reset_index(drop=True) for column in prod_df.columns]

## Decline Curve Analysis DataFrame

Has all 0 values removed, so that the initial, as well as the 

In [43]:
well_time_series_df = pd.concat(well_time_series, axis=1)
well_time_series_df.describe()

Unnamed: 0,WOPR:WP0(bbl/day),WOPR:WP1(bbl/day),WOPR:WP10(bbl/day),WOPR:WP11(bbl/day),WOPR:WP12(bbl/day),WOPR:WP13(bbl/day),WOPR:WP14(bbl/day),WOPR:WP15(bbl/day),WOPR:WP16(bbl/day),WOPR:WP17(bbl/day),...,WOPR:WP46(bbl/day),WOPR:WP47(bbl/day),WOPR:WP48(bbl/day),WOPR:WP49(bbl/day),WOPR:WP5(bbl/day),WOPR:WP6(bbl/day),WOPR:WP7(bbl/day),WOPR:WP8(bbl/day),WOPR:WP9(bbl/day),Total
count,54.0,26.0,34.0,0.0,87.0,80.0,46.0,15.0,44.0,13.0,...,22.0,30.0,63.0,54.0,77.0,30.0,12.0,50.0,41.0,121.0
mean,8.809168,38.675641,46.363101,,0.664632,22.342583,381.194628,3060.205978,1221.216449,1367.727037,...,1249.728244,1215.319622,464.88244,351.281916,13.723622,1033.318772,5.925703,13.095474,6.434229,8168.554806
std,0.024437,0.271776,0.675604,,0.044145,0.867709,45.894232,223.213725,388.403162,73.441095,...,94.585474,131.419452,127.722299,89.23619,3.947069,273.942482,0.544118,0.098262,0.674547,2647.876209
min,8.785622,38.346559,45.500584,,0.574007,21.417261,297.860478,2744.731762,650.708425,1263.301841,...,1123.836187,1038.169784,261.701444,213.002484,7.129533,696.642534,5.269145,12.969189,5.49897,340.549643
25%,8.795126,38.467457,45.795829,,0.629549,21.613251,343.387504,2890.105172,869.892989,1305.778714,...,1172.948476,1105.357913,353.050039,274.153564,10.257872,799.168559,5.473093,13.017598,5.865219,6632.304585
50%,8.804288,38.611888,46.258112,,0.674357,21.99842,390.596558,3030.348207,1199.865209,1375.149992,...,1239.033637,1189.925342,464.081421,348.436063,13.952722,979.260694,5.855899,13.075749,6.280917,8075.815412
75%,8.814843,38.818828,46.803097,,0.707594,22.946195,420.371976,3215.375697,1549.21931,1421.757508,...,1311.294321,1335.117225,578.041027,426.108004,17.214352,1226.342823,6.320583,13.15295,7.059284,9800.313135
max,8.945508,39.393796,48.109917,,0.717781,24.529675,447.046761,3483.552017,1941.703124,1484.466481,...,1441.033779,1454.967513,676.068447,512.678527,19.81721,1611.39346,6.854249,13.385876,7.598606,13674.068207


In [47]:
two_year_well_df = well_time_series_df.loc[:, well_time_series_df.isnull().sum() < (121-24)].T

In [69]:
two_year_well_df.columns = [str(x) for x in two_year_well_df.columns]
# two_year_well_df = two_year_well_df.drop("Total", axis=0)
two_year_well_df["decline_init"] = two_year_well_df["1"] - two_year_well_df["0"]
two_year_well_df["sum"] = two_year_well_df.loc[:, "0":"24"].sum(axis=1)
two_year_X = two_year_well_df[["0", "decline_init"]]
two_year_y = two_year_well_df["sum"]
train_X, test_X, train_y, test_y = train_test_split(two_year_X, two_year_y, test_size=.2, random_state=SEED)
px.scatter_3d(two_year_well_df, x="decline_init", y="0", z="sum", hover_name=two_year_well_df.index)

In [70]:
two_year_model = LinearRegression()

two_year_model.fit(train_X, train_y)
two_year_preds = two_year_model.predict(test_X)

In [71]:
error = mean_absolute_percentage_error(two_year_preds, test_y)
r2_scored = r2_score(two_year_preds, test_y)
r2_scored

0.9975131313830568

In [None]:
# I don't know what I'm trying to do here and the math is obviously wrong too.
well_mean_decline_factor = {well.name: (1 - (well.iloc[0] - well.iloc[-1]) ** (1/ well.shape[0])) for well in well_time_series if well.get(0)}
well_mean_decline_factor


invalid value encountered in double_scalars



{'WOPR:WP0(bbl/day)': 0.03338011693272547,
 'WOPR:WP1(bbl/day)': -0.0017768093629044746,
 'WOPR:WP10(bbl/day)': -0.02861029665315029,
 'WOPR:WP12(bbl/day)': 0.02204661025897181,
 'WOPR:WP13(bbl/day)': -0.012068401578415822,
 'WOPR:WP14(bbl/day)': -0.11494893222843094,
 'WOPR:WP15(bbl/day)': -0.5532305342342627,
 'WOPR:WP16(bbl/day)': -0.17680045764116303,
 'WOPR:WP17(bbl/day)': -0.5148259825319825,
 'WOPR:WP18(bbl/day)': -0.05070721267759781,
 'WOPR:WP19(bbl/day)': 0.02663099846443351,
 'WOPR:WP2(bbl/day)': 0.011567335031054893,
 'WOPR:WP20(bbl/day)': -0.01031891904055593,
 'WOPR:WP21(bbl/day)': 0.06256979212629499,
 'WOPR:WP22(bbl/day)': 0.05424319710998837,
 'WOPR:WP23(bbl/day)': 0.013197189081264882,
 'WOPR:WP24(bbl/day)': -0.0766858558029837,
 'WOPR:WP25(bbl/day)': -0.4197542441673068,
 'WOPR:WP26(bbl/day)': -0.1555376181812389,
 'WOPR:WP27(bbl/day)': 0.01775273875839123,
 'WOPR:WP28(bbl/day)': -0.046417321467089545,
 'WOPR:WP29(bbl/day)': -0.1889305641597574,
 'WOPR:WP3(bbl/day)':

## Average Production relative to position

In [None]:
well_time_series_df.columns = [name[5:-9] for name in well_time_series_df.columns]

In [None]:
avg_prod_df = well_time_series_df.mean().rename("Average_Production")
avg_prod_df

WP0        8.809168
WP1       38.675641
WP10      46.363101
WP11            NaN
WP12       0.664632
WP13      22.342583
WP14     381.194628
WP15    3060.205978
WP16    1221.216449
WP17    1367.727037
WP18     291.086371
WP19       7.082097
WP2        5.276535
WP20      39.545986
WP21       1.579448
WP22      13.159516
WP23       6.300324
WP24      90.374282
WP25    1328.452893
WP26     439.336342
WP27       9.066930
WP28     131.235255
WP29     859.540421
WP3     1606.279426
WP30       1.125125
WP31     460.674842
WP32    1618.477384
WP33       0.225082
WP34    1475.869991
WP35       2.201824
WP36     376.967843
WP37     377.621596
WP38     932.002925
WP39      13.290233
WP4      257.113535
WP40      49.518124
WP41    1056.826177
WP42    1146.619302
WP43    2175.493437
WP44     444.719213
WP45       0.142503
WP46    1249.728244
WP47    1215.319622
WP48     464.882440
WP49     351.281916
WP5       13.723622
WP6     1033.318772
WP7        5.925703
WP8       13.095474
WP9        6.434229


In [None]:
sum_prod_df = well_time_series_df.sum().rename("Sum_Production")
new_df = pd.concat([well_positional_df, sum_prod_df], axis=1).dropna(subset=["X(ft)"], axis=0)
new_df.to_csv('new_df2.csv')

In [None]:
new_df = pd.concat([well_positional_df, avg_prod_df], axis=1).dropna(subset=["X(ft)"], axis=0)
new_df

Unnamed: 0,X(ft),Y(ft),Completion,Elevation Kelly Bushing (ft),Average_Production
WP0,137106.82,83818.47,Upper,193.32,8.809168
WP1,132460.98,85832.77,Upper,187.15,38.675641
WP2,133634.0,82144.52,Upper,164.55,5.276535
WP3,140892.57,81747.12,Lower,179.92,1606.279426
WP4,109951.43,89343.45,Lower,186.21,257.113535
WP5,123950.95,92227.31,Lower,172.8,13.723622
WP6,111035.32,88692.32,Lower,182.66,1033.318772
WP7,112439.69,85560.85,Lower,181.09,5.925703
WP8,127369.74,91739.99,Upper,180.04,13.095474
WP9,115943.69,86730.44,Lower,178.25,6.434229


In [None]:
px.scatter_3d(new_df, x="X(ft)", y="Y(ft)", z="Sum_Production", color="Sum_Production")

In [None]:
import plotly.graph_objects as go

fig = go.Figure(data=[go.Surface(z=np.diag(new_df["Average_Production"].values), x=new_df["X(ft)"].values, y=new_df["Y(ft)"].values)])

fig.show()

In [None]:
np.diag(new_df["Average_Production"].values)

array([[   8.80916814,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,   38.67564086,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    5.27653451, ...,    0.        ,
           0.        ,    0.        ],
       ...,
       [   0.        ,    0.        ,    0.        , ..., 1215.31962223,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
         464.88244033,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,  351.28191581]])

In [None]:
new_df.to_csv('new_df.csv')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6b71fecc-b405-4b93-bce3-c5ed6c21c449' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>