# Analyze Stage Completions

In [None]:
import json
import pathlib
import zipfile

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn.linear_model as sklm
import sklearn.metrics as skm
import scipy

In [None]:
import pendulum as pdt
import toolz.curried as toolz

## Preparation

In [None]:
# File name of each project's .ifrac file, keyed by project.
project_filenames = dict(
    bakken='frankNstein_Bakken_UTM13_FEET.ifrac',
    montney='Project-frankNstein_Montney_UTM13_METERS.ifrac',
    permian='Project_frankNstein_Permian_UTM13_FEET.ifrac',
)

In [None]:
# Directory that holds the project files listed in project_filenames.
test_data_path = pathlib.Path('c:/src/Orchid.IntegrationTestData/')
# Full path of every project file, keyed by project.
project_path_names = {
    project: test_data_path / file_name
    for project, file_name in project_filenames.items()
}
project_path_names

In [None]:
def project_json(path):
    """Return the parsed 'project.json' member of the zip archive at `path`."""
    with zipfile.ZipFile(path) as ifrac_archive:
        with ifrac_archive.open('project.json') as member:
            return json.load(member)

In [None]:
# Per-project containers, keyed by project name (e.g. 'bakken').
project_jsons = {}  # parsed content of each project's project.json
wells = {}  # for each project, the well objects keyed by their 'Name'

In [None]:
# Per-project containers, keyed by project name (e.g. 'bakken').
stages_seq = {}  # stage-detail records gathered from the selected wells
stages = {}  # DataFrame built from the stage-detail records
stages_by_seq_no = {}  # stages indexed and sorted by GlobalStageSequenceNumber
previous_treatment_starts = {}  # StartTime of stages 2 onward as a 'PreviousStart' column
stages_with_previous = {}  # stages joined column-wise with 'PreviousStart'

### Bakken

In [None]:
project_jsons['bakken'] = project_json(project_path_names['bakken'])

In [None]:
wells['bakken'] = {w['Name']: w for w in toolz.get_in(['Object', 'Wells'], project_jsons['bakken'])}

In [None]:
def string_to_date_time(i):
    """Convert the value of a time column from text to a pandas timestamp.

    `i` is a `(column_name, value)` pair. For the 'StartTime' and 'StopTime'
    columns the value is parsed into a `pd.Timestamp`, except that the text
    '0001-01-01T00:00:00.0000000' becomes `pd.NaT`. Any other pair is
    returned unchanged.
    """
    name, raw = i
    if name not in ('StartTime', 'StopTime'):
        return name, raw
    if raw == '0001-01-01T00:00:00.0000000':
        return name, pd.NaT
    return name, pd.Timestamp(pdt.parse(raw))
    
def stage_details(s):
    """Return the identifying and timing entries of the stage mapping `s`.

    Keeps only the 'DisplayStageNumber', 'GlobalStageSequenceNumber',
    'StartTime' and 'StopTime' entries and passes each kept item through
    `string_to_date_time`.
    """
    wanted = {'DisplayStageNumber', 'GlobalStageSequenceNumber', 'StartTime', 'StopTime'}
    kept = {key: value for key, value in s.items() if key in wanted}
    return dict(string_to_date_time(item) for item in kept.items())

def stages_details(project, well):
    """List the stage details of one well, tagged with its project and well.

    Looks up the 'Stages' of `well` in `wells[project]`, reduces each stage
    with `stage_details`, and merges the result over
    `{'Project': project, 'Well': well}`.
    """
    well_stages = toolz.get_in([well, 'Stages'], wells[project])
    return [
        toolz.merge({'Project': project, 'Well': well}, stage_details(stage))
        for stage in well_stages
    ]

In [None]:
# stages_details('bakken', 'Demo_1H')
# stages_details('bakken', 'Demo_2H')
# stages_details('bakken', 'Demo_3H')
# stages_details('bakken', 'Demo_4H')

In [None]:
# Gather the stage records of the four Bakken demo wells. The chained
# iterator is materialized into a list so that cells consuming
# stages_seq['bakken'] can be re-run without finding it exhausted.
stages_seq['bakken'] = list(toolz.concat([
    stages_details('bakken', 'Demo_1H'),
    stages_details('bakken', 'Demo_2H'),
    stages_details('bakken', 'Demo_3H'),
    stages_details('bakken', 'Demo_4H'),
]))

In [None]:
stages['bakken'] = pd.DataFrame(data=stages_seq['bakken'])

In [None]:
stages['bakken']

In [None]:
stages_by_seq_no['bakken'] = stages['bakken'].set_index('GlobalStageSequenceNumber').sort_index()
stages_by_seq_no['bakken']

In [None]:
stages_by_seq_no['bakken'].index

In [None]:
fig, ax = plt.subplots()

# Plot the sorted sequence numbers against their 1-based position. The
# position range follows the index length instead of a fixed stage count.
ax.plot(range(1, len(stages_by_seq_no['bakken'].index) + 1), stages_by_seq_no['bakken'].index)

plt.show()

In [None]:
def calculate_completion_time(row):
    """Return the seconds elapsed from `row['StartTime']` to `row['StopTime']`."""
    elapsed = row['StopTime'] - row['StartTime']
    return elapsed.total_seconds()

In [None]:
stages_by_seq_no['bakken']['CompletionTime'] = (
    stages_by_seq_no['bakken'].apply(calculate_completion_time, axis=1)
)
stages_by_seq_no['bakken']

In [None]:
# StartTime of every stage from sequence number 2 onward, as a one-column
# frame named 'PreviousStart'.
# NOTE(review): the index is left unshifted, so once this is joined
# column-wise with the stage table each row's 'PreviousStart' is that same
# row's StartTime -- confirm whether a one-stage shift was intended.
previous_treatment_starts['bakken'] = (
    stages_by_seq_no['bakken']
    .loc[2:, ['StartTime']]
    .rename(columns={'StartTime': 'PreviousStart'})
)
previous_treatment_starts['bakken']

In [None]:
# Join the stage table and the 'PreviousStart' column side by side, aligned
# on the sequence-number index. No `copy` argument is passed; pandas'
# default behavior is relied on.
stages_with_previous['bakken'] = pd.concat(
    [stages_by_seq_no['bakken'], previous_treatment_starts['bakken']],
    axis=1,
)
stages_with_previous['bakken']

In [None]:
stages_with_previous['bakken']['ChangeoverTime'] = (
    stages_with_previous['bakken']['StopTime'] - stages_with_previous['bakken']['PreviousStart']
)
stages_with_previous['bakken']

In [None]:
stages_with_previous['bakken']['ChangeoverTime'] = (
    stages_with_previous['bakken']['ChangeoverTime'].apply(lambda ptd: ptd.total_seconds())
)
stages_with_previous['bakken']

In [None]:
stages_with_previous['bakken'].plot.line(y='CompletionTime')

In [None]:
stages_with_previous['bakken'].hist(column='CompletionTime')

In [None]:
stages_with_previous['bakken'].plot.line(y='ChangeoverTime')

In [None]:
stages_with_previous['bakken'].hist(column='ChangeoverTime')

### Montney

In [None]:
project_jsons['montney'] = project_json(project_path_names['montney'])

In [None]:
wells['montney'] = {w['Name']: w for w in toolz.get_in(['Object', 'Wells'], project_jsons['montney'])}

In [None]:
# stages_details('montney', 'Hori_01')
# stages_details('montney', 'Hori_02')
# stages_details('montney', 'Hori_03')
# stages_details('montney', 'Vert_01')

In [None]:
# Gather the stage records of the four Montney wells. The chained iterator
# is materialized into a list so that cells consuming stages_seq['montney']
# can be re-run without finding it exhausted.
stages_seq['montney'] = list(toolz.concat([
    stages_details('montney', 'Hori_01'),
    stages_details('montney', 'Hori_02'),
    stages_details('montney', 'Hori_03'),
    stages_details('montney', 'Vert_01'),
]))

In [None]:
stages['montney'] = pd.DataFrame(data=stages_seq['montney'])

In [None]:
stages['montney']

In [None]:
stages_by_seq_no['montney'] = stages['montney'].set_index('GlobalStageSequenceNumber').sort_index()
stages_by_seq_no['montney']

In [None]:
stages_by_seq_no['montney'].index

In [None]:
fig, ax = plt.subplots()

# Plot the sorted sequence numbers against their 1-based position. The
# position range follows the index length instead of a fixed stage count.
ax.plot(range(1, len(stages_by_seq_no['montney'].index) + 1), stages_by_seq_no['montney'].index)

plt.show()

In [None]:
stages_by_seq_no['montney']['CompletionTime'] = (
    stages_by_seq_no['montney'].apply(calculate_completion_time, axis=1)
)
stages_by_seq_no['montney']

In [None]:
# StartTime of every stage from sequence number 2 onward, as a one-column
# frame named 'PreviousStart'.
# NOTE(review): the index is left unshifted, so once this is joined
# column-wise with the stage table each row's 'PreviousStart' is that same
# row's StartTime -- confirm whether a one-stage shift was intended.
previous_treatment_starts['montney'] = (
    stages_by_seq_no['montney']
    .loc[2:, ['StartTime']]
    .rename(columns={'StartTime': 'PreviousStart'})
)
previous_treatment_starts['montney']

In [None]:
# Join the stage table and the 'PreviousStart' column side by side, aligned
# on the sequence-number index. No `copy` argument is passed; pandas'
# default behavior is relied on.
stages_with_previous['montney'] = pd.concat(
    [stages_by_seq_no['montney'], previous_treatment_starts['montney']],
    axis=1,
)
stages_with_previous['montney']

In [None]:
stages_with_previous['montney']['ChangeoverTime'] = (
    stages_with_previous['montney']['StopTime'] - stages_with_previous['montney']['PreviousStart']
)
stages_with_previous['montney']

In [None]:
stages_with_previous['montney']['ChangeoverTime'] = (
    stages_with_previous['montney']['ChangeoverTime'].apply(lambda ptd: ptd.total_seconds())
)
stages_with_previous['montney']

In [None]:
stages_with_previous['montney'].plot.line(y='CompletionTime')

In [None]:
stages_with_previous['montney'].hist(column='CompletionTime')

In [None]:
stages_with_previous['montney'].plot.line(y='ChangeoverTime')

In [None]:
stages_with_previous['montney'].hist(column='ChangeoverTime')

### Permian

In [None]:
project_jsons['permian'] = project_json(project_path_names['permian'])

In [None]:
wells['permian'] = {w['Name']: w for w in toolz.get_in(['Object', 'Wells'], project_jsons['permian'])}

In [None]:
# stages_details('permian', 'C1')
# stages_details('permian', 'C2')
# stages_details('permian', 'C3')
# stages_details('permian', 'P1')

In [None]:
# Gather the stage records of the four Permian wells. The chained iterator
# is materialized into a list so that cells consuming stages_seq['permian']
# can be re-run without finding it exhausted.
stages_seq['permian'] = list(toolz.concat([
    stages_details('permian', 'C1'),
    stages_details('permian', 'C2'),
    stages_details('permian', 'C3'),
    stages_details('permian', 'P1'),
]))

In [None]:
stages['permian'] = pd.DataFrame(data=stages_seq['permian'])

In [None]:
stages['permian']

In [None]:
stages_by_seq_no['permian'] = stages['permian'].set_index('GlobalStageSequenceNumber').sort_index()
stages_by_seq_no['permian']

In [None]:
stages_by_seq_no['permian'].index

In [None]:
fig, ax = plt.subplots()

# Plot the sorted sequence numbers against their 1-based position. The
# position range follows the index length instead of a fixed stage count.
ax.plot(range(1, len(stages_by_seq_no['permian'].index) + 1), stages_by_seq_no['permian'].index)

plt.show()

In [None]:
stages_by_seq_no['permian']['CompletionTime'] = (
    stages_by_seq_no['permian'].apply(calculate_completion_time, axis=1)
)
stages_by_seq_no['permian']

In [None]:
# NOTE(review): this cell repeats the preceding cell's statements verbatim;
# it recomputes the 'CompletionTime' column from StartTime and StopTime.
stages_by_seq_no['permian']['CompletionTime'] = (
    stages_by_seq_no['permian'].apply(calculate_completion_time, axis=1)
)
stages_by_seq_no['permian']

In [None]:
# StartTime of every stage from sequence number 2 onward, as a one-column
# frame named 'PreviousStart'.
# NOTE(review): the index is left unshifted, so once this is joined
# column-wise with the stage table each row's 'PreviousStart' is that same
# row's StartTime -- confirm whether a one-stage shift was intended.
previous_treatment_starts['permian'] = (
    stages_by_seq_no['permian']
    .loc[2:, ['StartTime']]
    .rename(columns={'StartTime': 'PreviousStart'})
)
previous_treatment_starts['permian']

In [None]:
# Join the stage table and the 'PreviousStart' column side by side, aligned
# on the sequence-number index. No `copy` argument is passed; pandas'
# default behavior is relied on.
stages_with_previous['permian'] = pd.concat(
    [stages_by_seq_no['permian'], previous_treatment_starts['permian']],
    axis=1,
)
stages_with_previous['permian']

In [None]:
stages_with_previous['permian']['ChangeoverTime'] = (
    stages_with_previous['permian']['StopTime'] - stages_with_previous['permian']['PreviousStart']
)
stages_with_previous['permian']

In [None]:
stages_with_previous['permian']['ChangeoverTime'] = (
    stages_with_previous['permian']['ChangeoverTime'].apply(lambda ptd: ptd.total_seconds())
)
stages_with_previous['permian']

In [None]:
stages_with_previous['permian'].plot.line(y='CompletionTime')

In [None]:
stages_with_previous['permian'].hist(column='CompletionTime')

In [None]:
stages_with_previous['permian'].plot.line(y='ChangeoverTime')

In [None]:
stages_with_previous['permian'].hist(column='ChangeoverTime')

In [None]:
# Permian stages whose ChangeoverTime is below 10000 (rows with a missing
# ChangeoverTime do not satisfy the comparison and are left out).
permian_previous_below_10k = stages_with_previous['permian'].loc[
    stages_with_previous['permian']['ChangeoverTime'].lt(10000)
]
permian_previous_below_10k

In [None]:
permian_previous_below_10k.plot.line(y='CompletionTime')

In [None]:
permian_previous_below_10k.hist(column='CompletionTime')

In [None]:
permian_previous_below_10k.plot.line(y='ChangeoverTime')

In [None]:
permian_previous_below_10k.hist(column='ChangeoverTime')

## Analysis

### Completion

In [None]:
stages_with_previous['bakken']['CompletionTime'].describe()

### Changeover

In [None]:
bakken_changeover = stages_with_previous['bakken'].loc[2:, 'ChangeoverTime']
bakken_changeover

In [None]:
linear_regressor = sklm.LinearRegression()

In [None]:
bakken_regressable = stages_with_previous['bakken'].loc[2:]
bakken_regressable

In [None]:
# Converts numpy array to (column) vector of single sample arrays
bakken_x = bakken_regressable.index.to_numpy().reshape(-1, 1)
# bakken_x

In [None]:
# Converts numpy array to (column) vector of single sample arrays
bakken_y = bakken_regressable.loc[:, 'ChangeoverTime'].to_numpy().reshape(-1, 1)
# bakken_y

In [None]:
# Fresh, unfitted estimator; this rebinds the name created in an earlier cell.
linear_regressor = sklm.LinearRegression()

In [None]:
linear_regressor.fit(bakken_x, bakken_y)

In [None]:
bakken_y_prediction = linear_regressor.predict(bakken_x)

In [None]:
plt.scatter(bakken_changeover.index, bakken_changeover.values)
plt.plot(bakken_x, bakken_y_prediction, color='red')
plt.show()

In [None]:
linear_regressor.coef_, linear_regressor.intercept_

In [None]:
skm.mean_squared_error(bakken_y, bakken_y_prediction), skm.r2_score(bakken_y, bakken_y_prediction)

In [None]:
# Changeover times from stage 2 up to, but excluding, the last stage. The
# last row is dropped by position rather than by a hard-coded label, so the
# cell does not depend on the number of stages in the project.
bakken_changeover_delta = (
    stages_with_previous['bakken'].loc[2:, 'ChangeoverTime'].iloc[:-1].to_frame()
)
# Re-label each value one sequence number later so that, when joined on the
# index, it sits next to the stage that follows it.
bakken_changeover_delta.index += 1
bakken_changeover_delta.columns = ['PreviousChangeoverTime']
bakken_changeover_delta

In [None]:
# Put each stage's changeover time next to the previous stage's value,
# aligned on the sequence-number index.
bakken_with_delta = pd.concat(
    [stages_with_previous['bakken'], bakken_changeover_delta],
    axis=1,
)
# Keep rows labelled 2 and above. The explicit copy makes the result an
# independent frame, so columns can be added to it without touching (or
# being flagged as touching) the concatenated frame it was sliced from.
bakken_with_delta = bakken_with_delta.loc[2:, :].copy()
bakken_with_delta

In [None]:
bakken_with_delta['Delta'] = (
    bakken_with_delta['ChangeoverTime'] - bakken_with_delta['PreviousChangeoverTime']
)
bakken_with_delta

In [None]:
bakken_with_delta.hist('Delta')

In [None]:
bakken_with_delta['Delta'].describe()

In [None]:
def predicted_changeover(row):
    """Evaluate the fitted line at `row['GlobalStageSequenceNumber']`.

    Uses the `coef_` and `intercept_` of the module-level `linear_regressor`
    and returns element `[0][0]` of the resulting array.
    """
    sequence_no = row['GlobalStageSequenceNumber']
    fitted = sequence_no * linear_regressor.coef_ + linear_regressor.intercept_
    return fitted[0][0]

In [None]:
# Observed changeover times from stage 2 onward, with the index turned back
# into an ordinary column so predicted_changeover can read it from each row.
bakken_with_prediction = (
    stages_with_previous['bakken']
    .loc[2:, ['ChangeoverTime']]
    .reset_index()
)
prediction = bakken_with_prediction.apply(predicted_changeover, axis=1)
bakken_with_prediction['Prediction'] = prediction
# Error is prediction minus observation.
bakken_with_prediction['Error'] = bakken_with_prediction['Prediction'].sub(
    bakken_with_prediction['ChangeoverTime']
)
bakken_with_prediction

In [None]:
bakken_with_prediction.hist('Error')

In [None]:
bakken_with_prediction['Error'].describe()