# EDA Datathon2021 by Softypo

In [1]:
# Importing module libraries
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
from tqdm.notebook import tqdm
import dataframe_utilities as dfutil

## Data loading

In [2]:
# Load every dataset through the project helper `dfutil.loader`, using the
# first column of each file as the index (index_col=0).
# The paths are raw strings so the backslash separators stay literal:
# "\s", "\E" and "\D" are not valid escape sequences in a normal string
# literal and trigger a SyntaxWarning on recent Python versions.
# NOTE(review): backslash-separated paths presumably only resolve on Windows --
# confirm how dfutil.loader treats its path argument.
# Global files
target_files = dfutil.loader(r'Data for Datathon\structured_data', index_col=0)
# Eaglebine files
Eaglebine_files = dfutil.loader(r'Data for Datathon\Eaglebine\Eaglebine', index_col=0)
# Duvernay files
Duvernay_files = dfutil.loader(r'Data for Datathon\Duvernay\Duvernay', index_col=0)

In [3]:
# master dataframe
# NOTE: `df` is the very same object as target_files['set_assign'] (no copy is
# made), so the column added below is also visible through target_files.
df = target_files['set_assign']
# creating static/synthetic category:
# a row is 'static' when the joined 'Temp (degC)' value from Data_static_logs
# is present and 'synthetic' when it is missing.
static_temp = target_files['set_assign'].join(target_files['Data_static_logs'])['Temp (degC)']
# notna() is vectorised and copes with None / pd.NA, unlike a per-row
# `~np.isnan(row)` which only works for float values.
# The labels are assigned positionally against df.index, so the join must
# return exactly one row per row of df (a length mismatch raises).
df['source'] = pd.Series(
    np.where(static_temp.notna(), 'static', 'synthetic'),
    dtype="category",
    index=df.index,
)

### Tweaks and fixes

In [4]:
# combined_temperature: use its 'UWI' column as the index
target_files['combined_temperature'].set_index('UWI', inplace=True)

# Eaglebine well headers: move the current index back into a column and
# index the frame by 'displayapi' instead
eb_headers = Eaglebine_files['Eaglebine_well_headers']
eb_headers.reset_index(inplace=True)
eb_headers.set_index('displayapi', inplace=True)

# Eaglebine casing/production: multiply the index values by 10000
# (presumably to match the identifier format used by the other files -- confirm)
eb_casing = Eaglebine_files['EagleBine_Casing_production']
eb_casing.index = eb_casing.index * 10000

# production files: trim surrounding whitespace from the column names
for production in (Eaglebine_files['Eaglebine_production'],
                   Duvernay_files['SPE_Duvernay_production']):
    production.rename(columns=lambda name: name.strip(), inplace=True)

# formation at TD, derived from Eaglebine_tops:
# look at the columns in positions 9..27, flag the cells equal to the row-wise
# maximum, and keep the label of the first flagged column of each row.
# NOTE(review): a row with no flagged cell (e.g. all values missing) still gets
# the first column label from idxmax -- confirm this is acceptable.
eb_tops = Eaglebine_files['Eaglebine_tops'].iloc[:, 9:28]
row_max = eb_tops.max(axis=1)
Eaglebine_files['Eaglebine_BHT']['formation'] = eb_tops.isin(row_max).idxmax(1)

# dropping files that are not used from here on
Duvernay_files.pop('Duvernay_tops')
Eaglebine_files.pop('Eaglebine_tops')

# dropping rows with a duplicated index
# (production keeps the first occurrence, mud keeps the last one)
eb_production = Eaglebine_files['Eaglebine_production']
Eaglebine_files['Eaglebine_production'] = eb_production[~eb_production.index.duplicated(keep='first')]
eb_mud = Eaglebine_files['Eaglebine_mud']
Eaglebine_files['Eaglebine_mud'] = eb_mud[~eb_mud.index.duplicated(keep='last')]

### Reconciling names and dropping redundant columns

In [5]:
# Feet -> metres conversion factor used for the Eaglebine columns below.
FT_TO_M = 0.3048

# well headers: convert the Eaglebine elevation to metres, drop 'td', and give
# both basins the same elevation column name
Eaglebine_files['Eaglebine_well_headers']['Elevation'] = Eaglebine_files['Eaglebine_well_headers']['Elevation']*FT_TO_M
Eaglebine_files['Eaglebine_well_headers'].drop(columns=['td'], inplace=True)

Eaglebine_files['Eaglebine_well_headers'].rename(columns = {'Elevation':'Elevation_KB_meters'}, inplace = True)
Duvernay_files['Duvernay_well_headers'].rename(columns = {'Elevation Meters':'Elevation_KB_meters'}, inplace = True)

# BHT files: drop the columns not carried into the merge and use a common
# 'formation' column name in both basins
Eaglebine_files['Eaglebine_BHT'].drop(columns=['SurfLat', 'SurfLong', 'TD (ft)', 'BHT_below sea level (ft)', 'BHT_ subsurface (ft)', 'GL(ft)', 'BHTorMRT (maximum recorded temperature) oF'], inplace=True)
Duvernay_files['Duvernay_BHT'].drop(columns=['DST Bottom Hole Temp. (degC)', 'Unnamed: 9', 'elevation M above sea level', 'UWI'], inplace=True)

Duvernay_files['Duvernay_BHT'].rename(columns = {'Formation DSTd':'formation'}, inplace = True)

# Casing / mud column suffixes
# NOTE: DataFrame.add_suffix returns a NEW frame; a bare call such as
#   Eaglebine_files['Eaglebine_mud'].add_suffix('_mud_info')
# renames nothing. No suffix is applied in this cell, so the column names that
# reach the merge stay exactly as they are. To tag columns by origin, assign
# the result back, e.g.
#   Eaglebine_files['Eaglebine_mud'] = Eaglebine_files['Eaglebine_mud'].add_suffix('_mud_info')
#   Eaglebine_files['EagleBine_Casing_production'] = Eaglebine_files['EagleBine_Casing_production'].add_suffix('_casing_info')
# and update any later code that refers to the unsuffixed names.

# mud files: scale 'MW@Depth(KB)' by the feet->metres factor and rename it accordingly
Eaglebine_files['Eaglebine_mud']['MW@Depth(KB)'] = Eaglebine_files['Eaglebine_mud']['MW@Depth(KB)']*FT_TO_M
Eaglebine_files['Eaglebine_mud'].drop(columns=['TD', 'KB'], inplace=True)

Eaglebine_files['Eaglebine_mud'].rename(columns = {'MW@Depth(KB)':'MW@Depth_KB_meters'}, inplace = True)

# production files: drop the elevation / depth columns
Eaglebine_files['Eaglebine_production'].drop(columns=['Elevation', 'Measured Depth (ft)'], inplace=True)
Duvernay_files['SPE_Duvernay_production'].drop(columns=['Elevation', 'Elevation Drill Floor (ft)', 'Elevation Ground (ft)', 'Elevation Kelly Bushing (ft)'], inplace=True)

# pressures files: drop the DST test detail columns listed below
Duvernay_files['Duvernay_Pressures'].drop(columns=['KB Elev (m)', 'DST Number', 'Formation DSTd',
       'DST Start Depth (TVD) (m)', 'DST End Depth (TVD) (m)',
       'DST Start Depth (MD) (m)', 'DST End Depth (MD) (m)', 'DST Test Date',
       'Test Type', 'DST Misrun', 'Misrun Problem Type', '1st Valve Open Time',
       '2nd Valve Open Time', '3rd Valve Open Time', '1st Shut-in Time',
       '2nd Shut-in Time', '3rd Shut-in Time', 'Pressure Recorder Depth (m)',
       'DST Bottom Hole Temp. (degC)', '1st Flow Pressure (kPa)',
       '2nd Flow Pressure (kPa)', '3rd Flow Pressure (kPa)',
       '1st Shut-in Pressure (kPa)', '1st Shut-in Initial Slope',
       '1st Shut-in Final Slope', '1st Shut-in Extrapolated Press (kPa)',
       '2nd Shut-in Pressure (kPa)', '2nd Shut-in Initial Slope',
       '2nd Shut-in Final Slope', '2nd Shut-in Extrapolated Press (kPa)',
       '3rd Shut-in Pressure (kPa)', '3rd Shut-in Initial Slope',
       '3rd Shut-in Final Slope', '3rd Shut-in Extrapolated Press (kPa)',
       'Maximum Shut-in Pressure (kPa)'], inplace=True)

### concatenating dataframes

In [6]:
# Every merge in this cell is index-on-index.
on_index = dict(left_index=True, right_index=True)

# pre merge: attach the combined temperatures to the master frame and
# outer-merge each basin's files into a single frame per basin
df = dfutil.multimerge([df, target_files['combined_temperature']], how='left', **on_index)
Eaglebine = dfutil.multimerge(Eaglebine_files, how='outer', **on_index)
Duvernay = dfutil.multimerge(Duvernay_files, how='outer', **on_index)

# fixing strange index behavior: rebuild every index as plain strings with a
# trailing '_' so the three frames share one label format
for frame in (Eaglebine, Duvernay, df):
    frame.index = [str(label) + '_' for label in frame.index]

# post merge: stack both basins (verify_integrity=True raises on duplicated
# labels) and left-merge the result onto the master frame
basins = pd.concat([Duvernay, Eaglebine], axis=0, join='outer', verify_integrity=True)
df = dfutil.multimerge([df, basins], how='left', **on_index)

# renaming index names to a common one
dfutil.renameIndex('UWI', df, inplace=True)

In [9]:
# Preview the first five rows of the merged master frame (head() defaults to n=5).
df.head()

Unnamed: 0_level_0,Set,source,BHT,TrueTemp,Depth_SS(m),Field,TD (ft),DST Start Depth (MD) (m),DST End Depth (MD) (m),DST Test Date,...,Mud Wt,MW@Depth_KB_meters,TD Date,Drilling Days,Liquid Injection Cum (bbl),Gas Injection Cum (mcf),WGS84Latitude,WGS84Longitude,SurfLat,SurfLong
UWI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42021301990000_,Validation_Testing,synthetic,73.333333,,,Eaglebine,6910.0,,,,...,10.2,2106.168,NaT,,,,30.011031,-97.127885,30.01083,-97.1276
100102606420W500_,Validation_Testing,synthetic,27.78,,697.2,Duvernay,,1484.4,1495.0,21850.0,...,,,NaT,,,,,,,
100141705519W500_,Validation_Testing,synthetic,83.33,,1348.9,Duvernay,,2488.4,2496.6,25813.0,...,,,NaT,,,,,,,
100141503621W400_,Validation_Testing,synthetic,48.89,,408.15,Duvernay,,1313.7,1318.0,21964.0,...,,,NaT,,,,,,,
100043406718W500_,Validation_Testing,synthetic,65.56,,1440.35,Duvernay,,2207.7,2222.0,23066.0,...,,,NaT,,,,,,,


### Creating master dataframe

## Zipping the prediction

In [None]:
import zipfile

# Package predictions.csv into predictions.zip.
# The `with` block closes the archive explicitly; the zip's central directory
# is only written on close, so leaving that to garbage collection risks an
# incomplete file.
with zipfile.ZipFile('predictions.zip', mode='w') as archive:
    archive.write("predictions.csv")