# EDA Datathon2021 by Softypo

In [2]:
# Importing module libraries
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
from tqdm.notebook import tqdm

## Data loading

In [83]:
# Global files
structured_data = pd.read_csv('StructuredData/combined_temperature.csv', index_col=0, usecols=range(1,7))
set_assign = pd.read_csv('Data for Datathon\set_assign.csv', index_col=0)
Data_static_logs = pd.read_csv('Data for Datathon\Data_static_logs.csv', index_col=0)
# Duvernay files
Duvernay_Well_Headers = pd.read_excel('Data for Datathon\Duvernay\Duvernay\Duvernay well headers SPE April 21 2021 .xlsx', index_col=0)
Duvernay_DST_BHT = pd.read_excel('Data for Datathon\Duvernay\Duvernay\Duvernay DST BHT for SPE April 20 2021.xlsx', index_col=0)
# Eaglebine files
Eaglebine_Well_Headers = pd.read_excel('Data for Datathon\Eaglebine\Eaglebine\Eaglebine well headers SPE April 21 2021.xlsx', index_col=3)

### Creating master dataframe

In [87]:
# standardising names
Duvernay_Well_Headers.rename(columns={'TD meters': 'TD_meters'})

Unnamed: 0_level_0,Elevation Meters,ElevationDatum,TD meters,SurfaceLatitude_NAD83,SurfaceLongitude_NAD83,BottomLatitude_NAD83,BottomLongitude_NAD83,SurfaceLatitude_NAD27,SurfaceLongitude_NAD27,BottomLatitude_NAD27,BottomLongitude_NAD27
UWI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100072507623W500,555.700000,KB,2545.00000,55.613640,-117.423800,55.613640,-117.423800,55.613640,-117.423800,55.613640,-117.423800
100062007823W500,560.900000,KB,2430.00000,55.770872,-117.537674,55.770872,-117.537674,55.770872,-117.537674,55.770872,-117.537674
100052107723W500,558.400000,KB,2515.50000,55.683723,-117.516168,55.683723,-117.516168,55.683723,-117.516168,55.683723,-117.516168
100061307518W500,631.400000,KB,2411.00000,55.494600,-116.653606,55.494600,-116.653606,55.494600,-116.653606,55.494600,-116.653606
100070907618W500,627.000000,KB,2408.00000,55.566965,-116.726243,55.566965,-116.726243,55.566965,-116.726243,55.566965,-116.726243
...,...,...,...,...,...,...,...,...,...,...,...
100013603712W500,1309.463760,KB,1352.44197,52.216665,-115.579316,52.217223,-115.578140,52.216665,-115.579316,52.217223,-115.578140
100101105718W500,1319.223776,KB,3862.55232,53.914953,-116.559741,53.914953,-116.559741,53.914953,-116.559741,53.914953,-116.559741
100112004213W500,1326.310609,KB,4717.23956,52.633129,-115.838137,52.633602,-115.837674,52.633129,-115.838137,52.633602,-115.837674
100142103711W500,1508.678432,KB,5175.67380,52.199202,-115.517529,52.200163,-115.517398,52.199202,-115.517529,52.200163,-115.517398


In [77]:
#dropping redundant and/or useless columns
Duvernay_Well_Headers.drop(columns=['ElevationDatum', 'SurfaceLatitude_NAD27', 'SurfaceLongitude_NAD27', 'BottomLatitude_NAD27', 'BottomLongitude_NAD27'], inplace=True)
Eaglebine_Well_Headers.drop(columns=['ElevationDatum', 'WGS84Latitude', 'WGS84Longitude', 'SurfLat', 'SurfLong', 'SurfaceLatitude_NAD27', 'SurfaceLongitude_NAD27', 'BottomLatitude_NAD27', 'BottomLongitude_NAD27'], inplace=True)


In [None]:
# standardising units to metric

In [None]:
['UWI', 'Elevation Meters', 'ElevationDatum', 'TD meters', 'SurfaceLatitude_NAD83', 'SurfaceLongitude_NAD83', 'BottomLatitude_NAD83', 'BottomLongitude_NAD83', 'SurfaceLatitude_NAD27', 'SurfaceLongitude_NAD27', 'BottomLatitude_NAD27', 'BottomLongitude_NAD27']

In [None]:
['td', 'Elevation', 'ElevationDatum', 'displayapi', 'WGS84Latitude', 'WGS84Longitude', 'SurfLat', 'SurfLong', 'SurfaceLatitude_NAD83', 'SurfaceLongitude_NAD83', 'BottomLatitude_NAD83', 'BottomLongitude_NAD83', 'SurfaceLatitude_NAD27', 'SurfaceLongitude_NAD27', 'BottomLatitude_NAD27', 'BottomLongitude_NAD27']

In [61]:
# merging Duvernay and Eaglebine files
Well_Headers = pd.merge(Duvernay_Well_Headers, Eaglebine_Well_Headers, how='outer')

In [49]:
# joining wells features dataframes and structured_data provided
df = set_assign.join(structured_data)

#
df = df.join(Well_Headers, rsuffix='WH')

# creating static/synthetic cathegory
df['source'] = pd.Series(['static' if ~np.isnan(row) else 'synthetic' for row in set_assign.join(Data_static_logs)['Temp (degC)']], index=df.index, dtype="category")

In [50]:
df

Unnamed: 0_level_0,Set,BHT,TrueTemp,Depth_SS(m),Field,TD (ft),source
UWI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
42021301990000,Validation_Testing,73.333333,,,Eaglebine,6910.0,synthetic
100102606420W500,Validation_Testing,27.780000,,697.20,Duvernay,,synthetic
100141705519W500,Validation_Testing,83.330000,,1348.90,Duvernay,,synthetic
100141503621W400,Validation_Testing,48.890000,,408.15,Duvernay,,synthetic
100043406718W500,Validation_Testing,65.560000,,1440.35,Duvernay,,synthetic
...,...,...,...,...,...,...,...
100101606423W500,Training,47.220000,56.340000,683.70,Duvernay,,synthetic
42177309850000,Training,74.444444,96.655556,,Eaglebine,7035.0,synthetic
100100805919W500,Training,69.440000,73.050000,1110.40,Duvernay,,synthetic
100110905226W400,Training,51.670000,55.420000,651.30,Duvernay,,synthetic


In [11]:
df['Depth (m)'] = df['Depth (ft)']*0.3048

## Zipping the prediction

In [None]:
import zipfile
zipfile.ZipFile('predictions.zip', mode='w').write("predictions.csv")