# Import

In [1]:
from pathlib import Path

import matplotlib
import numpy as np
import pandas as pd 
import pylab as plt
from numpy import random

In [2]:
# Load the raw data; 'NA', '?' and empty strings are parsed as missing values
df = pd.read_csv("../GHG_Emission.csv", na_values=['NA', '?', ''])

# Seed NumPy's global generator so the shuffle below is reproducible
np.random.seed(42)

# Shuffle the rows by reindexing on a random permutation of the existing index,
# then renumber them from 0
shuffled_index = np.random.permutation(df.index)
GHG = df.reindex(shuffled_index).reset_index(drop=True)

# Display top five rows
GHG.head()

Unnamed: 0,X Coordinate (km),Y Coordinate (km),Measured Depth (m),Deviation (deg),Abandoned (True/False),Surface-Casing Weight (kg/m),Production-Casing Size (mm),Cumulative GAS Prod. (e3m3),Month Well Spudded,Classification,Emission Rate (m3/day)
0,588.1,476.3,,,True,,177.8,32683.5,0.0,Non Serious,44.43268
1,62.4,666.9,491.7,,False,35.7,139.7,,0.0,Non Serious,29.998576
2,534.4,391.8,,15.690192,True,,177.8,32683.5,0.0,Serious,55.424137
3,298.7,583.0,,,True,35.7,139.7,32683.6,0.0,Serious,53.076994
4,513.8,434.9,2598.2,9.27331,False,,114.3,32683.6,843.0,Serious,50.506939


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
GHG.describe()

Unnamed: 0,X Coordinate (km),Y Coordinate (km),Measured Depth (m),Deviation (deg),Surface-Casing Weight (kg/m),Production-Casing Size (mm),Cumulative GAS Prod. (e3m3),Month Well Spudded,Emission Rate (m3/day)
count,1500.0,1500.0,1071.0,556.0,1140.0,1392.0,1148.0,1500.0,1500.0
mean,445.051933,489.247133,1421.464426,31.400825,41.92,144.07342,32683.537195,177.501533,50.161205
std,174.153002,218.796603,943.375914,18.960863,12.611025,26.083219,0.093194,248.301182,9.912002
min,10.9,1.4,158.6,0.231718,13.7,73.0,32683.3,0.0,11.750512
25%,339.3,368.1,674.15,14.642007,35.7,114.3,32683.5,0.0,43.476749
50%,509.25,494.0,1102.8,31.660038,35.7,139.7,32683.5,0.0,50.262507
75%,592.6,595.325,1912.9,44.343357,48.1,177.8,32683.6,321.0,56.689195
max,649.6,1186.1,5418.9,76.894237,107.2,244.5,32683.8,1264.0,78.843781


In [4]:
# Show each column's dtype and non-null count
GHG.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   X Coordinate (km)             1500 non-null   float64
 1   Y Coordinate (km)             1500 non-null   float64
 2   Measured Depth (m)            1071 non-null   float64
 3   Deviation (deg)               556 non-null    float64
 4   Abandoned (True/False)        1500 non-null   bool   
 5   Surface-Casing Weight (kg/m)  1140 non-null   float64
 6   Production-Casing Size (mm)   1392 non-null   float64
 7   Cumulative GAS Prod. (e3m3)   1148 non-null   float64
 8   Month Well Spudded            1500 non-null   float64
 9   Classification                1500 non-null   object 
 10  Emission Rate (m3/day)        1500 non-null   float64
dtypes: bool(1), float64(9), object(1)
memory usage: 118.8+ KB


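A quick look at the data shows many missing values: several columns (for example 'Deviation (deg)' with 556 and 'Measured Depth (m)' with 1071 non-null entries out of 1500) are incomplete.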

# Data Processing

## Removing Outliers

Emission Rate values outside the range $\mu \pm 2.5\sigma$ ($\mu$ = mean, $\sigma$ = standard deviation) are considered outliers and are removed.

In [5]:
def outlier_remove(df, n, name):
    """Delete, in place, the rows of df whose value in column `name` lies outside mean +/- n*sd.

    df  : Pandas dataframe; modified in place, and its index is reset to 0..len(df)-1
    n   : n in the equation 𝑚±𝑛𝜎
    name: Column name

    Returns None. Rows whose value is NaN are kept, because both comparisons
    against NaN evaluate to False.
    """
    # Renumber the rows first: the drop below is label-based, so with duplicate
    # index labels it would otherwise also delete non-outlier rows that share a
    # label with an outlier.
    df.reset_index(inplace=True, drop=True)

    values = df[name]
    mean = values.mean()  # Calculate mean of column
    sd = values.std()     # Calculate standard deviation of column
    lower, upper = mean - n * sd, mean + n * sd

    is_outlier = (values < lower) | (values > upper)
    df.drop(df.index[is_outlier], axis=0, inplace=True)
    df.reset_index(inplace=True, drop=True)  # Reset index

In [6]:
# Drop outliers in last column 'Emission Rate (m3/day)': rows outside mean ± 2.5 standard deviations.
# outlier_remove modifies GHG in place, so re-running this cell removes further rows.
outlier_remove(GHG, n=2.5,name='Emission Rate (m3/day)') 
GHG.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1478 entries, 0 to 1477
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   X Coordinate (km)             1478 non-null   float64
 1   Y Coordinate (km)             1478 non-null   float64
 2   Measured Depth (m)            1056 non-null   float64
 3   Deviation (deg)               552 non-null    float64
 4   Abandoned (True/False)        1478 non-null   bool   
 5   Surface-Casing Weight (kg/m)  1122 non-null   float64
 6   Production-Casing Size (mm)   1371 non-null   float64
 7   Cumulative GAS Prod. (e3m3)   1130 non-null   float64
 8   Month Well Spudded            1478 non-null   float64
 9   Classification                1478 non-null   object 
 10  Emission Rate (m3/day)        1478 non-null   float64
dtypes: bool(1), float64(9), object(1)
memory usage: 117.0+ KB


# Regression

We drop the 'Classification' column, which is not used as a feature for regression.

In [7]:
# Remove the 'Classification' column from the frame
GHG = GHG.drop(columns=['Classification'])

## Split the data

The data is divided into an 80/20 split of training and test sets.

In [8]:
# Hold out 20% of the rows as the test set (fixed random_state for reproducibility)
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(GHG, test_size=0.2, random_state=42)

# Renumber both subsets from 0
train_set = train_set.reset_index(drop=True)
test_set = test_set.reset_index(drop=True)

# Separate the attributes from the regression target
x_train = train_set.drop(columns=["Emission Rate (m3/day)"])
y_train = train_set["Emission Rate (m3/day)"]
x_test = test_set.drop(columns=["Emission Rate (m3/day)"])
y_test = test_set["Emission Rate (m3/day)"]

x_train.head()

Unnamed: 0,X Coordinate (km),Y Coordinate (km),Measured Depth (m),Deviation (deg),Abandoned (True/False),Surface-Casing Weight (kg/m),Production-Casing Size (mm),Cumulative GAS Prod. (e3m3),Month Well Spudded
0,384.8,230.3,959.4,,True,41.6,139.7,32683.5,0.0
1,294.3,548.7,2504.7,47.479024,False,,177.8,32683.6,0.0
2,499.0,83.8,569.5,,True,35.7,114.3,32683.5,174.7
3,497.6,79.1,2322.3,50.696513,True,,177.8,32683.6,469.0
4,515.8,153.2,2480.9,9.6091,False,53.6,177.8,32683.6,0.0


In [27]:
# Save training dataframe for visualization.
# to_csv does not create missing directories, so make sure ./Data exists first.
Path("./Data").mkdir(parents=True, exist_ok=True)
pd.concat([x_train, y_train], axis=1).to_csv("./Data/regression_train_vis.csv", index=False)

## Imputation 

In [9]:
from sklearn.impute import SimpleImputer

# Imputer configured with the "median" strategy for filling missing values
imput_mdn = SimpleImputer(strategy="median") 

In [10]:
# Numerical attributes to impute: everything except the boolean 'Abandoned' flag
numcolumns = x_train.drop(columns=['Abandoned (True/False)'])

In [11]:
# Fit the imputer on the training columns and fill their missing values
x_train_im = imput_mdn.fit_transform(numcolumns)

In [12]:
# Wrap the imputed values in a DataFrame that reuses the original column names,
# then confirm that no missing values remain
x_train_im = pd.DataFrame(x_train_im,columns=numcolumns.columns)
x_train_im.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1182 entries, 0 to 1181
Data columns (total 8 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   X Coordinate (km)             1182 non-null   float64
 1   Y Coordinate (km)             1182 non-null   float64
 2   Measured Depth (m)            1182 non-null   float64
 3   Deviation (deg)               1182 non-null   float64
 4   Surface-Casing Weight (kg/m)  1182 non-null   float64
 5   Production-Casing Size (mm)   1182 non-null   float64
 6   Cumulative GAS Prod. (e3m3)   1182 non-null   float64
 7   Month Well Spudded            1182 non-null   float64
dtypes: float64(8)
memory usage: 74.0 KB


In [13]:
# Impute the test set with the imputer already fitted on the training set
x_test_num = x_test.drop(columns=['Abandoned (True/False)'])
x_test_im = pd.DataFrame(imput_mdn.transform(x_test_num), columns=numcolumns.columns)

## Text Handling

In [14]:
from sklearn.preprocessing import OneHotEncoder

# Encoder that expands each category into its own column of 1s and 0s
cat_encoder = OneHotEncoder()

In [15]:
# Select the categorical column to one-hot encode (double brackets keep it a DataFrame)
txtcolumns = x_train[['Abandoned (True/False)']]

In [16]:
# Fit the encoder on the training column and convert the encoded result to a dense array
x_train_text = cat_encoder.fit_transform(txtcolumns).toarray()
x_train_text[:10]

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [17]:
# Encode the test column with the encoder fitted on the training data
abandoned_test = x_test[['Abandoned (True/False)']]
x_test_text = cat_encoder.transform(abandoned_test).toarray()

## Standardization

In [18]:
from sklearn.preprocessing import StandardScaler

# The columns are in different units, so the data must be standardized
scaler = StandardScaler()

In [19]:
# Fit the scaler on the imputed training data and transform it; show the first five rows
x_train_std = scaler.fit_transform(x_train_im)
x_train_std[0:5]

array([[-0.33088007, -1.16981778, -0.47024137,  0.01210518,  0.09920808,
        -0.1681966 , -0.3329649 , -0.71545354],
       [-0.84894367,  0.28161131,  1.41736806,  1.31755322, -0.43595384,
         1.33936633,  0.8669269 , -0.71545354],
       [ 0.32285323, -1.83763927, -0.94651065,  0.01210518, -0.43595384,
        -1.17323855, -0.3329649 , -0.02162736],
       [ 0.31483899, -1.85906426,  1.19456345,  1.58942698, -0.43595384,
         1.33936633,  0.8669269 ,  1.14719372],
       [ 0.41902415, -1.52127879,  1.38829596, -1.88240715,  1.18767302,
         1.33936633,  0.8669269 , -0.71545354]])

In [20]:
# Transform the test data with the scaler fitted on the training data
x_test_std = scaler.transform(x_test_im)

## Concatenate

In [21]:
# Assemble the training matrix column-wise: the first four scaled numeric columns,
# then the one-hot 'Abandoned' columns, then the remaining scaled numeric columns
x_train_reg = np.hstack([x_train_std[:, :4], x_train_text, x_train_std[:, 4:]])
x_train_reg[:5]

array([[-0.33088007, -1.16981778, -0.47024137,  0.01210518,  0.        ,
         1.        ,  0.09920808, -0.1681966 , -0.3329649 , -0.71545354],
       [-0.84894367,  0.28161131,  1.41736806,  1.31755322,  1.        ,
         0.        , -0.43595384,  1.33936633,  0.8669269 , -0.71545354],
       [ 0.32285323, -1.83763927, -0.94651065,  0.01210518,  0.        ,
         1.        , -0.43595384, -1.17323855, -0.3329649 , -0.02162736],
       [ 0.31483899, -1.85906426,  1.19456345,  1.58942698,  0.        ,
         1.        , -0.43595384,  1.33936633,  0.8669269 ,  1.14719372],
       [ 0.41902415, -1.52127879,  1.38829596, -1.88240715,  1.        ,
         0.        ,  1.18767302,  1.33936633,  0.8669269 , -0.71545354]])

In [22]:
# Assemble the test matrix column-wise: the first four scaled numeric columns,
# then the one-hot 'Abandoned' columns, then the remaining scaled numeric columns
x_test_reg = np.hstack([x_test_std[:, :4], x_test_text, x_test_std[:, 4:]])

# Save Regression Data

In [23]:
# Column labels for the processed training matrix, in the order it was concatenated
train_reg_columns = [
    'X Coordinate (km)', 'Y Coordinate (km)', 'Measured Depth (m)',
    'Deviation (deg)', 'Abandoned (False)', 'Abandoned (True)',
    'Surface-Casing Weight (kg/m)', 'Production-Casing Size (mm)',
    'Cumulative GAS Prod. (e3m3)', 'Month Well Spudded',
]

# Combine the labelled training attributes with the target into one dataframe
x_train_reg_df = pd.DataFrame(x_train_reg, columns=train_reg_columns)
train_reg = pd.concat([x_train_reg_df, y_train], axis=1)
train_reg

Unnamed: 0,X Coordinate (km),Y Coordinate (km),Measured Depth (m),Deviation (deg),Abandoned (False),Abandoned (True),Surface-Casing Weight (kg/m),Production-Casing Size (mm),Cumulative GAS Prod. (e3m3),Month Well Spudded,Emission Rate (m3/day)
0,-0.330880,-1.169818,-0.470241,0.012105,0.0,1.0,0.099208,-0.168197,-0.332965,-0.715454,42.620033
1,-0.848944,0.281611,1.417368,1.317553,1.0,0.0,-0.435954,1.339366,0.866927,-0.715454,51.237935
2,0.322853,-1.837639,-0.946511,0.012105,0.0,1.0,-0.435954,-1.173239,-0.332965,-0.021627,31.671919
3,0.314839,-1.859064,1.194563,1.589427,0.0,1.0,-0.435954,1.339366,0.866927,1.147194,50.264058
4,0.419024,-1.521279,1.388296,-1.882407,1.0,0.0,1.187673,1.339366,0.866927,-0.715454,70.277448
...,...,...,...,...,...,...,...,...,...,...,...
1177,0.950826,0.115226,-0.506887,0.012105,0.0,1.0,0.688793,1.339366,0.866927,-0.715454,52.964451
1178,-0.248448,0.063715,-0.286281,0.012105,0.0,1.0,-0.435954,1.339366,-0.332965,-0.715454,43.555976
1179,1.105959,-0.370711,-0.286281,0.695828,0.0,1.0,-0.435954,-1.173239,0.866927,-0.715454,62.929801
1180,0.965710,0.469422,-0.286281,0.012105,0.0,1.0,-1.379290,-1.173239,-0.332965,-0.715454,58.452586


In [24]:
# Save training data to a csv.
# to_csv does not create missing directories, so make sure ./Data exists first.
Path("./Data").mkdir(parents=True, exist_ok=True)
train_reg.to_csv("./Data/regression_train.csv", index=False)

In [25]:
# Column labels for the processed test matrix, in the order it was concatenated
test_reg_columns = [
    'X Coordinate (km)', 'Y Coordinate (km)', 'Measured Depth (m)',
    'Deviation (deg)', 'Abandoned (False)', 'Abandoned (True)',
    'Surface-Casing Weight (kg/m)', 'Production-Casing Size (mm)',
    'Cumulative GAS Prod. (e3m3)', 'Month Well Spudded',
]

# Combine the labelled test attributes with the target into one dataframe
x_test_reg_df = pd.DataFrame(x_test_reg, columns=test_reg_columns)
test_reg = pd.concat([x_test_reg_df, y_test], axis=1)

In [26]:
# Save test data to a csv.
# to_csv does not create missing directories, so make sure ./Data exists first.
Path("./Data").mkdir(parents=True, exist_ok=True)
test_reg.to_csv("./Data/regression_test.csv", index=False)