IMPORTING LIBRARIES

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

IMPORTING/CLEANING DATASET

In [None]:
#importing dataset from https://www.kaggle.com/datasets/tylerdurden99/production-data-for-oil-well (dataset includes time series data for one well)
# changing time series data to integers for WELL_ID col, assuming each row (initially one day) is a separate well just for learning purposes and to fit the dataset into a classification model
wells = pd.read_csv('Well_Data/oil_production.csv')

#keeping only first 1000 rows for simplicity
wells = wells.iloc[:1000]
#setting WELL_ID as index
wells.set_index('WELL_ID', inplace=True)
#dropping first column as it is not needed
wells=wells.drop(wells.columns[0], axis=1)

#Determining well productivity based on water cut
#water cut is the ratio of water volume to total volume (oil + gas + water)
#Set well as 'productive' if water cut is less than or equal to 50% of total bore fluid volume
wells['water cut'] = wells['BORE_WAT_VOL'] / (wells['BORE_GAS_VOL']/6000 + wells['BORE_WAT_VOL'] + wells['BORE_OIL_VOL'])
wells['Productive well'] = np.where(wells['water cut']>0.5, 'no', 'yes')  

wells

Unnamed: 0_level_0,AVG_DOWNHOLE_PRESSURE,AVG_DOWNHOLE_TEMPERATURE,AVG_DP_TUBING,AVG_CHOKE_SIZE_P,AVG_WHP_P,DP_CHOKE_SIZE,BORE_OIL_VOL,BORE_GAS_VOL,BORE_WAT_VOL,water cut,Productive well
WELL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,257.442241,105.339191,163.294586,35.298817,94.147654,61.049726,4535.43,649388.07,298.19,0.060340,yes
1,261.481233,105.364799,164.352071,34.697693,97.129162,65.804801,4379.88,629307.34,143.54,0.031014,yes
2,264.386001,105.410873,166.214057,34.779443,98.171944,64.987169,4509.07,638750.17,108.74,0.023017,yes
3,266.709086,105.395959,166.273529,34.054792,100.435557,67.330436,4319.02,612912.62,106.60,0.023544,yes
4,266.667137,105.410516,166.513059,34.395363,100.154078,66.986427,4417.66,625514.01,117.37,0.025299,yes
...,...,...,...,...,...,...,...,...,...,...,...
995,227.991229,102.380165,195.408040,100.000000,32.583188,3.205074,1095.90,172278.01,2707.10,0.706499,no
996,231.279309,102.336143,196.870912,100.000000,34.408397,4.993426,935.42,149159.79,2447.88,0.718241,no
997,228.276308,102.370433,195.643405,100.000000,32.632902,3.257175,1055.05,165692.63,2696.75,0.713536,no
998,228.368339,102.364361,195.731727,100.000000,32.636612,3.274347,1068.47,167934.29,2717.16,0.712489,no


TEST/TRAIN SPLIT

In [24]:
#TRIAL!
# Y = problematic well (>50% water cut or bore water volume is greater than 50% of total bore oil, gas and water volumes in this well) = dependent variable (target)
# X = all other columns (avg downhole temp, avg choke size, gas vol etc.) = independent variables (features)
# Splitting the dataset into features and target variable
X = wells.drop(['Productive well', 'water cut'], axis=1)
Y = wells['Productive well']
# Splitting the dataset into training and testing sets using sklearn (80% train, 20% test)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

CHOICE OF K

In [25]:
#k too low: inaccurate classification
#k too high: overfitting/ long processing time
#general guide: k = sqrt(n_samples), take closest odd integer

k = np.sqrt(len(wells)).astype(int)  # Calculate k as the square root of the number of samples
print(f"Recommended k value: {k}")

Recommended k value: 31


SCALING DATA

In [26]:
scaler = StandardScaler()
# Fit the scaler on the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

MODEL

In [30]:
model = KNeighborsClassifier(k,p=2, metric='euclidean')

model.fit(X_train_scaled, Y_train)



0,1,2
,n_neighbors,np.int64(31)
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'euclidean'
,metric_params,
,n_jobs,


In [33]:
Y_pred = model.predict(X_test_scaled)

EVAL MODEL  

In [None]:
#outputs f1 score ()
f1 = f1_score(Y_test, Y_pred, pos_label='yes')
f1

0.9264705882352942

In [36]:
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.95
