# Q5: Can we accurately predict lithology from wireline well-log data?
    a. Train machine learning models on the wireline well-log data.
    b. Compare the accuracy of the models.

**Purpose of the question**: classify the lithology of each layer in the well in order to locate the sandstone intervals.

In [1]:
import warnings

import lasio
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from utils import *


warnings.filterwarnings("ignore")

In [2]:
# Location of the labelled well-log table, relative to the working directory.
LABELED_LOGS_CSV = "./Data/labeled_logs.csv"

df = pd.read_csv(LABELED_LOGS_CSV)

# Both helpers are called only for their effect on `df` (their return values
# are discarded), so they presumably modify the frame in place: first the
# column renaming, then the resistivity -> log10 conversion.
for prepare in (rename_columns, resistivity_to_log10):
    prepare(df)

In [3]:
# Columns excluded from the modelling table. Each label is listed exactly once
# (the original list named My_Columns.WELL twice).
dropped_columns = [
    My_Columns.FORMATION,
    My_Columns.GROUP,
    My_Columns.WELL,
    My_Columns.X_LOC,
    My_Columns.Y_LOC,
    My_Columns.Z_LOC,
    My_Columns.CONFIDENCE,
    My_Columns.SPECTRA_GAMMA_RAY,
    My_Columns.AVERAGE_RATE_OF_PENETRATION,
    My_Columns.MICRO_RESISTIVITY,
    My_Columns.WEIGHT_OF_DRILLING_MUD,
    My_Columns.FLUSHED_ZONE_RESISTIVITY,
    My_Columns.DIFFERENTIAL_CALIPER,
    My_Columns.SHEAR_WAVE_SONIC,
]
df = df.drop(columns=dropped_columns)

# Show how many values are missing per remaining column before the incomplete
# rows are discarded. A bare expression in the middle of a cell is never
# displayed, so the counts are printed explicitly.
print(df.isnull().sum())

# Keep only the rows that have a value in every remaining column.
df = df.dropna()

In [4]:
# Display the remaining column labels together with how many there are.
df.columns, df.shape[1]


(Index(['Measured Depth', 'Caliper', 'Shallow Resistivity',
        'Medium Deep Resistivity', 'Deep Resistivity', 'Bulk Density',
        'Gamma Ray', 'Neutron Porosity', 'Photo Electric Factor',
        'Compressional waves sonic', 'Self Potential', 'Borehole Size',
        'Rate of Penetration', 'Density Correction', 'LITHOLOGY'],
       dtype='object'),
 15)

In [5]:
# Prepare the LITHOLOGY target column with two helpers that are not defined in
# this notebook (presumably provided by `from utils import *`). Their return
# values are discarded, so they presumably modify `df` in place - TODO confirm.
# The calls are kept in this order: rename first, then convert to classes.
rename_lithology(df)
# NOTE(review): the later `train_Y.head()` output shows LITHOLOGY as int64, so
# this step presumably maps each lithology to an integer class id - confirm.
lithology_to_class(df)

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible. `train_test_split` is imported in the notebook's import cell.
# NOTE(review): this is a row-wise random split. The well identifier was
# dropped earlier and 'Measured Depth' is kept as a feature, so samples from
# the same well and neighbouring depths can land on both sides of the split;
# the reported accuracies may therefore be optimistic for unseen wells.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Separate the feature columns from the lithology target for each partition.
train_X = train.drop(columns=[My_Columns.LITHOLOGY])
train_Y = train[My_Columns.LITHOLOGY]

test_X = test.drop(columns=[My_Columns.LITHOLOGY])
test_Y = test[My_Columns.LITHOLOGY]


In [7]:
train_X.head()


Unnamed: 0,Measured Depth,Caliper,Shallow Resistivity,Medium Deep Resistivity,Deep Resistivity,Bulk Density,Gamma Ray,Neutron Porosity,Photo Electric Factor,Compressional waves sonic,Self Potential,Borehole Size,Rate of Penetration,Density Correction
409931,1332.5424,12.65782,-0.063526,-0.131539,-0.209502,2.116763,91.90493,0.995263,10.661796,2.149468,-233.182571,12.250001,37.471474,-0.004245
306235,3439.112,9.238626,0.832998,0.658911,0.686044,2.565505,91.702042,0.264772,8.612202,1.933798,240.597031,8.5,35.261501,0.143808
463860,3083.552199,9.093799,0.556097,0.522212,0.346094,2.523961,110.023872,0.385514,3.721776,1.961045,60.566254,8.5,4.761444,0.127706
1195248,1413.5664,12.468761,0.009207,-0.018151,-0.008808,2.188227,99.009926,0.418006,5.115295,2.14938,-107.0,12.250001,34.179344,-0.053228
1197374,1736.7184,12.386744,-0.138125,-0.223501,-0.214643,2.201126,73.900574,0.464747,3.182544,2.148671,-97.545929,12.250001,44.669052,-0.063741


In [8]:
train_Y.head()


409931     4
306235     4
463860     4
1195248    4
1197374    4
Name: LITHOLOGY, dtype: int64

In [9]:
# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same transformation to both partitions so no test-set
# statistics leak into the scaling.
scaler = StandardScaler().fit(train_X)
train_X = scaler.transform(train_X)
test_X = scaler.transform(test_X)


In [10]:
# Baseline: logistic regression with scikit-learn's default settings.
# NOTE(review): warnings are globally ignored in the import cell, so a
# convergence warning from this fit would not be visible - worth checking.
lr = LogisticRegression().fit(train_X, train_Y)

# Score the held-out rows.
lr_pred = lr.predict(test_X)
print("Accuracy: ", accuracy_score(test_Y, lr_pred))


Accuracy:  0.8645216727133855


In [11]:
# Support-vector classifier with an RBF kernel and regularisation parameter
# C=1000, trained on the standardised features.
svc = SVC(C=1000, kernel="rbf")
svc.fit(train_X, train_Y)

# Score the held-out rows.
svc_pred = svc.predict(test_X)
print("Accuracy: ", accuracy_score(test_Y, svc_pred))


Accuracy:  0.9558907771624976


In [12]:
# Random forest of 100 trees. The forest is randomised, so without a seed the
# accuracy printed below changes from run to run; `random_state` is fixed to
# the same value used for the train/test split to keep the result repeatable.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_X, train_Y)

# Score the held-out rows.
print("Accuracy: ", accuracy_score(test_Y, rf.predict(test_X)))


Accuracy:  0.9633377888103877
