## "AIgators" team submission for SPE GCS ML Competition
## by Jamal Ahmadov, Cristina Mariana Ruse, Rodrigue Rizk
## University of Louisiana at Lafayette

In [1]:
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
import lasio

In [2]:
# Change directory
os.chdir("C:/Users/Jamal/Downloads/ML SPE competition-take home/ML Challenge Data - modified - techlog")
# Create a list with all the files
path = os.getcwd()
files = os.listdir(path)
# Select only xlsx files
files_las = [f for f in files if f[-3:] == "las"]

## All logs with well names appended

In [3]:
lst=[]
lst2=[]

# Loop over list of Excel files
for f in files_las: 
    las=lasio.read(f)
    df=las.df()
    df.reset_index(inplace=True)
    lst.append(df)
    
    well_name = pd.DataFrame([f] * df.shape[0])
    lst2.append(well_name)
    
df2=pd.concat(lst,axis=0)
df_names=pd.concat(lst2,axis=0)

df_wells=pd.concat([df2,df_names],axis=1)
df_wells.rename(columns={0:"Well name"},inplace=True)

## Coordinates and well names

In [4]:
Lat=[]

# Loop over list of Excel files
for f in files_las:
    
    las=lasio.read(f)
    Lat.append({'Latitude':las.well['SLAT'].value,'Longtitude':las.well['SLON'].value,'Well name':f})

df_cord=pd.DataFrame(Lat)
df_cord

Unnamed: 0,Latitude,Longtitude,Well name
0,1.678958,6.390539,0052442d0162_TGS.las
1,3.680670,7.095404,00a60e5cc262_TGS.las
2,2.763908,9.104955,01c726e0fabe_TGS.las
3,2.299873,5.807630,02571837c35f_TGS.las
4,1.404772,8.618104,03d4fc789db8_TGS.las
...,...,...,...
229,3.361701,7.688234,fb82f07561bd_TGS.las
230,5.515475,2.537780,fc913430daa9_TGS.las
231,2.048852,8.203397,fcd64679cafa_TGS.las
232,6.216308,3.438106,fe47e0c3ac55_TGS.las


## All mnemonics

In [10]:
clmn=df2.columns
clmn_df=pd.DataFrame(clmn)
clmn_df.columns=['Property']
clmn_df.sort_values('Property',inplace=True)
clmn_df.reset_index(drop=True,inplace=True)

## All curves description with well names appended

In [12]:
logs=[]
a_list = []
b_list = []
c_list = []
Lat=[]
Lon=[]
w_list=[]

for f in files_las: 
    las=lasio.read(f)
    colnum=las.keys()
    logs=las.curves
    for i in range(0,len(colnum)):
        a=logs[i]['descr']
        a_list.append(a)
        b=logs[i]['unit']
        b_list.append(b)
        c=logs[i]['mnemonic']
        c_list.append(c)
        Lat.append(las.well['SLAT'].value)
        Lon.append(las.well['SLON'].value)
        w_list.append(f)

df_curves = pd.DataFrame({'Mnemonic': c_list, 'Unit': b_list,'Description': a_list,'Latitude': Lat,'Longtitude': Lon,
                               'Well name': w_list})
df_curves

Unnamed: 0,Mnemonic,Unit,Description,Latitude,Longtitude,Well name
0,DEPT,F,1 MEASURED DEPTH,1.678958,6.390539,0052442d0162_TGS.las
1,DTCO,US/F,DELTA-T COMPRESSIONAL,1.678958,6.390539,0052442d0162_TGS.las
2,GRS,GAPI,GAMMA RAY FROM SONIC LOG,1.678958,6.390539,0052442d0162_TGS.las
3,DTSM,US/F,DELTA-T SHEAR,1.678958,6.390539,0052442d0162_TGS.las
4,DEPT,ft,,3.680670,7.095404,00a60e5cc262_TGS.las
...,...,...,...,...,...,...
3275,RHOZ,G/C3,HRDD STANDARD RESOLUTION FORMATION DENSITY,2.824751,8.884746,fe8ab5538224_TGS.las
3276,RLA3,OHMM,APPARENT RESISTIVITY FROM COMPUTED FOCUSING MO...,2.824751,8.884746,fe8ab5538224_TGS.las
3277,RLA5,OHMM,APPARENT RESISTIVITY FROM COMPUTED FOCUSING MO...,2.824751,8.884746,fe8ab5538224_TGS.las
3278,RXOZ,OHMM,INVADED FORMATION RESISTIVITY FILTERED AT 18 I...,2.824751,8.884746,fe8ab5538224_TGS.las


## Function to select only logs needed

In [13]:
def log_count(df,name):
    prop=df.groupby(['Latitude', 'Longtitude']).size().rename(name+'_count').sort_values(ascending=False)
    df_f=pd.DataFrame(prop).reset_index()
    return df_f

## Resistivity

In [14]:
comp_df_res=df_curves[(df_curves['Mnemonic']=='AE90') | (df_curves['Mnemonic']=='AF90') | (df_curves['Mnemonic']=='AHT90') 
                 | (df_curves['Mnemonic']=='AO90') | (df_curves['Mnemonic']=='AST90') | (df_curves['Mnemonic']=='AT90')
                 | (df_curves['Mnemonic']=='HLLD') | (df_curves['Mnemonic']=='HRID') | (df_curves['Mnemonic']=='IDPH')
                 | (df_curves['Mnemonic']=='ILD')| (df_curves['Mnemonic']=='LLD') | (df_curves['Mnemonic']=='LLD_R')
                 | (df_curves['Mnemonic']=='RILD') | (df_curves['Mnemonic']=='RLA5') | (df_curves['Mnemonic']=='TBIT90')]

## Neutron

In [15]:
comp_df_neut=df_curves[(df_curves['Mnemonic']=='APLCLS') | (df_curves['Mnemonic']=='APLC_LS') | (df_curves['Mnemonic']=='CNC') 
                 | (df_curves['Mnemonic']=='CNC_LS') | (df_curves['Mnemonic']=='CNPOR_LS') | (df_curves['Mnemonic']=='TNPH_LIM')
                 | (df_curves['Mnemonic']=='TNPH_LS') | (df_curves['Mnemonic']=='TPHI_LS') | (df_curves['Mnemonic']=='SNP')
                 | (df_curves['Mnemonic']=='NPOR')| (df_curves['Mnemonic']=='NPORLS') | (df_curves['Mnemonic']=='NPOR_LS')
                 | (df_curves['Mnemonic']=='NPHS') | (df_curves['Mnemonic']=='NPHI_LS') | (df_curves['Mnemonic']=='NPHILS')
                | (df_curves['Mnemonic']=='NPHI') | (df_curves['Mnemonic']=='ENPH_LS')| (df_curves['Mnemonic']=='TNPH')]

## Density porosity

In [16]:
comp_df_densp=df_curves[(df_curves['Mnemonic']=='DPHI') | (df_curves['Mnemonic']=='DPHILS') | (df_curves['Mnemonic']=='DPHI_LS') 
                 | (df_curves['Mnemonic']=='DPHI_SLDT') | (df_curves['Mnemonic']=='DPHZ') | (df_curves['Mnemonic']=='DPHZLS')
                 | (df_curves['Mnemonic']=='DPHZ_LS') | (df_curves['Mnemonic']=='DPO') | (df_curves['Mnemonic']=='DPOR')
                 | (df_curves['Mnemonic']=='DPOR_LS')| (df_curves['Mnemonic']=='DPO_LS') | (df_curves['Mnemonic']=='PHND_LS')
                 | (df_curves['Mnemonic']=='PORZ_LS')]

## Density

In [17]:
comp_df_dens=df_curves[(df_curves['Mnemonic']=='NRHO') | (df_curves['Mnemonic']=='RHOB') | (df_curves['Mnemonic']=='RHOB_SLDT') 
                 | (df_curves['Mnemonic']=='RHOZ') | (df_curves['Mnemonic']=='ZDEN') | (df_curves['Mnemonic']=='RHOM')]

## Gamma ray

In [18]:
comp_df_gr=df_curves[(df_curves['Mnemonic']=='ECGR') | (df_curves['Mnemonic']=='ECGRD') | (df_curves['Mnemonic']=='ECGREDTC') 
                 | (df_curves['Mnemonic']=='ECGRR') | (df_curves['Mnemonic']=='ECGRS') | (df_curves['Mnemonic']=='GR')
                | (df_curves['Mnemonic']=='GRC') | (df_curves['Mnemonic']=='GRD') | (df_curves['Mnemonic']=='GRD1')
                | (df_curves['Mnemonic']=='GRN') | (df_curves['Mnemonic']=='GRR') | (df_curves['Mnemonic']=='GRS')
                | (df_curves['Mnemonic']=='GRT') | (df_curves['Mnemonic']=='GR_EDTC') | (df_curves['Mnemonic']=='GR_STGC')
                | (df_curves['Mnemonic']=='HCGR') | (df_curves['Mnemonic']=='HCGRD') | (df_curves['Mnemonic']=='HCGRR')
                | (df_curves['Mnemonic']=='HCGRS') | (df_curves['Mnemonic']=='HGRT') | (df_curves['Mnemonic']=='HSGR')
                | (df_curves['Mnemonic']=='HSGRD') | (df_curves['Mnemonic']=='HSGRR') | (df_curves['Mnemonic']=='HSGRS')
                    | (df_curves['Mnemonic']=='SGRDD') | (df_curves['Mnemonic']=='SGRS')]

## Photoelectric

In [19]:
comp_df_pe=df_curves[(df_curves['Mnemonic']=='PE') | (df_curves['Mnemonic']=='PEF') | (df_curves['Mnemonic']=='PEFL') 
                 | (df_curves['Mnemonic']=='PEFS') | (df_curves['Mnemonic']=='PEFZ') | (df_curves['Mnemonic']=='PEF_SLDT')]

## Compressional

In [20]:
comp_df_compr=df_curves[(df_curves['Mnemonic']=='DTCO') | (df_curves['Mnemonic']=='DTLF') | (df_curves['Mnemonic']=='DT')
                       | (df_curves['Mnemonic']=='DT4P') | (df_curves['Mnemonic']=='DTCM')]

## Shear

In [21]:
comp_df_shear=df_curves[(df_curves['Mnemonic']=='DTSM')]

In [22]:
df_curves.sort_values(by=['Latitude'])[3185:3205]

Unnamed: 0,Mnemonic,Unit,Description,Latitude,Longtitude,Well name
1275,AT10,OHMM,ARRAY INDUCTION TWO FOOT RESISTIVITY A10,6.725493,3.908037,734da1169c53_TGS.las
327,NPHI_LS,DEC,NEUTRON POROSITY LIMESTONE,6.788468,3.401863,19ed10214869_TGS.las
326,DEPT,F,1 MEASURED DEPTH,6.788468,3.401863,19ed10214869_TGS.las
328,DT,US/F,DELTA-T (INTERVAL TRANSIT TIME),6.788468,3.401863,19ed10214869_TGS.las
331,DT1,US/F,DELTA-T SHEAR - LOWER DIPOLE,6.788468,3.401863,19ed10214869_TGS.las
330,GRS,GAPI,GAMMA RAY FROM SONIC LOG,6.788468,3.401863,19ed10214869_TGS.las
329,DTSM,US/F,DELTA-T SHEAR,6.788468,3.401863,19ed10214869_TGS.las
332,DTCO,US/F,DELTA-T COMPRESSIONAL,6.788468,3.401863,19ed10214869_TGS.las
681,TNPH_LS,DEC,THERMAL NEUTRON POROSITY [LIMESTONE],6.930438,3.417693,4460f43fb0fd_TGS.las
680,TENS,LBF,TENSION FROM SONIC LOG,6.930438,3.417693,4460f43fb0fd_TGS.las


## Caliper

In [23]:
comp_df_calip=df_curves[(df_curves['Mnemonic']=='CAL1R') | (df_curves['Mnemonic']=='CALD') | (df_curves['Mnemonic']=='CALI') 
                 | (df_curves['Mnemonic']=='CALI_SPCS') | (df_curves['Mnemonic']=='CALR') | (df_curves['Mnemonic']=='CALS')
                | (df_curves['Mnemonic']=='CALSR') | (df_curves['Mnemonic']=='CALSR_R') | (df_curves['Mnemonic']=='CALX')
                | (df_curves['Mnemonic']=='HCAL') | (df_curves['Mnemonic']=='HCALD') | (df_curves['Mnemonic']=='HCALR')
               | (df_curves['Mnemonic']=='HCALS') | (df_curves['Mnemonic']=='HD') | (df_curves['Mnemonic']=='HD1')
                | (df_curves['Mnemonic']=='HD1_PPC1') | (df_curves['Mnemonic']=='HD2_PPC2') | (df_curves['Mnemonic']=='LCAL')
                | (df_curves['Mnemonic']=='LCALD') | (df_curves['Mnemonic']=='LCALR') | (df_curves['Mnemonic']=='C1')
                       | (df_curves['Mnemonic']=='C1S')]

## Density correction

In [24]:
comp_df_denscorr=df_curves[(df_curves['Mnemonic']=='HDRA') | (df_curves['Mnemonic']=='DRHO') | (df_curves['Mnemonic']=='DRH')
                          | (df_curves['Mnemonic']=='ZCOR') | (df_curves['Mnemonic']=='DCOR') | (df_curves['Mnemonic']=='CORR')]

## Merging well logs

In [25]:
dfs=[log_count(comp_df_res,'res'),log_count(comp_df_neut,'neutron'),log_count(comp_df_dens,'dens'),log_count(comp_df_densp,'densp'),
         log_count(comp_df_gr,'gr'), log_count(comp_df_pe,'pe'),log_count(comp_df_compr,'compr'),log_count(comp_df_shear,'shear'),
         log_count(comp_df_calip,'calip'),log_count(comp_df_denscorr,'denscorr')]

## Counting number of curves in each well

In [26]:
df_0=pd.merge(dfs[0],dfs[1],how="outer", on=["Latitude", "Longtitude"])
for df in dfs[2:]:
    df_1=pd.merge(df_0,df,how="outer", on=["Latitude", "Longtitude"])
    df_0=df_1

In [27]:
df_1

Unnamed: 0,Latitude,Longtitude,res_count,neutron_count,dens_count,densp_count,gr_count,pe_count,compr_count,shear_count,calip_count,denscorr_count
0,4.409538,1.957873,4.0,1.0,1.0,1.0,1.0,1.0,2.0,1,2.0,1.0
1,1.936264,7.993787,3.0,1.0,2.0,2.0,2.0,2.0,1.0,1,2.0,2.0
2,5.979132,4.182920,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0
3,1.887905,8.017887,2.0,2.0,3.0,3.0,2.0,3.0,1.0,1,3.0,3.0
4,2.565869,8.097015,2.0,,,,1.0,,1.0,1,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
228,3.194788,7.462122,,,,,1.0,,2.0,1,,
229,3.933811,4.685115,,,,,1.0,,1.0,1,,
230,4.028449,1.562470,,,,,1.0,,1.0,1,,
231,5.619905,4.511351,,,,,,,2.0,1,,


In [28]:
df_1.describe()

Unnamed: 0,Latitude,Longtitude,res_count,neutron_count,dens_count,densp_count,gr_count,pe_count,compr_count,shear_count,calip_count,denscorr_count
count,233.0,233.0,171.0,200.0,159.0,167.0,227.0,160.0,229.0,233.0,186.0,164.0
mean,3.677477,5.900734,1.140351,1.04,1.08805,1.077844,1.079295,1.0875,1.157205,1.004292,1.11828,1.079268
std,1.873927,2.195399,0.410391,0.196451,0.305719,0.290285,0.301711,0.304835,0.388094,0.065512,0.355633,0.29275
min,0.594357,1.260153,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.25412,3.994768,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,2.980969,6.372655,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,5.709014,7.946482,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,7.421619,9.844046,4.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0


In [29]:
final_logs=pd.merge(df_1,df_cord,how="outer", on=["Latitude", "Longtitude"])

In [30]:
os.chdir("C:/Users/Jamal/Downloads/ML SPE competition-take home")
final_logs.to_csv('log inventory-techlog.csv')

In [31]:
features=['Well name','DEPT',
          'AE90','AF90','AHT90', 'AO90', 'AST90','AT90', 'HLLD','IDPH','ILD','LLD', 'RILD', 'RLA5','TBIT90',
'APLCLS','APLC_LS','CNC_LS','CNPOR_LS','TNPH_LIM','TNPH_LS','SNP','NPOR','NPORLS','NPOR_LS',
'NPHI_LS','NPHILS','NPHI','TNPH',
'DPHI','DPHILS','DPHI_LS','DPHI_SLDT','DPHZ','DPHZLS','DPHZ_LS','DPO','DPOR','DPOR_LS', 'DPO_LS','PHND_LS','PORZ_LS',
'NRHO','RHOB','RHOB_SLDT','RHOZ','ZDEN', 'RHOM',
'ECGR','ECGRD', 'ECGREDTC', 'ECGRR', 'ECGRS', 'GR', 'GRC', 'GRD', 'GRD1','GRN', 'GRR', 'GRS', 'GR_EDTC', 'HCGR',
'HSGR', 'HSGRD','HSGRS', 'SGRDD','SGRS',
'PE','PEF','PEFL','PEFZ','PEF_SLDT',    
'DTCO', 'DTLF', 'DT','DT4P','DTSM','DTCM',
'CAL1R', 'CALD', 'CALI', 'CALI_SPCS', 'CALR', 'CALS', 'CALSR', 'CALSR_R', 'CALX', 'HCAL', 'HCALD', 'HCALR','HCALS',
'LCAL', 'LCALD', 'LCALR', 'C1','C1S',
'HDRA','DRHO','DRH','ZCOR','DCOR','CORR']

df_features=df_wells[features]

# Obtain dataset code

In [32]:
df=df_features
df[df.loc[:]==-9999]=np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df.loc[:]==-9999]=np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


## Dataset with only shear values

In [33]:
df_shear=df.loc[df.loc[pd.notna(df['DTSM']), :].index]
df_shear['DTSM'].isnull().sum()

KeyboardInterrupt: 

## Curves for each well

In [None]:
neutron_df=df_curves[(df_curves['Mnemonic']=='NPHI') | (df_curves['Mnemonic']=='NPOR')| (df_curves['Mnemonic']=='TNPH')]
neutron_df_remove=neutron_df[(neutron_df['Description']!='NEUTRON POROSITY LIMESTONE') 
                             & (neutron_df['Description']!='THERMAL NEUTRON POROSITY LIMESTONE')]
df_curves.drop(neutron_df_remove.index,axis=0,inplace=True)

In [None]:
densitypor_df=df_curves[(df_curves['Mnemonic']=='DPHI') | (df_curves['Mnemonic']=='DPHZ')]
densitypor_df_remove=densitypor_df[(densitypor_df['Description']!='DENSITY POROSITY LIMESTONE')]
df_curves.drop(densitypor_df_remove.index,axis=0,inplace=True)

In [None]:
df_replace=final_logs
df_curves_replace=df_curves[df_curves['Well name'].isin(df_replace['Well name'])]
df_curves_replace=df_curves_replace[df_curves_replace['Mnemonic'].isin(df_shear.columns[1:])]
df_curves_replace

## Function to combine the curves

In [None]:
def replace(curve,property_curves):
    
    mnemonic_0=df_curves_replace.loc[df_curves_replace['Mnemonic'].isin(property_curves)]['Mnemonic'].unique()
    mnemonic=np.append(mnemonic_0,'Well name')
    mnemonic=np.append(mnemonic,'DEPT')
    #wname=df_curves_replace.loc[mnemonic_0]['Well name']
    wname=df_curves_replace.loc[df_curves_replace.loc[df_curves_replace['Mnemonic'].isin(property_curves)]['Mnemonic'].index]["Well name"].unique()
    
    df_replace=df_shear[df_shear['Well name'].isin(wname)]
    df_f=df_replace[mnemonic]
    
    df_f.dropna(how='all',subset=mnemonic_0,inplace=True)
    
    stat=df_f.isnull().sum(axis=1)
    stat_index=stat[stat<len(mnemonic_0)-1].index
    
    index_df = df_f.index.isin(stat_index)
    
    df_f.fillna(0,inplace=True)
    
    df_f[curve]=""
    
    df_dup=df_f[mnemonic_0].loc[index_df]
    df_ndup=df_f[mnemonic_0].loc[~index_df]
    
    df_f[curve].loc[index_df]=df_dup[df_dup!=0].mean(axis=1)
    
    df_f[curve].loc[~index_df]=df_ndup.sum(axis=1)
    
    
    return df_f

In [None]:
res_curves=['AE90','AF90','AHT90', 'AO90', 'AST90','AT90', 'HLLD','IDPH','ILD','LLD','LLD_R', 'RILD', 'RLA5','TBIT90']
gr_curves=['ECGR','ECGRD', 'ECGREDTC', 'ECGRR', 'ECGRS', 'GR', 'GRC', 'GRD', 'GRD1','GRN', 'GRR', 'GRS', 'GRT', 
           'GR_EDTC', 'GR_STGC','HCGR','HCGRD', 'HSGR', 'HSGRD', 'HSGRR','HSGRS', 'SGRDD','SGRS']
calp_curves=['CAL1R', 'CALD', 'CALI', 'CALI_SPCS', 'CALR', 'CALS', 'CALSR', 'CALSR_R', 'CALX', 'HCAL', 'HCALD', 'HCALR','HCALS',
'HD', 'HD1', 'LCAL', 'LCALD', 'LCALR', 'C1','C1S']
pe_curves=['PE','PEF','PEFL','PEFZ','PEF_SLDT']
dens_curves=['NRHO','RHOB','RHOB_SLDT','RHOZ','ZDEN', 'RHOM']
neut_curves=['APLCLS','APLC_LS','CNC', 'CNC_LS','CNPOR_LS','TNPH_LIM','TNPH_LS','TPHI_LS','SNP','NPOR','NPORLS','NPOR_LS',
             'NPHS','NPHI_LS','NPHILS','NPHI', 'ENPH_LS','TNPH']
densp_curves=['DPHI','DPHILS','DPHI_LS','DPHI_SLDT','DPHZ','DPHZLS','DPHZ_LS','DPO','DPOR','DPOR_LS', 'DPO_LS','PHND_LS','PORZ_LS']
comp_curves=['DTCO', 'DTLF', 'DT','DT4P','DTCM']
shear_curves=['DTSM']
denscorr_curves=['HDRA','DRHO','DRH']

In [None]:
curves=[res_curves,gr_curves,calp_curves,pe_curves,dens_curves,denscorr_curves,neut_curves,densp_curves,comp_curves,
        shear_curves]
properties=['Resistivity','Gamma ray','Caliper','Photoelectric','Density','Density correction','Neutron porosity',
            'Density porosity','Compressional slowness','Shear slowness']

df_prop_0=replace(properties[0],curves[0])

for i in range(1,len(curves)):
    
    df_prop=replace(properties[i],curves[i])
    df_prop=df_prop.merge(df_prop_0,how='outer')
    df_prop_0=df_prop

In [None]:
final_props=properties
final_props.append('DEPT')
final_props.append('Well name')

# Data preprocessing

In [None]:
df=df_prop[final_props]

## Before cleaning

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.hist(bins=25,figsize=(15,20))

In [None]:
df.skew()

In [None]:
def corr_matrix(dataframe):
    matrix=dataframe.corr(method='spearman')
    return matrix.round(2)

In [None]:
corr_matrix(df)

## Cleaning process

In [None]:
def remove_outliers_index(property,lower_quant,upper_quant):
    if lower_quant>0 and upper_quant>0:
        dat=df[property]
        P_uq=dat.quantile(upper_quant)
        P_lq=dat.quantile(lower_quant)
        index = df[(dat >= P_uq)|(dat<=P_lq)].index
    elif lower_quant>0 and upper_quant==0:
        dat=df[property]
        P_lq=dat.quantile(lower_quant)
        index = df[dat<=P_lq].index
    else:
        dat=df[property]
        P_uq=dat.quantile(upper_quant)
        index = df[dat >= P_uq].index
    return index

In [None]:
def remove_outliers_df(df_out):
    
    uq=0.99
    lq=0.01

    df_out.loc[remove_outliers_index('Gamma ray',lq,0.998),'Gamma ray']=np.nan #GR's corr was improved
    df_out.loc[remove_outliers_index('Density',0.008,uq),'Density']=np.nan
    df_out.loc[remove_outliers_index('Neutron porosity',0.005,0.993),'Neutron porosity']=np.nan 
    df_out.loc[remove_outliers_index('Compressional slowness',0.008,0.998),'Compressional slowness']=np.nan
    df_out.loc[remove_outliers_index('Resistivity',0.005,0.995),'Resistivity']=np.nan
    df_out.loc[remove_outliers_index('Photoelectric',0.005,0.998),'Photoelectric']=np.nan
    df_out.loc[remove_outliers_index('Density porosity',lq,0.995),'Density porosity']=np.nan
    df_out.loc[remove_outliers_index('Shear slowness',0.008,0.998),'Shear slowness']=np.nan
    df_out.loc[remove_outliers_index('Caliper',lq,0.97),'Caliper']=np.nan

    return df_out

In [None]:
df_outlier=df
df5=remove_outliers_df(df_outlier)
df5.describe()

In [None]:
df5.hist(bins=25,figsize=(15,20))

In [None]:
corr_matrix(df5)

## Density filter

In [None]:
df5_denscorr=df5
index_denscorr=df5_denscorr[(df5_denscorr['Density correction']>0.2) | (df5_denscorr['Density correction']<-0.2)].index
df5_denscorr.loc[index_denscorr,'Density correction']=np.nan
df5_denscorr.loc[index_denscorr,'Density']=np.nan
df5_denscorr.loc[index_denscorr,'Photoelectric']=np.nan
df5_denscorr.loc[index_denscorr,'Caliper']=np.nan

In [None]:
df5_denscorr.describe()

In [None]:
import seaborn as sns
plt.figure(figsize=(9,7))
sns.heatmap(corr_matrix(df5_denscorr),annot=True,cmap=plt.cm.Reds,)
plt.show()

In [None]:
df5_denscorr_cord=df5_denscorr.merge(df_cord,on='Well name')
df5_denscorr_cord

## Depth intervals

In [None]:
def div_interval(df5_denscorr):
    
    df5_depth1=df5_denscorr[(df5_denscorr['DEPT']>=586) & (df5_denscorr['DEPT']<=6615)]
    df5_depth2=df5_denscorr[(df5_denscorr['DEPT']>6615) & (df5_denscorr['DEPT']<=9153)]
    df5_depth3=df5_denscorr[(df5_denscorr['DEPT']>9153) & (df5_denscorr['DEPT']<=12506)]
    df5_depth4=df5_denscorr[(df5_denscorr['DEPT']>12506)]
    
    return df5_depth1,df5_depth2,df5_depth3,df5_depth4

In [None]:
df5_depth1,df5_depth2,df5_depth3,df5_depth4=div_interval(df5_denscorr_cord)

In [None]:
def strat_split(df_f,test,prop):
    
    cat=prop+"_cat"
    
    df_f.drop(['Well name'],axis=1,inplace=True)
    df_f[cat] = pd.cut(df_f[prop],bins=[64,100,114, 136, np.inf],labels=[1, 2, 3, 4])
    df_f=df_f.astype('float64')
    df_f.reset_index(drop=True,inplace=True)
    
    train_set, test_set = train_test_split(df_f, test_size=test, random_state=42)
    
    train_set=train_set.drop([cat],axis=1)
    test_set=test_set.drop([cat],axis=1)
    
    return train_set, test_set

In [None]:
depth_intervals=[df5_depth1,df5_depth2,df5_depth3,df5_depth4]

In [None]:
def export(df_input,depth_no):
    
    df_final = df_input.sample(frac=1,random_state=42).reset_index(drop=True)
    
    train_set, test_set=train_test_split(df_final, test_size=0.2, random_state=42)
    train_set.to_csv('Train set depth '+str(depth_no)+'.csv')
    test_set.to_csv('Test set depth '+str(depth_no)+'.csv')

In [None]:
i=0
for depth_interval in depth_intervals:
    i=i+1
    export(depth_interval,i)

# Model runs

In [None]:
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import joblib

In [None]:
def cross_val(model_name,model_input,X_train_scaled, y_train):
    
    model=model_input
    
    #Whole training dataset
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_train_scaled)
    mse = mean_squared_error(y_train, predictions)
    rmse = np.sqrt(mse)
    graph=plt.scatter(y_train, predictions)
    
    #Cross-validation
    scores = cross_val_score(model, X_train_scaled, y_train,scoring="neg_mean_squared_error", cv=10)
    rmse_scores = np.sqrt(-scores)
    
    r_squared=r2_score(y_train,predictions)
    
    import joblib
    joblib.dump(model, model_name)
    
    print('R2:',r_squared)
    print("RMSE on whole training set:", rmse)
    print("Scores:", rmse_scores)
    print("Mean:", rmse_scores.mean())
    print("Standard deviation:", rmse_scores.std())
    
    
    return graph

In [None]:
def run_model(model,train_set,test_set,depth_no):
    
    X_train=train_set[features[:-1]]
    X_train.loc[:,'Resistivity']=np.log10(X_train['Resistivity'])
    X_test=test_set[features[:-1]]
    X_test.loc[:,'Resistivity']=np.log10(X_test['Resistivity'])
    y_train=train_set[features[-1]]
    y_test=test_set[features[-1]]
    
    from sklearn.preprocessing import MinMaxScaler, StandardScaler
    scaler=MinMaxScaler()
    X_train_scaled=pd.DataFrame(scaler.fit_transform(X_train))
    X_test_scaled=pd.DataFrame(scaler.fit_transform(X_test))
    
    from sklearn.neighbors import KNeighborsRegressor
    knn_reg=KNeighborsRegressor(n_neighbors=5)
    cross_val(model+str(depth_no),knn_reg,X_train_scaled,y_train)
    
    return depth_no

In [None]:
def model_save(features,model):
    
    train_set_depth1=pd.read_csv('Train set depth 1.csv',index_col=False)
    train_set_depth1.drop('Unnamed: 0',axis=1,inplace=True)
    train_set_depth1.dropna(subset=features,inplace=True)

    test_set_depth1=pd.read_csv('Test set depth 1.csv',index_col=False)
    test_set_depth1.drop('Unnamed: 0',axis=1,inplace=True)
    test_set_depth1.dropna(subset=features,inplace=True)

    train_set_depth2=pd.read_csv('Train set depth 2.csv',index_col=False)
    train_set_depth2.drop('Unnamed: 0',axis=1,inplace=True)
    train_set_depth2.dropna(subset=features,inplace=True)

    test_set_depth2=pd.read_csv('Test set depth 2.csv',index_col=False)
    test_set_depth2.drop('Unnamed: 0',axis=1,inplace=True)
    test_set_depth2.dropna(subset=features,inplace=True)

    train_set_depth3=pd.read_csv('Train set depth 3.csv',index_col=False)
    train_set_depth3.drop('Unnamed: 0',axis=1,inplace=True)
    train_set_depth3.dropna(subset=features,inplace=True)

    test_set_depth3=pd.read_csv('Test set depth 3.csv',index_col=False)
    test_set_depth3.drop('Unnamed: 0',axis=1,inplace=True)
    test_set_depth3.dropna(subset=features,inplace=True)

    train_set_depth4=pd.read_csv('Train set depth 4.csv',index_col=False)
    train_set_depth4.drop('Unnamed: 0',axis=1,inplace=True)
    train_set_depth4.dropna(subset=features,inplace=True)

    test_set_depth4=pd.read_csv('Test set depth 4.csv',index_col=False)
    test_set_depth4.drop('Unnamed: 0',axis=1,inplace=True)
    test_set_depth4.dropna(subset=features,inplace=True)

    train_sets=[train_set_depth1,train_set_depth2,train_set_depth3,train_set_depth4]
    test_sets=[test_set_depth1,test_set_depth2,test_set_depth3,test_set_depth4]

    for i in range(len(train_sets)):

        run_model(model,train_sets[i],test_sets[i],i+1)

In [None]:
features1=['Resistivity','Gamma ray', 'Density', 'Neutron porosity', 'Compressional slowness', 'Shear slowness']
features2=['Resistivity','Gamma ray', 'Density', 'Compressional slowness', 'Shear slowness']
features3=['Resistivity','Gamma ray', 'Neutron porosity', 'Compressional slowness', 'Shear slowness']
features4=['Resistivity','Gamma ray', 'Compressional slowness', 'Shear slowness']
features5=['Gamma ray', 'Neutron porosity','Density', 'Compressional slowness', 'Shear slowness']
features6=['Compressional slowness', 'Shear slowness']
features_all=[features1,features2,features3,features4,features5,features6]
models_all=['knn_reg','knn_reg_cdgr','knn_reg_cngr','knn_reg_cgr','knn_reg_cgnd','knn_reg_comp']

In [None]:
for i in range(len(features_all)):

        model_save(features_all[i],models_all[i])

# Prediction

## Obtain dataset

In [None]:
# Change directory
os.chdir("C:/Users/Jamal/Downloads/ML SPE competition-take home/Final submission")
# Create a list with all the files
path = os.getcwd()
files = os.listdir(path)
# Select only xlsx files
files_las = [f for f in files if f[-3:] == "las"]

In [None]:
lst=[]
lst2=[]

# Loop over list of Excel files
for f in files_las: 
    las=lasio.read(f)
    df=las.df()
    df.reset_index(inplace=True)
    lst.append(df)
    
    well_name = pd.DataFrame([f] * df.shape[0])
    lst2.append(well_name)
    
df2=pd.concat(lst,axis=0)
df_names=pd.concat(lst2,axis=0)

df_wells=pd.concat([df2,df_names],axis=1)
df_wells.rename(columns={0:"Well name"},inplace=True)

In [None]:
Lat=[]

# Loop over list of Excel files
for f in files_las:
    
    las=lasio.read(f)
    Lat.append({'Latitude':las.well['SLAT'].value,'Longtitude':las.well['SLON'].value,'Well name':f})

df_cord=pd.DataFrame(Lat)
df_cord

In [None]:
clmn=df2.columns
clmn_df=pd.DataFrame(clmn)
clmn_df.columns=['Property']
clmn_df.sort_values('Property',inplace=True)
clmn_df.reset_index(drop=True,inplace=True)

In [None]:
logs=[]
a_list = []
b_list = []
c_list = []
Lat=[]
Lon=[]
w_list=[]

for f in files_las: 
    las=lasio.read(f)
    colnum=las.keys()
    logs=las.curves
    for i in range(0,len(colnum)):
        a=logs[i]['descr']
        a_list.append(a)
        b=logs[i]['unit']
        b_list.append(b)
        c=logs[i]['mnemonic']
        c_list.append(c)
        Lat.append(las.well['SLAT'].value)
        Lon.append(las.well['SLON'].value)
        w_list.append(f)

df_curves = pd.DataFrame({'Mnemonic': c_list, 'Unit': b_list,'Description': a_list,'Latitude': Lat,'Longtitude': Lon,
                               'Well name': w_list})
df_curves

In [None]:
def log_count(df,name):
    prop=df.groupby(['Latitude', 'Longtitude']).size().rename(name+'_count').sort_values(ascending=False)
    df_f=pd.DataFrame(prop).reset_index()
    return df_f

In [None]:
comp_df_res=df_curves[(df_curves['Mnemonic']=='AE90') | (df_curves['Mnemonic']=='AF90') | (df_curves['Mnemonic']=='AHT90') 
                 | (df_curves['Mnemonic']=='AO90') | (df_curves['Mnemonic']=='AST90') | (df_curves['Mnemonic']=='AT90')
                 | (df_curves['Mnemonic']=='HLLD') | (df_curves['Mnemonic']=='HRID') | (df_curves['Mnemonic']=='IDPH')
                 | (df_curves['Mnemonic']=='ILD')| (df_curves['Mnemonic']=='LLD') | (df_curves['Mnemonic']=='LLD_R')
                 | (df_curves['Mnemonic']=='RILD') | (df_curves['Mnemonic']=='RLA5') | (df_curves['Mnemonic']=='TBIT90')
                 | (df_curves['Mnemonic']=='HLLD1')]
comp_df_neut=df_curves[(df_curves['Mnemonic']=='APLCLS') | (df_curves['Mnemonic']=='APLC_LS') | (df_curves['Mnemonic']=='CNC') 
                 | (df_curves['Mnemonic']=='CNC_LS') | (df_curves['Mnemonic']=='CNPOR_LS') | (df_curves['Mnemonic']=='TNPH_LIM')
                 | (df_curves['Mnemonic']=='TNPH_LS') | (df_curves['Mnemonic']=='TPHI_LS') | (df_curves['Mnemonic']=='SNP')
                 | (df_curves['Mnemonic']=='NPOR')| (df_curves['Mnemonic']=='NPORLS') | (df_curves['Mnemonic']=='NPOR_LS')
                 | (df_curves['Mnemonic']=='NPHS') | (df_curves['Mnemonic']=='NPHI_LS') | (df_curves['Mnemonic']=='NPHILS')
                | (df_curves['Mnemonic']=='NPHI') | (df_curves['Mnemonic']=='ENPH_LS')| (df_curves['Mnemonic']=='TNPH')]
comp_df_densp=df_curves[(df_curves['Mnemonic']=='DPHI') | (df_curves['Mnemonic']=='DPHILS') | (df_curves['Mnemonic']=='DPHI_LS') 
                 | (df_curves['Mnemonic']=='DPHI_SLDT') | (df_curves['Mnemonic']=='DPHZ') | (df_curves['Mnemonic']=='DPHZLS')
                 | (df_curves['Mnemonic']=='DPHZ_LS') | (df_curves['Mnemonic']=='DPO') | (df_curves['Mnemonic']=='DPOR')
                 | (df_curves['Mnemonic']=='DPOR_LS')| (df_curves['Mnemonic']=='DPO_LS') | (df_curves['Mnemonic']=='PHND_LS')
                 | (df_curves['Mnemonic']=='PORZ_LS')]
comp_df_dens=df_curves[(df_curves['Mnemonic']=='NRHO') | (df_curves['Mnemonic']=='RHOB') | (df_curves['Mnemonic']=='RHOB_SLDT') 
                 | (df_curves['Mnemonic']=='RHOZ') | (df_curves['Mnemonic']=='ZDEN') | (df_curves['Mnemonic']=='RHOM')]
comp_df_gr=df_curves[(df_curves['Mnemonic']=='ECGR') | (df_curves['Mnemonic']=='ECGRD') | (df_curves['Mnemonic']=='ECGREDTC') 
                 | (df_curves['Mnemonic']=='ECGRR') | (df_curves['Mnemonic']=='ECGRS')
                | (df_curves['Mnemonic']=='GRC') | (df_curves['Mnemonic']=='GRD') | (df_curves['Mnemonic']=='GRD1')
                | (df_curves['Mnemonic']=='GRN') | (df_curves['Mnemonic']=='GRR') | (df_curves['Mnemonic']=='GRS')
                | (df_curves['Mnemonic']=='GRT') | (df_curves['Mnemonic']=='GR_EDTC') | (df_curves['Mnemonic']=='GR_STGC')
                | (df_curves['Mnemonic']=='HCGR') | (df_curves['Mnemonic']=='HCGRD') | (df_curves['Mnemonic']=='HCGRR')
                | (df_curves['Mnemonic']=='HCGRS') | (df_curves['Mnemonic']=='HGRT') | (df_curves['Mnemonic']=='HSGR')
                | (df_curves['Mnemonic']=='HSGRD') | (df_curves['Mnemonic']=='HSGRR') | (df_curves['Mnemonic']=='HSGRS')
                    | (df_curves['Mnemonic']=='SGRDD') | (df_curves['Mnemonic']=='SGRS')]
comp_df_pe=df_curves[(df_curves['Mnemonic']=='PE') | (df_curves['Mnemonic']=='PEF') | (df_curves['Mnemonic']=='PEFL') 
                 | (df_curves['Mnemonic']=='PEFS') | (df_curves['Mnemonic']=='PEFZ') | (df_curves['Mnemonic']=='PEF_SLDT')]
comp_df_compr=df_curves[(df_curves['Mnemonic']=='DTCO')]
comp_df_shear=df_curves[(df_curves['Mnemonic']=='DTSM')]
comp_df_calip=df_curves[(df_curves['Mnemonic']=='CAL1R') | (df_curves['Mnemonic']=='CALD') | (df_curves['Mnemonic']=='CALI') 
                 | (df_curves['Mnemonic']=='CALI_SPCS') | (df_curves['Mnemonic']=='CALR') | (df_curves['Mnemonic']=='CALS')
                | (df_curves['Mnemonic']=='CALSR') | (df_curves['Mnemonic']=='CALSR_R') | (df_curves['Mnemonic']=='CALX')
                | (df_curves['Mnemonic']=='HCAL') | (df_curves['Mnemonic']=='HCALD') | (df_curves['Mnemonic']=='HCALR')
               | (df_curves['Mnemonic']=='HCALS') | (df_curves['Mnemonic']=='HD') | (df_curves['Mnemonic']=='HD1')
                | (df_curves['Mnemonic']=='LCAL') | (df_curves['Mnemonic']=='LCALD') | (df_curves['Mnemonic']=='LCALR') |
                        (df_curves['Mnemonic']=='C1') | (df_curves['Mnemonic']=='C1S') | (df_curves['Mnemonic']=='HCALR_1')]

In [None]:
dfs=[log_count(comp_df_res,'res'),log_count(comp_df_neut,'neutron'),log_count(comp_df_dens,'dens'),log_count(comp_df_densp,'densp'),
         log_count(comp_df_gr,'gr'), log_count(comp_df_pe,'pe'),log_count(comp_df_compr,'compr'),log_count(comp_df_shear,'shear'),
         log_count(comp_df_calip,'calip')]

In [None]:
df_0=pd.merge(dfs[0],dfs[1],how="outer", on=["Latitude", "Longtitude"])
for df in dfs[2:]:
    df_1=pd.merge(df_0,df,how="outer", on=["Latitude", "Longtitude"])
    df_0=df_1

In [None]:
final_logs=pd.merge(df_1,df_cord,how="outer", on=["Latitude", "Longtitude"])

In [None]:
features=np.concatenate((['Well name','DEPT'],comp_df_res['Mnemonic'].unique(),comp_df_neut['Mnemonic'].unique(),comp_df_densp['Mnemonic'].unique(),
                comp_df_dens['Mnemonic'].unique(),comp_df_gr['Mnemonic'].unique(),comp_df_pe['Mnemonic'].unique(),
                comp_df_calip['Mnemonic'].unique(),comp_df_compr['Mnemonic'].unique()),axis=0)

In [None]:
df_features=df_wells[features]

In [None]:
df=df_features
df[df.loc[:]==-9999]=np.nan

In [None]:
df_replace=final_logs
df_curves_replace=df_curves[df_curves['Well name'].isin(df_replace['Well name'])]
df_curves_replace=df_curves_replace[df_curves_replace['Mnemonic'].isin(df.columns[1:])]
df_curves_replace

In [None]:
def replace(curve,property_curves):
    
    
    mnemonic_0=df_curves_replace.loc[df_curves_replace['Mnemonic'].isin(property_curves)]['Mnemonic'].unique()
    mnemonic=np.append(mnemonic_0,'Well name')
    mnemonic=np.append(mnemonic,'DEPT')
    #wname=df_curves_replace.loc[mnemonic_0]['Well name']
    wname=df_curves_replace.loc[df_curves_replace.loc[df_curves_replace['Mnemonic'].isin(property_curves)]['Mnemonic'].index]["Well name"].unique()
    
    df_replace=df[df['Well name'].isin(wname)]
    df_f=df_replace[mnemonic]
    
    df_f.dropna(how='all',subset=mnemonic_0,inplace=True)
    
    stat=df_f.isnull().sum(axis=1)
    stat_index=stat[stat<len(mnemonic_0)-1].index
    
    index_df = df_f.index.isin(stat_index)
    
    df_f.fillna(0,inplace=True)
    
    df_f[curve]=""
    
    df_dup=df_f[mnemonic_0].loc[index_df]
    df_ndup=df_f[mnemonic_0].loc[~index_df]
    
    df_f[curve].loc[index_df]=df_dup[df_dup!=0].mean(axis=1)
    
    df_f[curve].loc[~index_df]=df_ndup.sum(axis=1)
    
    
    return df_f

In [None]:
res_curves=['AT90', 'HLLD','IDPH','ILD','LLD','HLLD1']
gr_curves=['GRD', 'GRR', 'GRS']
calp_curves=['CALR', 'HCALD', 'HCALR','CAL1R', 'CALD', 'CALS', 'CALSR', 'LCALD', 'HCAL','HCALR_1']
pe_curves=['PEF','PEFL','PEFZ','PEFS']
dens_curves=['RHOB','RHOZ','RHOM']
neut_curves=['TNPH_LS','NPOR_LS','NPHI_LS','NPHI']
densp_curves=['DPHI_LS','DPHZ_LS', 'DPHZ','DPHI']
comp_curves=['DTCO']

In [None]:
curves=[res_curves,gr_curves,calp_curves,pe_curves,dens_curves,neut_curves,densp_curves,comp_curves]
properties=['Resistivity','Gamma ray','Caliper','Photoelectric','Density','Neutron porosity','Density porosity',
           'Compressional slowness']

df_prop_0=replace(properties[0],curves[0])

for i in range(1,len(curves)):
    
    df_prop=replace(properties[i],curves[i])
    df_prop=df_prop.merge(df_prop_0,how='outer')
    df_prop_0=df_prop

In [None]:
final_props=properties
final_props.append('DEPT')
final_props.append('Well name')

In [None]:
df=df_prop[final_props]

## Data preprocessing

In [None]:
df['Resistivity']=np.log10(df['Resistivity'])

In [None]:
df.describe()

In [None]:
df=df.sort_values(['Well name','DEPT'])
df.reset_index(drop=True,inplace=True)

In [None]:
def div_interval(df5_denscorr):
    
    df5_depth1=df5_denscorr[(df5_denscorr['DEPT']>=586) & (df5_denscorr['DEPT']<=6615)]
    df5_depth2=df5_denscorr[(df5_denscorr['DEPT']>6615) & (df5_denscorr['DEPT']<=9153)]
    df5_depth3=df5_denscorr[(df5_denscorr['DEPT']>9153) & (df5_denscorr['DEPT']<=12506)]
    df5_depth4=df5_denscorr[(df5_denscorr['DEPT']>12506)]
    
    return df5_depth1,df5_depth2,df5_depth3,df5_depth4

In [None]:
df5_depth1,df5_depth2,df5_depth3,df5_depth4=div_interval(df)

### Predictions for Depth interval 1

In [None]:
def predict(model_name,X_pred,X_pred_scaled):
    
    import joblib
    loaded_model = joblib.load(model_name)
    final_predictions=loaded_model.predict(X_pred_scaled)
    X_pred['DTSM']=final_predictions
    
    return X_pred['DTSM']

In [None]:
df=df5_depth1
features=['Resistivity','Gamma ray', 'Density', 'Neutron porosity', 'Compressional slowness']
X=df[features]
X_pred=X
X_pred.dropna(inplace=True)
rest_index=df[features].index.isin(X_pred.index)
X_rest=df[features][~rest_index]
X_neut_res=X_rest.dropna(subset=['Compressional slowness','Gamma ray','Neutron porosity','Resistivity'])
X_neut_res=X_neut_res[['Compressional slowness','Gamma ray','Neutron porosity','Resistivity']]
X_dens_res=X_rest.dropna(subset=['Compressional slowness','Gamma ray','Density','Resistivity'])
X_dens_res=X_dens_res[['Compressional slowness','Gamma ray','Density','Resistivity']]
nores_index=df[features].index.isin([*X_dens_res.index,*X_neut_res.index,*X_pred.index])
X_nores=df[features][~nores_index]
X_cgdn=X_nores.dropna(subset=['Compressional slowness','Gamma ray','Density','Neutron porosity'])
X_cgdn.drop('Resistivity',axis=1,inplace=True)
rest_index=df[features].index.isin([*X_cgdn.index,*X_dens_res.index,*X_neut_res.index,*X_pred.index])
X_rest=df[features][~rest_index]
X_cgr=X_rest.dropna(subset=['Compressional slowness','Gamma ray','Resistivity'])
X_cgr=X_cgr[['Compressional slowness','Gamma ray','Resistivity']]
last_index=df[features].index.isin([*X_cgr.index,*X_cgdn.index,*X_dens_res.index,*X_neut_res.index,*X_pred.index])
X_last=df[features][~last_index]
X_comp=X_last.dropna(subset=['Compressional slowness'])
X_comp=pd.DataFrame(X_comp['Compressional slowness'])
nocomp_index=df[features].index.isin([*X_comp.index,*X_cgr.index,*X_cgdn.index,*X_dens_res.index,*X_neut_res.index,
                                      *X_pred.index])
X_nocomp=df[features][~nocomp_index]
X_nocomp['DTSM']=0

from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler=MinMaxScaler()                           
X_pred_scaled=pd.DataFrame(scaler.fit_transform(X_pred))
X_comp_scaled=pd.DataFrame(scaler.fit_transform(X_comp.values))
X_neut_res_scaled=pd.DataFrame(scaler.fit_transform(X_neut_res))
X_dens_res_scaled=pd.DataFrame(scaler.fit_transform(X_dens_res))
X_cgdn_scaled=pd.DataFrame(scaler.fit_transform(X_cgdn))
X_cgr_scaled=pd.DataFrame(scaler.fit_transform(X_cgr))

models=['knn_reg1','knn_reg_cdgr1','knn_reg_cngr1','knn_reg_cgnd1','knn_reg_cgr1','knn_reg_comp1']
X_preds=[X_pred,X_dens_res,X_neut_res,X_cgdn,X_cgr,X_comp]
X_preds_scaled=[X_pred_scaled,X_dens_res_scaled,X_neut_res_scaled,X_cgdn_scaled,X_cgr_scaled,X_comp_scaled.values]
dflist=[]

for i in range(len(models)):
    preds=pd.DataFrame(predict(models[i],X_preds[i],X_preds_scaled[i]))
    dflist.append(preds)
    
dflist_f=pd.concat(dflist,axis=0)
dff=[dflist_f,pd.DataFrame(X_nocomp['DTSM'])]
df_comb=pd.concat(dff)

df_comb=pd.DataFrame(df_comb).sort_index()
df_f=df
df_f['DTSM']=df_comb

df_f.to_csv('Depth1.csv')

### Predictions for Depth interval 2

In [None]:
df=df5_depth2
features=['Resistivity','Gamma ray', 'Density', 'Neutron porosity', 'Compressional slowness']
X=df[features]
X_pred=X
X_pred.dropna(inplace=True)
rest_index=df[features].index.isin(X_pred.index)
X_rest=df[features][~rest_index]
X_neut_res=X_rest.dropna(subset=['Compressional slowness','Gamma ray','Neutron porosity','Resistivity'])
X_neut_res=X_neut_res[['Compressional slowness','Gamma ray','Neutron porosity','Resistivity']]
X_dens_res=X_rest.dropna(subset=['Compressional slowness','Gamma ray','Density','Resistivity'])
X_dens_res=X_dens_res[['Compressional slowness','Gamma ray','Density','Resistivity']]
nores_index=df[features].index.isin([*X_dens_res.index,*X_neut_res.index,*X_pred.index])
X_nores=df[features][~nores_index]
X_cgdn=X_nores.dropna(subset=['Compressional slowness','Gamma ray','Density','Neutron porosity'])
X_cgdn.drop('Resistivity',axis=1,inplace=True)
rest_index=df[features].index.isin([*X_cgdn.index,*X_dens_res.index,*X_neut_res.index,*X_pred.index])
X_rest=df[features][~rest_index]
X_cgr=X_rest.dropna(subset=['Compressional slowness','Gamma ray','Resistivity'])
X_cgr=X_cgr[['Compressional slowness','Gamma ray','Resistivity']]
last_index=df[features].index.isin([*X_cgr.index,*X_cgdn.index,*X_dens_res.index,*X_neut_res.index,*X_pred.index])
X_last=df[features][~last_index]
X_comp=X_last.dropna(subset=['Compressional slowness'])
X_comp=pd.DataFrame(X_comp['Compressional slowness'])
nocomp_index=df[features].index.isin([*X_comp.index,*X_cgr.index,*X_cgdn.index,*X_dens_res.index,*X_neut_res.index,
                                      *X_pred.index])
X_nocomp=df[features][~nocomp_index]
X_nocomp['DTSM']=0

from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler=MinMaxScaler()                           
X_pred_scaled=pd.DataFrame(scaler.fit_transform(X_pred))
X_comp_scaled=pd.DataFrame(scaler.fit_transform(X_comp.values))
X_neut_res_scaled=pd.DataFrame(scaler.fit_transform(X_neut_res))
X_dens_res_scaled=pd.DataFrame(scaler.fit_transform(X_dens_res))
X_cgdn_scaled=pd.DataFrame(scaler.fit_transform(X_cgdn))
X_cgr_scaled=pd.DataFrame(scaler.fit_transform(X_cgr))

models=['knn_reg2','knn_reg_cdgr2','knn_reg_cngr2','knn_reg_cgnd2','knn_reg_cgr2','knn_reg_comp2']
X_preds=[X_pred,X_dens_res,X_neut_res,X_cgdn,X_cgr,X_comp]
X_preds_scaled=[X_pred_scaled,X_dens_res_scaled,X_neut_res_scaled,X_cgdn_scaled,X_cgr_scaled,X_comp_scaled.values]
dflist=[]

for i in range(len(models)):
    preds=pd.DataFrame(predict(models[i],X_preds[i],X_preds_scaled[i]))
    dflist.append(preds)
    
dflist_f=pd.concat(dflist,axis=0)
dff=[dflist_f,pd.DataFrame(X_nocomp['DTSM'])]
df_comb=pd.concat(dff)

df_comb=pd.DataFrame(df_comb).sort_index()
df_f=df
df_f['DTSM']=df_comb

df_f.to_csv('Depth2.csv')

### Predictions for Depth interval 3

In [None]:
df=df5_depth3
features=['Resistivity','Gamma ray', 'Density', 'Neutron porosity', 'Compressional slowness']
X=df[features]
X_pred=X
X_pred.dropna(inplace=True)
rest_index=df[features].index.isin(X_pred.index)
X_rest=df[features][~rest_index]
X_neut_res=X_rest.dropna(subset=['Compressional slowness','Gamma ray','Neutron porosity','Resistivity'])
X_neut_res=X_neut_res[['Compressional slowness','Gamma ray','Neutron porosity','Resistivity']]
X_dens_res=X_rest.dropna(subset=['Compressional slowness','Gamma ray','Density','Resistivity'])
X_dens_res=X_dens_res[['Compressional slowness','Gamma ray','Density','Resistivity']]
nores_index=df[features].index.isin([*X_dens_res.index,*X_neut_res.index,*X_pred.index])
X_nores=df[features][~nores_index]
X_cgdn=X_nores.dropna(subset=['Compressional slowness','Gamma ray','Density','Neutron porosity'])
X_cgdn.drop('Resistivity',axis=1,inplace=True)
rest_index=df[features].index.isin([*X_cgdn.index,*X_dens_res.index,*X_neut_res.index,*X_pred.index])
X_rest=df[features][~rest_index]
X_cgr=X_rest.dropna(subset=['Compressional slowness','Gamma ray','Resistivity'])
X_cgr=X_cgr[['Compressional slowness','Gamma ray','Resistivity']]
last_index=df[features].index.isin([*X_cgr.index,*X_cgdn.index,*X_dens_res.index,*X_neut_res.index,*X_pred.index])
X_last=df[features][~last_index]
X_comp=X_last.dropna(subset=['Compressional slowness'])
X_comp=pd.DataFrame(X_comp['Compressional slowness'])
nocomp_index=df[features].index.isin([*X_comp.index,*X_cgr.index,*X_cgdn.index,*X_dens_res.index,*X_neut_res.index,
                                      *X_pred.index])
X_nocomp=df[features][~nocomp_index]
X_nocomp['DTSM']=0

from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler=MinMaxScaler()                           
X_pred_scaled=pd.DataFrame(scaler.fit_transform(X_pred))
X_comp_scaled=pd.DataFrame(scaler.fit_transform(X_comp.values))
X_neut_res_scaled=pd.DataFrame(scaler.fit_transform(X_neut_res))
X_dens_res_scaled=pd.DataFrame(scaler.fit_transform(X_dens_res))
X_cgdn_scaled=pd.DataFrame(scaler.fit_transform(X_cgdn))
X_cgr_scaled=pd.DataFrame(scaler.fit_transform(X_cgr))

models=['knn_reg3','knn_reg_cdgr3','knn_reg_cngr3','knn_reg_cgnd3','knn_reg_cgr3','knn_reg_comp3']
X_preds=[X_pred,X_dens_res,X_neut_res,X_cgdn,X_cgr,X_comp]
X_preds_scaled=[X_pred_scaled,X_dens_res_scaled,X_neut_res_scaled,X_cgdn_scaled,X_cgr_scaled,X_comp_scaled.values]
dflist=[]

for i in range(len(models)):
    preds=pd.DataFrame(predict(models[i],X_preds[i],X_preds_scaled[i]))
    dflist.append(preds)
    
dflist_f=pd.concat(dflist,axis=0)
dff=[dflist_f,pd.DataFrame(X_nocomp['DTSM'])]
df_comb=pd.concat(dff)

df_comb=pd.DataFrame(df_comb).sort_index()
df_f=df
df_f['DTSM']=df_comb

df_f.to_csv('Depth3.csv')

### Predictions for Depth interval 4

In [None]:
df=df5_depth4
features=['Resistivity','Gamma ray', 'Density', 'Neutron porosity', 'Compressional slowness']
X=df[features]
X_pred=X
X_pred.dropna(inplace=True)
rest_index=df[features].index.isin(X_pred.index)
X_rest=df[features][~rest_index]
X_neut_res=X_rest.dropna(subset=['Compressional slowness','Gamma ray','Neutron porosity','Resistivity'])
X_neut_res=X_neut_res[['Compressional slowness','Gamma ray','Neutron porosity','Resistivity']]
X_dens_res=X_rest.dropna(subset=['Compressional slowness','Gamma ray','Density','Resistivity'])
X_dens_res=X_dens_res[['Compressional slowness','Gamma ray','Density','Resistivity']]
nores_index=df[features].index.isin([*X_dens_res.index,*X_neut_res.index,*X_pred.index])
X_nores=df[features][~nores_index]
X_cgdn=X_nores.dropna(subset=['Compressional slowness','Gamma ray','Density','Neutron porosity'])
X_cgdn.drop('Resistivity',axis=1,inplace=True)
rest_index=df[features].index.isin([*X_cgdn.index,*X_dens_res.index,*X_neut_res.index,*X_pred.index])
X_rest=df[features][~rest_index]
X_cgr=X_rest.dropna(subset=['Compressional slowness','Gamma ray','Resistivity'])
X_cgr=X_cgr[['Compressional slowness','Gamma ray','Resistivity']]
last_index=df[features].index.isin([*X_cgr.index,*X_cgdn.index,*X_dens_res.index,*X_neut_res.index,*X_pred.index])
X_last=df[features][~last_index]
X_comp=X_last.dropna(subset=['Compressional slowness'])
X_comp=pd.DataFrame(X_comp['Compressional slowness'])
nocomp_index=df[features].index.isin([*X_comp.index,*X_cgr.index,*X_cgdn.index,*X_dens_res.index,*X_neut_res.index,
                                      *X_pred.index])
X_nocomp=df[features][~nocomp_index]
X_nocomp['DTSM']=0

from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler=MinMaxScaler()
X_pred_scaled=pd.DataFrame(scaler.fit_transform(X_pred))
X_comp_scaled=pd.DataFrame(scaler.fit_transform(X_comp.values))
X_neut_res_scaled=pd.DataFrame(scaler.fit_transform(X_neut_res))
X_dens_res_scaled=pd.DataFrame(scaler.fit_transform(X_dens_res))
X_cgdn_scaled=pd.DataFrame(scaler.fit_transform(X_cgdn))
#X_cgr_scaled=pd.DataFrame(scaler.fit_transform(X_cgr))

models=['knn_reg4','knn_reg_cdgr4','knn_reg_cngr4','knn_reg_cgnd4','knn_reg_comp4']
X_preds=[X_pred,X_dens_res,X_neut_res,X_cgdn,X_comp]
X_preds_scaled=[X_pred_scaled,X_dens_res_scaled,X_neut_res_scaled,X_cgdn_scaled,X_comp_scaled.values]
dflist=[]

for i in range(len(models)):
    preds=pd.DataFrame(predict(models[i],X_preds[i],X_preds_scaled[i]))
    dflist.append(preds)
    
dflist_f=pd.concat(dflist,axis=0)
dff=[dflist_f,pd.DataFrame(X_nocomp['DTSM'])]
df_comb=pd.concat(dff)

df_comb=pd.DataFrame(df_comb).sort_index()
df_f=df
df_f['DTSM']=df_comb

df_f.to_csv('Depth4.csv')

## Combining predictions and exporting 

In [None]:
df_1=pd.read_csv('Depth1.csv',index_col=False)
df_1.drop('Unnamed: 0',axis=1,inplace=True)

df_2=pd.read_csv('Depth2.csv',index_col=False)
df_2.drop('Unnamed: 0',axis=1,inplace=True)

df_3=pd.read_csv('Depth3.csv',index_col=False)
df_3.drop('Unnamed: 0',axis=1,inplace=True)

df_4=pd.read_csv('Depth4.csv',index_col=False)
df_4.drop('Unnamed: 0',axis=1,inplace=True)

df_f=pd.concat([df_1,df_2,df_3,df_4])
df_f=df_f.sort_values(['Well name','DEPT'])
df_f.reset_index(drop=True,inplace=True)

In [None]:
df_f.columns=['Resistivity', 'Gamma ray', 'Caliper', 'Photoelectric', 'Density',
       'Neutron porosity', 'Density porosity', 'Compressional slowness',
       'Depth', 'Well name', 'DTSM']
UniqueNames=df_f['Well name'].unique()
UniqueNames

In [None]:
UniqueNames=['00d02be79f49_TGS', '0a7822c59487_TGS',
       '113412eec2a6_TGS', '1684cc35f399_TGS',
       '20372701d5e2_TGS', '2f96a5f92418_TGS',
       '302460e3021a_TGS', '3369b6f8fb6f_TGS',
       '34a80ab7a5fa_TGS', '63250f7d463b_TGS',
       '638f2cc65681_TGS', '7595ba9fb314_TGS',
       '84c5fb9cc880_TGS', '8e37531ba266_TGS',
       '94c1f5cae85c_TGS', 'ae16a9f64878_TGS',
       'ed48bda2217f_TGS', 'eed1e9537976_TGS',
       'fca03aa6acde_TGS', 'ff7845ea074d_TGS']

DataFrameDict = {elem : pd.DataFrame for elem in UniqueNames}

for key in DataFrameDict.keys():
    DataFrameDict[key] = df_f[:][df_f['Well name'] == key]
    DataFrameDict[key][['Depth','DTSM']].to_excel(key+'.xlsx',index=False)