<a href="https://www.kaggle.com/code/priyam8210/ml-petro?scriptVersionId=246472801" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings 
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format ='retina'
sns.set(style='ticks')

plt.rc('figure', figsize=(6, 3.7), dpi=100) 
plt.rc('axes', labelpad=20, facecolor="#ffffff", 
       linewidth=0.4, grid=True, labelsize=10) 
plt.rc('xtick.major', width=0.2) 
plt.rc('ytick.major', width=0.2) 
plt.rc('grid', color='#EEEEEE', linewidth=0.25)
plt.rc('font', family='Arial', weight='400', size=10)
plt.rc('text', color='#282828')
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
plt.rc('savefig', pad_inches=0.3, dpi=300)

## Reading Data

In [None]:
# Load the training and test tables from the Kaggle input folder.

df_tr, df_te = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/cheating/train.csv',
                     '/kaggle/input/cheating/test.csv')
)

In [None]:
# (rows, columns) of the raw training table.
df_tr.shape

In [None]:
# Count missing values per training column.
# NaN alone under-reports missing data here: the logs also use -999.0 as a
# "no reading" sentinel (rows carrying it are dropped further down), so the
# sentinel is treated as missing as well.

df_tr.replace(-999.0, np.nan).isnull().sum()


In [None]:
# Count missing values per test column, treating the -999.0 sentinel used in
# these logs as missing in addition to NaN.
df_te.replace(-999.0, np.nan).isnull().sum()

## Looking at the unique values in the log data

In [None]:
# Print the frequency table of distinct values for every training column.
for col, column_values in df_tr.items():
    print(column_values.value_counts())

In [None]:
# Column names of the training table.
df_tr.columns

## Plotting log data in the desired format

In [None]:
import pylab
from mpl_toolkits import mplot3d
from matplotlib.ticker import ScalarFormatter



def plotalllogs(dataframe):
    """Draw a six-track composite plot of the log columns of one table.

    Tracks, left to right: GR, CAL, resistivity (HRD and HRM on a
    logarithmic x-axis), porosity (CNC with ZDEN overlaid on a secondary
    x-axis), PE, and DT (DTC and DTS). Every curve is plotted against the
    DataFrame index on one shared, inverted y-axis.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns GR, CAL, HRD, HRM, ZDEN, CNC, PE, DTC
        and DTS.

    Returns
    -------
    None
        The figure is created as a side effect and left for the active
        matplotlib backend to display.

    Notes
    -----
    The function updates the global rcParams (font size, font family,
    default figure size); figures created afterwards inherit them.
    """
    n_tracks = 6
    linewidth = 0.4
    # Column position of each track, left to right.
    track_gr, track_cal, track_res, track_por, track_pe, track_dt = range(n_tracks)

    def plot_curve(ax, column, **style):
        """Plot one column against the DataFrame index on ``ax``."""
        ax.plot(dataframe[column], dataframe.index, label=column,
                lw=linewidth, **style)

    def label_on_top(ax, xlabel, xlim):
        """Set the x label and limits, then move ticks and label to the top."""
        ax.set_xlabel(xlabel)
        ax.set_xlim(*xlim)
        ax.xaxis.set_ticks_position('top')
        ax.xaxis.set_label_position('top')

    pylab.rcParams.update({'font.size': 13, 'font.family': 'serif', 'figure.figsize': (10,20)})
    fig, axes = plt.subplots(1, n_tracks, sharey=True, figsize=(15, 15), dpi=100)
    # The y-axis is shared, so inverting it once flips every track.
    fig.gca().invert_yaxis()

    ## GR track
    gr_ax = axes[track_gr]
    gr_ax.grid(which='both')
    plot_curve(gr_ax, 'GR', color='green')
    label_on_top(gr_ax, 'GR, GAPI', (0, 400))

    ## CAL track
    cal_ax = axes[track_cal]
    cal_ax.grid(which='both')
    plot_curve(cal_ax, 'CAL', color='black')
    label_on_top(cal_ax, 'CAL, IN', (0, 20))

    ## Resistivity track (logarithmic x-axis, plain-number tick labels)
    res_ax = axes[track_res]
    res_ax.set_xscale('log')
    res_ax.grid(which='both')
    plot_curve(res_ax, 'HRD', color='red')
    plot_curve(res_ax, 'HRM', color='black', ls='--')
    res_ax.legend(loc=1)
    label_on_top(res_ax, 'Resistivity, ohm.m', (0.2, 2000))
    res_ax.set_xticks([1, 10, 100, 1000])
    res_ax.xaxis.set_major_formatter(ScalarFormatter())

    ## ZDEN, drawn on a secondary x-axis over the porosity track
    zden_ax = axes[track_por].twiny()
    plot_curve(zden_ax, 'ZDEN', color='red')
    # Push the secondary axis outward so it does not overlap the CNC axis.
    zden_ax.spines['top'].set_position(('outward', 40))
    label_on_top(zden_ax, 'ZDEN', (1.5, 3))
    zden_ax.legend(bbox_to_anchor=(0.6, .9))

    ## CNC on the porosity track (x-axis reversed: 0.6 on the left, 0 on the right)
    cnc_ax = axes[track_por]
    cnc_ax.grid(which='both')
    plot_curve(cnc_ax, 'CNC', color='blue')
    label_on_top(cnc_ax, 'CNC', (0.60, 0))
    cnc_ax.legend(bbox_to_anchor=(0.6, 1))
    cnc_ax.set_xticks([0.6, 0.45, 0.3, 0.15, 0])

    ## PE track
    pe_ax = axes[track_pe]
    pe_ax.grid(which='both')
    plot_curve(pe_ax, 'PE')
    label_on_top(pe_ax, 'PE, barns/electron', (-5, 10))

    ## DT track: DTC and DTS share one axis (reversed: 500 on the left)
    dt_ax = axes[track_dt]
    dt_ax.grid(which='both')
    plot_curve(dt_ax, 'DTC')
    plot_curve(dt_ax, 'DTS')
    label_on_top(dt_ax, 'DT', (500, 40))
    # Two curves share this track, so a legend is needed to tell them apart.
    dt_ax.legend(loc=1)

    

In [None]:
# Composite log plot of the raw training table (before any rows are dropped).
plotalllogs(df_tr)

In [None]:
# Work on a copy so that the raw training table df_tr stays untouched.
df_tra = df_tr.copy()

In [None]:
# Remove every row in which any of these columns holds the -999.0 sentinel.
sentinel_cols = ['CNC', 'ZDEN', 'GR', 'CAL', 'DTS', 'DTC']
has_sentinel = (df_tra[sentinel_cols] == -999.0).any(axis=1)
df = df_tra.loc[~has_sentinel].copy()

In [None]:
def plotalllogs1(dataframe, test):
    """Plot GR, resistivity and porosity tracks of two tables side by side.

    Tracks 1-3 show ``dataframe`` and tracks 4-6 show ``test``, in the
    order GR, resistivity (HRD and HRM, logarithmic x-axis), porosity (CNC
    with ZDEN overlaid on a secondary x-axis). Curve and axis labels of
    the ``test`` tracks carry a ``2`` suffix. Every curve is plotted
    against its own table's index on one shared, inverted y-axis.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        First table; must contain the columns GR, HRD, HRM, ZDEN and CNC.
    test : pandas.DataFrame
        Second table; needs the same columns.

    Returns
    -------
    None
        The figure is created as a side effect and left for the active
        matplotlib backend to display.

    Notes
    -----
    The function updates the global rcParams (font size, font family,
    default figure size); figures created afterwards inherit them.
    """
    n_tracks = 6
    linewidth = 0.4

    def label_on_top(ax, xlabel, xlim):
        """Set the x label and limits, then move ticks and label to the top."""
        ax.set_xlabel(xlabel)
        ax.set_xlim(*xlim)
        ax.xaxis.set_ticks_position('top')
        ax.xaxis.set_label_position('top')

    def draw_three_tracks(data, first_track, suffix):
        """Draw the GR / resistivity / porosity tracks of ``data``.

        ``first_track`` is the index of the GR track; the other two follow
        directly to its right. ``suffix`` is appended to the curve labels
        and to the GR, ZDEN and CNC axis labels.
        """
        ## GR track
        gr_ax = axes[first_track]
        gr_ax.grid(which='both')
        gr_ax.plot(data['GR'], data.index, label='GR' + suffix,
                   color='green', lw=linewidth)
        label_on_top(gr_ax, 'GR' + suffix + ', GAPI', (0, 400))

        ## Resistivity track (logarithmic x-axis, plain-number tick labels)
        res_ax = axes[first_track + 1]
        res_ax.set_xscale('log')
        res_ax.grid(which='both')
        res_ax.plot(data['HRD'], data.index, label='HRD' + suffix,
                    color='red', lw=linewidth)
        res_ax.plot(data['HRM'], data.index, label='HRM' + suffix,
                    color='black', ls='--', lw=linewidth)
        res_ax.legend(loc=1)
        label_on_top(res_ax, 'Resistivity, ohm.m', (0.2, 2000))
        res_ax.set_xticks([1, 10, 100, 1000])
        res_ax.xaxis.set_major_formatter(ScalarFormatter())

        ## ZDEN, drawn on a secondary x-axis over the porosity track
        zden_ax = axes[first_track + 2].twiny()
        zden_ax.plot(data['ZDEN'], data.index, label='ZDEN' + suffix,
                     color='red', lw=linewidth)
        # Push the secondary axis outward so it does not overlap the CNC axis.
        zden_ax.spines['top'].set_position(('outward', 40))
        label_on_top(zden_ax, 'ZDEN' + suffix, (1.5, 3))
        zden_ax.legend(bbox_to_anchor=(0.6, .9))

        ## CNC on the porosity track (x-axis reversed: 0.6 on the left)
        cnc_ax = axes[first_track + 2]
        cnc_ax.grid(which='both')
        cnc_ax.plot(data['CNC'], data.index, label='CNC' + suffix,
                    color='blue', lw=linewidth)
        label_on_top(cnc_ax, 'CNC' + suffix, (0.60, 0))
        cnc_ax.legend(bbox_to_anchor=(0.6, 1))
        cnc_ax.set_xticks([0.6, 0.45, 0.3, 0.15, 0])

    pylab.rcParams.update({'font.size': 13, 'font.family': 'serif', 'figure.figsize': (10,20)})
    fig, axes = plt.subplots(1, n_tracks, sharey=True, figsize=(15, 15), dpi=100)
    # The y-axis is shared, so inverting it once flips every track.
    fig.gca().invert_yaxis()

    draw_three_tracks(dataframe, 0, '')
    draw_three_tracks(test, 3, '2')



In [None]:
# Work on a copy so that the raw test table df_te stays untouched.
df_t = df_te.copy()

In [None]:
# Add base-10 logarithms of both resistivity columns to the training and
# test tables.

log_columns = {'log_hrd': 'HRD', 'log_hrm': 'HRM'}
for frame in (df, df_t):
    for log_name, source_name in log_columns.items():
        frame[log_name] = np.log10(frame[source_name])

index_min = 13400
index_max = 19000

In [None]:
# Compare the cleaned training table with the raw test table, track by track.
plotalllogs1(df, df_te)

In [None]:
# The raw resistivity columns are superseded by their log10 versions.
raw_resistivity = ['HRD', 'HRM']
df = df.drop(columns=raw_resistivity)
df_t = df_t.drop(columns=raw_resistivity)

In [None]:
# Pairwise correlation of all remaining training columns, shown as an
# annotated heatmap.
cor_mat = df.corr()
plt.figure(figsize=(16, 8))
sns.heatmap(data=cor_mat, annot=True)

In [None]:
# Drop log_hrm from both tables; log_hrd is kept as the only resistivity feature.
df = df.drop(columns=['log_hrm'])
df_t = df_t.drop(columns=['log_hrm'])

In [None]:
# CNC plotted against PE and ZDEN, one scatter panel per x variable.
x_vars = ['PE', 'ZDEN']
g = sns.PairGrid(data=df, x_vars=x_vars, y_vars=['CNC'])
g.map(
    plt.scatter,
    color="orange",
    edgecolors="#000000",
    linewidths=0.5,
)


In [None]:
# DTC plotted against PE, ZDEN and CNC, one scatter panel per x variable.
x_vars = ['PE', 'ZDEN', 'CNC']
g = sns.PairGrid(data=df, x_vars=x_vars, y_vars=['DTC'])
g.map(
    plt.scatter,
    color="orange",
    edgecolors="#000000",
    linewidths=0.5,
)


## Analysing how the highly correlated features vary with the target values, i.e., DTC and DTS

In [None]:
# DTS plotted against PE, ZDEN and CNC, one scatter panel per x variable.
x_vars = ['PE', 'ZDEN', 'CNC']
g = sns.PairGrid(data=df, x_vars=x_vars, y_vars=['DTS'])
g.map(
    plt.scatter,
    color="orange",
    edgecolors="#000000",
    linewidths=0.5,
)


In [None]:
# Snapshot of the cleaned training table, taken before the range filter
# applied further down. A real copy (not just a second name for the same
# object) keeps the snapshot independent of any later in-place change to df.
df_tr_copy = df.copy()

## Finding outliers in the log data, i.e., the features

In [None]:
from scipy import stats

# For every feature column, list the rows lying more than 3 standard
# deviations away from that column's mean (in either direction).
for col in df_tr_copy.drop(['DTC', 'DTS'], axis=1).columns:
    # Keep the z-scores as a plain NumPy array so the mask selects rows by
    # position. Wrapping them in a new Series would give them a fresh
    # 0..n-1 index that no longer matches df_tr_copy's index (rows were
    # dropped earlier), and label-aligned boolean indexing would then fail
    # or pick the wrong rows.
    outlier_score = np.asarray(stats.zscore(df_tr_copy[col].to_numpy()))
    is_outlier = np.abs(outlier_score) > 3
    print(col, '::', int(is_outlier.sum()), 'outlier rows')
    print(df_tr_copy[is_outlier])

### In this data many outliers are present, but we cannot remove them, because they may represent additional features of the data

## Analysing outliers in the data by plotting pairs of features

In [None]:
# ZDEN plotted against CAL, CNC, GR and PE, one scatter panel per x variable.
x_vars = ['CAL', 'CNC', 'GR', 'PE']
g = sns.PairGrid(data=df, x_vars=x_vars, y_vars=['ZDEN'])
g.map(
    plt.scatter,
    color="orange",
    edgecolors="#000000",
    linewidths=0.5,
)


In [None]:
# PE plotted against CAL, CNC, GR and ZDEN, one scatter panel per x variable.
x_vars = ['CAL', 'CNC', 'GR', 'ZDEN']
g = sns.PairGrid(data=df, x_vars=x_vars, y_vars=['PE'])
g.map(
    plt.scatter,
    color="orange",
    edgecolors="#000000",
    linewidths=0.5,
)


In [None]:
# GR plotted against CAL, CNC, PE and ZDEN, one scatter panel per x variable.
x_vars = ['CAL', 'CNC', 'PE', 'ZDEN']
g = sns.PairGrid(data=df, x_vars=x_vars, y_vars=['GR'])
g.map(
    plt.scatter,
    color="orange",
    edgecolors="#000000",
    linewidths=0.5,
)


In [None]:
# CNC plotted against CAL, GR, PE and ZDEN, one scatter panel per x variable.
x_vars = ['CAL', 'GR', 'PE', 'ZDEN']
g = sns.PairGrid(data=df, x_vars=x_vars, y_vars=['CNC'])
g.map(
    plt.scatter,
    color="orange",
    edgecolors="#000000",
    linewidths=0.5,
)


In [None]:
# Keep only the rows whose CNC, ZDEN and DTC values all lie strictly inside
# the ranges below.

cnc_in_range = (df['CNC'] > -0.2) & (df['CNC'] < 1)
zden_in_range = (df['ZDEN'] > 1.75) & (df['ZDEN'] < 3)
dtc_in_range = (df['DTC'] > 40) & (df['DTC'] < 160)
df = df[cnc_in_range & zden_in_range & dtc_in_range]

In [None]:
# Three cross-plots of the filtered data, left to right:
# CNC vs ZDEN, DTC vs ZDEN, CNC vs DTC.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 15))
ax1, ax2, ax3 = axes

ax1.scatter(df['CNC'], df['ZDEN'])

ax2.scatter(df['DTC'], df['ZDEN'])
ax2.set_xlim(40, 160)
ax2.set_ylim(1, 3)

ax3.scatter(df['CNC'], df['DTC'])
ax3.set_ylim(40, 160)
ax3.set_xlim(-0.2, 1)

## Looking at how symmetric the data distribution is, i.e., the skewness of the data

### If the skewness is between -1 and 1, the data is roughly symmetric (not highly skewed)

In [None]:
# Print the skewness of every column of the pre-filter training snapshot.
for col, column_values in df_tr_copy.items():
    print(col, '::', column_values.skew())

### The data is highly skewed

### extracting features and targets

In [None]:
# Columns of the prepared test table (should match the training features).
df_t.columns

In [None]:

# Features are every column except the two targets; the first target is DTC.
target_cols = ['DTC', 'DTS']
x_tr = df.drop(columns=target_cols)
y_tr1 = df['DTC']

In [None]:
# Number of columns in the prepared test table.
df_t.shape[1]

In [None]:
from sklearn.preprocessing import RobustScaler
# Scaler instance shared by the feature-scaling cells below.
scaler = RobustScaler()


In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to the test features. Re-fitting on the test table would scale
# train and test with different centres/ranges, so the model would see test
# inputs on a different scale than the one it was trained on.
feature_names = list(x_tr.columns)
x_tr = scaler.fit_transform(x_tr)
# Select the test columns in training order so each feature lines up.
df_te = scaler.transform(df_t[feature_names])

## Applying Random Forest model for DTC prediction

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20 % of the scaled training rows (fixed seed for repeatability).
x_tr, x_te, y_tr, y_te = train_test_split(
    x_tr,
    y_tr1,
    test_size=0.2,
    random_state=9,
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import KFold


## Hyperparameter tuning for Random Forest with RandomizedSearchCV

In [None]:
# Randomised search over random-forest hyper-parameters for the DTC target:
# 10 sampled candidates, 5 unshuffled folds, scored by negative MAE.
cv = KFold(n_splits=5, shuffle=False)
param = dict(
    n_estimators=range(60, 95),
    max_depth=range(20, 80),
    min_samples_split=range(2, 5),
    min_samples_leaf=range(3, 6),
    max_features=range(1, x_tr.shape[1]),
)
clf1 = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=2),
    param_distributions=param,
    n_iter=10,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
    random_state=2,
)
clf1.fit(x_tr, y_tr)
print(clf1.best_params_)

In [None]:
# Refit a forest with the best hyper-parameters on the TRAINING split, so
# that x_te / y_te remain unseen data for evaluation. The seed matches the
# one used during the search to keep the result reproducible.
model = RandomForestRegressor(**clf1.best_params_, oob_score=True, random_state=2)
model.fit(x_tr, y_tr)

# Score the model on the hold-out split it has not been trained on.
dtc_holdout_pred = model.predict(x_te)
print('hold-out R2 :', r2_score(y_te, dtc_holdout_pred))
print('hold-out MAE:', mean_absolute_error(y_te, dtc_holdout_pred))

In [None]:
# Out-of-bag score of the fitted DTC forest (available because oob_score=True).
model.oob_score_

### Here we get an out-of-bag (OOB) R² score above 97 percent

In [None]:
# Predict DTC for the test data.
# The forest was trained on scaled features, so it must be given the scaled
# test matrix (df_te); the unscaled table df_t is on a different scale and
# would produce unreliable predictions.

dtc_pred = model.predict(df_te)

## Random Forest Model for DTS Prediction

In [None]:
# Rebuild the unscaled feature table (all columns except the two targets)
# and select DTS as the second target.
target_cols = ['DTC', 'DTS']
x_tr = df.drop(columns=target_cols)
y_tr2 = df['DTS']

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to the test features. Re-fitting on the test table would scale
# train and test with different centres/ranges, so the model would see test
# inputs on a different scale than the one it was trained on.
feature_names = list(x_tr.columns)
x_tr = scaler.fit_transform(x_tr)
# Select the test columns in training order so each feature lines up.
df_te = scaler.transform(df_t[feature_names])

In [None]:
# (rows, features) of the scaled training matrix.
x_tr.shape

In [None]:
# Length of the DTS target; must equal the number of rows in x_tr.
y_tr2.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20 % of the scaled training rows for the DTS model.
# NOTE: y_tr1 is rebound here; from this point on it holds the DTS training
# split, no longer the full DTC target.
x_tr1, x_te1, y_tr1, y_te1 = train_test_split(
    x_tr,
    y_tr2,
    test_size=0.2,
    random_state=9,
)

In [None]:
# Randomised search over random-forest hyper-parameters for the DTS target:
# 10 sampled candidates, 5 unshuffled folds, scored by negative MAE.
cv = KFold(n_splits=5, shuffle=False)
param = dict(
    n_estimators=range(60, 95),
    max_depth=range(30, 70),
    min_samples_split=range(2, 5),
    min_samples_leaf=range(3, 6),
    max_features=range(1, x_tr.shape[1]),
)
clf2 = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=2),
    param_distributions=param,
    n_iter=10,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
    random_state=2,
)
clf2.fit(x_tr1, y_tr1)
print(clf2.best_params_)

In [None]:
# Refit a forest with the best hyper-parameters on the TRAINING split, so
# that x_te1 / y_te1 remain unseen data for evaluation. The seed matches the
# one used during the search to keep the result reproducible.
model1 = RandomForestRegressor(**clf2.best_params_, oob_score=True, random_state=2)
model1.fit(x_tr1, y_tr1)

# Score the model on the hold-out split it has not been trained on.
dts_holdout_pred = model1.predict(x_te1)
print('hold-out R2 :', r2_score(y_te1, dts_holdout_pred))
print('hold-out MAE:', mean_absolute_error(y_te1, dts_holdout_pred))

In [None]:
# Out-of-bag score of the fitted DTS forest (available because oob_score=True).
model1.oob_score_

In [None]:
# Predict DTS for the test data from the scaled test matrix.
dts_pred = model1.predict(df_te)

In [None]:
# Sanity check: number of DTS predictions.
dts_pred.shape

In [None]:
# Sanity check: number of DTC predictions (must equal the DTS count).
dtc_pred.shape

In [None]:
# Put the two prediction vectors side by side and name the columns after the
# targets, so the exported CSV header reads DTC,DTS instead of the default 0,1.
targets = pd.DataFrame({'DTC': dtc_pred, 'DTS': dts_pred})

In [None]:
# Display the assembled prediction table.
targets

In [None]:
# Write the predictions to the submission file, without the row index.
targets.to_csv(path_or_buf='submission.csv', index=False)