# Library Import

In [None]:
!pip install wellpathpy -q
!pip install phik -q

In [None]:
import pandas as pd
import numpy as np
import wellpathpy as wp
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import phik

from scipy.stats import zscore
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Data extraction

In [None]:
# Do not truncate columns when rendering wide DataFrames.
pd.set_option('display.max_columns', None)

In [None]:
# Load the three phase-1 training CSVs (paths are relative to the notebook's working directory).
training_dataset_1 = pd.read_csv("phase_1/Training_dataset_1.csv")
training_dataset_2 = pd.read_csv("phase_1/Training_dataset_2.csv")
training_dataset_3 = pd.read_csv("phase_1/Training_dataset_3.csv")

# Load the three matching test CSVs.
test_1 = pd.read_csv("phase_1/Test_dataset_1.csv")
test_2 = pd.read_csv("phase_1/Test_dataset_2.csv")
test_3 = pd.read_csv("phase_1/Test_dataset_3.csv")

In [None]:
# Dictionaries mapping a dataset name to its frame, for easier iteration.
# NOTE: they hold references to the frames as loaded here; names that are later
# rebound to new frames (e.g. by tvd_counting) are not reflected in these dicts.
training_datasets = {
    "training_dataset_1": training_dataset_1,
    "training_dataset_2": training_dataset_2,
    "training_dataset_3": training_dataset_3
}

test_datasets = {
    "test_1": test_1,
    "test_2": test_2,
    "test_3": test_3
}

# Data exploration

In [None]:
def data_exploration(df, name):
    """Show a quick overview of ``df``.

    Displays the first rows, the summary statistics, the ``info()`` report,
    the total number of missing cells and the number of fully duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str
        Label used in the displayed messages.
    """
    display(df.head())
    display(df.describe())
    display(f'Info about data in {name}:')
    # info() prints its report itself and returns None; wrapping it in display()
    # only adds a stray "None" to the output.
    df.info()
    display(f'Nan in {name} dataset: {df.isna().sum().sum()}')
    # Duplicate count is purely informational.
    display(f'Duplicate rows in {name} dataset: {df.duplicated().sum()}')

In [None]:
for name, dataset in training_datasets.items():
    print(f"Dataset: {name}")
    data_exploration(dataset, name)

In [None]:
for name, dataset in test_datasets.items():
    print(f"Dataset: {name}")
    data_exploration(dataset, name)

All columns except `DateTime` are of *float64* type. Let's convert `DateTime` to a proper datetime type right away:

In [None]:
# Convert the DateTime column of every training frame to pandas datetime (modifies the frames in place)
# and print the resulting dtype as a check.
for name, dataset in training_datasets.items():
    dataset['DateTime'] = pd.to_datetime(dataset['DateTime'])
    print(f'{name} datset DateTime type:', dataset['DateTime'].dtype)

In [None]:
# Same DateTime conversion for the test frames (in place), with a dtype check.
for name, dataset in test_datasets.items():
    dataset['DateTime'] = pd.to_datetime(dataset['DateTime'])
    print(f'{name} datset DateTime type:', dataset['DateTime'].dtype)

# Feature engineering

## TVD count

In [None]:
def tvd_counting (df, 
                  columns_to_select=['Depth(ft)', 
                 'Svy Inclination (deg)', 
                 'Svy Azimuth (deg)'], 
                  md_depth='Depth(ft)', 
                  inc_svy='Svy Inclination (deg)', 
                  az_svy='Svy Azimuth (deg)',
                  tvd_svy = 'Svy Depth (ft)'):
    """Add a ``tvd`` feature computed from the survey columns with wellpathpy.

    Steps:
      1. clamp negative azimuth / inclination readings to 0 and reset
         inclination readings of exactly 90 at depths below 500 to 0;
      2. build a de-duplicated, depth-sorted (md, inc, azi) table and run
         wellpathpy's minimum-curvature method on it;
      3. left-merge the resulting depth back onto the data by measured depth;
      4. store ``(tvd + depth of the first row) / measured depth`` in ``tvd``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is NOT modified; a processed copy is returned.
    columns_to_select : list of str
        The three columns used as measured depth, inclination and azimuth,
        in that order.
    md_depth, inc_svy, az_svy : str
        Column names of measured depth, survey inclination and survey azimuth.
    tvd_svy : str
        Column shown next to the computed ``tvd`` in the displayed check table.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` (with the clamped survey values) plus the ``tvd`` column.
    """
    # Work on a copy so the caller's frame is not silently altered by the clamping below.
    df = df.copy()

    df.loc[df[az_svy] < 0, az_svy] = 0
    df.loc[df[inc_svy] < 0, inc_svy] = 0
    df.loc[(df[inc_svy] == 90) & (df[md_depth] < 500), inc_svy] = 0

    # Explicit .copy(): the columns are renamed below, which must not touch df.
    survey = df.sort_values(by=md_depth, ascending=True)[list(columns_to_select)].copy()
    survey.columns = ['md', 'inc', 'azi']

    print('Check increasing', survey['md'].is_monotonic_increasing)

    # NOTE(review): duplicates are dropped on the full (md, inc, azi) triple, so the
    # same depth may still appear with different survey values; such rows would be
    # duplicated by the merge below -- confirm this cannot happen in the data.
    survey = (survey.drop_duplicates()
                    .sort_values(by='md', ascending=True)
                    .reset_index(drop=True))

    dev = wp.deviation(
        md = survey['md'],
        inc = survey['inc'],
        azi = survey['azi']
    )
    pos = dev.minimum_curvature(course_length=30)

    survey['tvd'] = pos.depth.round(2)

    tvd_by_depth = survey.drop(columns=['inc', 'azi']).rename(columns={'md': md_depth})

    df_1 = pd.merge(df, tvd_by_depth, on=md_depth, how='left')

    print('NaN checking -', df_1.tvd.isna().sum())

    # Shift by the measured depth of the first row, then express as a ratio to measured depth.
    df_1['tvd'] = df_1['tvd'] + df_1.loc[0, md_depth]
    df_1['tvd'] = df_1['tvd'] / df_1[md_depth]
    display(df_1[[md_depth, tvd_svy, 'tvd']])
    return df_1

In [None]:
# Add the tvd feature to every dataset. Each name is rebound to the frame returned by
# tvd_counting; the training_datasets / test_datasets dicts still reference the earlier frames.
training_dataset_1 = tvd_counting(training_dataset_1)
training_dataset_2 = tvd_counting(training_dataset_2)
training_dataset_3 = tvd_counting(training_dataset_3)

test_1 = tvd_counting(test_1)
test_2 = tvd_counting(test_2)
test_3 = tvd_counting(test_3)

# Choose necessary features

In [None]:
# Columns kept for analysis and modelling; includes the target 'Bttm Pipe Temp (°F)'
# and the engineered 'tvd' column.
IMPORTANT_FEATURES = ['AD ROP SP (ft/hr)', 
                        'AD WOB SP (klb)',
                        'Annular Velocity (ft/min)',
                        'BHA Length (ft)',
                        'Bit Size (in)',
                        'Bit Time (hr)',
                        'Bit Weight (klb)',
                        'Bttm Pipe Temp (°F)',
                        'Circulating Hrs (hr)',
                        'Co. Man G/L (bbl)', 
                        'Depth(ft)',
                        'Diff Press (psi)',
                        'DateTime',
                        'Flow Deviation (%)', 
                        'Flow In Rate (galUS/min)', 
                        'Flow Out Percent (%)',
                        'Gain Loss (bbl)',
                        'Gain Loss - Spare (bbl)',
                        'Hook Load (klb)',
                        'ML Mud Temp IN (°F)', 
                        'ML Mud Temp OUT (°F)', 
                        'Mud Motor Torque (ft·lbf)', 
                        'Mud Temp In (°F)', 
                        'Mud Temp Out (°F)', 
                        'Mud Volume (bbl)', 
                        'Pump Pressure (psi)', 
                        'Pump SPM - Total (SPM)', 
                        'ROP - Average (ft/hr)', 
                        'Strks - Total (strokes)', 
                        'Suction 1 Pit 9 (bbl)', 
                        'Suction 2 Pit 10 (bbl)', 
                        'Suction 3 Pit 11 (bbl)', 
                        'Time On Job (hr)', 
                        'Total Rotations on DP ()', 
                        'WC Bit Weight (klb)', 
                        'Washout Factor ()',
                        'tvd']       

# Same list without the target column, used to select columns from the test frames.
IMPORTANT_FEATURES_test = IMPORTANT_FEATURES.copy()
IMPORTANT_FEATURES_test.remove('Bttm Pipe Temp (°F)')

In [None]:
# Take explicit copies: these frames are edited with .loc in the preprocessing cells,
# and writing into a bare column selection raises SettingWithCopyWarning and leaves it
# ambiguous whether the parent frame is affected.
training_dataset_1_important = training_dataset_1[IMPORTANT_FEATURES].copy()
training_dataset_2_important = training_dataset_2[IMPORTANT_FEATURES].copy()
training_dataset_3_important = training_dataset_3[IMPORTANT_FEATURES].copy()

test_1_important = test_1[IMPORTANT_FEATURES_test].copy()
test_2_important = test_2[IMPORTANT_FEATURES_test].copy()
test_3_important = test_3[IMPORTANT_FEATURES_test].copy()

# EDA before preproc

In [None]:
def EDA(df, constant_name):
    """Draw one small scatter plot per column of ``df`` against ``constant_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; every column is used as the x-axis once.
    constant_name : str
        Column plotted on the y-axis of every figure.
    """
    y_values = df[constant_name]
    for column in df.columns:
        plt.figure(figsize=(3, 3))
        sns.scatterplot(data=df, x=df[column], y=y_values, alpha=0.2)
        plt.title(f'{column}')
        plt.show()

## Train

### First df

In [None]:
EDA(training_dataset_1_important, 'Bttm Pipe Temp (°F)')

### Second df

In [None]:
EDA(training_dataset_2_important, 'Bttm Pipe Temp (°F)')

### Third df

In [None]:
EDA(training_dataset_3_important, 'Bttm Pipe Temp (°F)')

## Test

### First test df

In [None]:
EDA(test_1_important, 'Depth(ft)')

### Second test df

In [None]:
EDA(test_2_important, 'Depth(ft)')

### Third test df

In [None]:
EDA(test_3_important, 'Depth(ft)')

# Data preproc

In [None]:
# Names (not objects) of the working frames. The preprocessing cells look each name up
# through globals() and rebind it to the processed frame.
datasets = [
    "training_dataset_1_important",
    "training_dataset_2_important",
    "training_dataset_3_important",
    "test_1_important",
    "test_2_important",
    "test_3_important",
]

## Train temp outliers check 

### First df

In [None]:
px.line(training_dataset_1_important['Bttm Pipe Temp (°F)'])

In [None]:
# Blank out target readings above 200 for index labels > 6000 and above 150 for index labels < 2000
# (hard-coded thresholds; presumably read off the line plot above -- confirm),
# then fill the gaps with the last preceding valid value.
training_dataset_1_important.loc[(training_dataset_1_important['Bttm Pipe Temp (°F)'] > 200) & (training_dataset_1_important.index > 6000), 'Bttm Pipe Temp (°F)'] = np.nan
training_dataset_1_important.loc[(training_dataset_1_important['Bttm Pipe Temp (°F)'] > 150) & (training_dataset_1_important.index < 2000), 'Bttm Pipe Temp (°F)'] = np.nan
training_dataset_1_important.loc[:, 'Bttm Pipe Temp (°F)'] = training_dataset_1_important['Bttm Pipe Temp (°F)'].ffill()

In [None]:
px.line(training_dataset_1_important['Bttm Pipe Temp (°F)'])

### Second df

In [None]:
px.line(training_dataset_2_important, x=training_dataset_2_important.index, y=['Bit Size (in)','Bttm Pipe Temp (°F)'])

In [None]:
# For index labels below 1500: blank out target readings outside the 50..140 range.
training_dataset_2_important.loc[(training_dataset_2_important.index<1500) &
                                ((training_dataset_2_important['Bttm Pipe Temp (°F)']>140) | 
                                 (training_dataset_2_important['Bttm Pipe Temp (°F)']<50)), 'Bttm Pipe Temp (°F)'] = np.nan

# For index labels below 200: additionally blank out readings above 90.
training_dataset_2_important.loc[(training_dataset_2_important.index<200) &
                                (training_dataset_2_important['Bttm Pipe Temp (°F)']>90), 'Bttm Pipe Temp (°F)'] = np.nan

# Fill the gaps with the next valid value (back-fill).
training_dataset_2_important.loc[:, 'Bttm Pipe Temp (°F)'] = training_dataset_2_important.loc[:, 'Bttm Pipe Temp (°F)'].bfill()

In [None]:
px.line(training_dataset_2_important, x=training_dataset_2_important.index, y=['Bit Size (in)','Bttm Pipe Temp (°F)'])

### Third df

There are no outliers.

## Annular Velocity (ft/min)  outliers check 

In [None]:
ANNULAR_VELOCITY = 0

In [None]:
def annular_velocity_correctin(df, value = 0):
    """Replace annular-velocity readings at or below ``value`` with the next valid reading.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'Annular Velocity (ft/min)' column. It is not modified.
    value : float, default 0
        Readings ``<= value`` are treated as invalid.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the invalid readings back-filled. Invalid readings
        with no later valid value stay NaN, because only ``bfill`` is applied.
    """
    column = 'Annular Velocity (ft/min)'
    # Copy first, like the other cleaning helpers in this notebook, so the caller's frame is untouched.
    df = df.copy()
    df.loc[df[column] <= value, column] = np.nan
    df[column] = df[column].bfill()
    return df

In [None]:
# Clean the annular velocity in every working frame, using the threshold configured in ANNULAR_VELOCITY.
for name in datasets:
    globals()[name] = annular_velocity_correctin(globals()[name], value=ANNULAR_VELOCITY)

## Bit weight

In [None]:
# Blank out bit-weight readings above 100 in training dataset 2, then fill the gaps
# forward and, for any leading gap, backward.
training_dataset_2_important.loc[training_dataset_2_important['Bit Weight (klb)'] > 100, 'Bit Weight (klb)'] = np.nan
training_dataset_2_important.loc[:, 'Bit Weight (klb)'] = training_dataset_2_important['Bit Weight (klb)'].ffill().bfill()

## Co. Man G/L (bbl)

In [None]:
def g_l_function(df):
    """Turn 'Co. Man G/L (bbl)' into a flag: 1 where the value is negative, 0 where it is positive.

    Zeros stay 0 and missing values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Co. Man G/L (bbl)' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the recoded column.

    Notes
    -----
    The recoding is not idempotent: applied to an already recoded column, the
    1 flags are positive and therefore become 0.
    """
    column = 'Co. Man G/L (bbl)'
    # Copy first so the caller's frame is untouched.
    df = df.copy()
    # Evaluate both masks on the raw values before writing anything.
    positive = df[column] > 0
    negative = df[column] < 0
    df.loc[positive, column] = 0
    df.loc[negative, column] = 1
    return df

In [None]:
# Recode 'Co. Man G/L (bbl)' in every working frame.
# NOTE: run this cell only once per kernel session -- g_l_function maps positive values to 0,
# so a second pass over the already recoded frames resets the 1 flags to 0.
for name in datasets:
    globals()[name] = g_l_function(globals()[name])

## Gain Loss (bbl)  outliers check 

In [None]:
# Reset gain/loss readings above 200 to 0 in training dataset 1.
training_dataset_1_important.loc[training_dataset_1_important['Gain Loss (bbl)'] > 200, 'Gain Loss (bbl)'] = 0

## Mud temp in|out

### first train df

In [None]:
px.line(training_dataset_1_important['ML Mud Temp IN (°F)'])

In [None]:
# Within the first 100 index labels, blank out ML mud inlet temperatures outside the 60..80 range,
# then fill the gaps with the next valid value.
training_dataset_1_important.loc[((training_dataset_1_important['ML Mud Temp IN (°F)'] < 60) |
                                 (training_dataset_1_important['ML Mud Temp IN (°F)'] > 80)) &
                                 (training_dataset_1_important.index < 100), 'ML Mud Temp IN (°F)'] = np.nan
training_dataset_1_important.loc[:, 'ML Mud Temp IN (°F)'] = training_dataset_1_important.loc[:, 'ML Mud Temp IN (°F)'].bfill()

In [None]:
px.line(training_dataset_1_important['ML Mud Temp OUT (°F)'])

In [None]:
# Blank out ML mud outlet temperatures above 500 and back-fill the gaps.
training_dataset_1_important.loc[training_dataset_1_important['ML Mud Temp OUT (°F)'] > 500, 'ML Mud Temp OUT (°F)'] = np.nan
training_dataset_1_important.loc[:, 'ML Mud Temp OUT (°F)'] = training_dataset_1_important.loc[:, 'ML Mud Temp OUT (°F)'].bfill()

### third train df

In [None]:
# Same rule for training dataset 3: outlet temperatures above 500 are blanked out and back-filled.
training_dataset_3_important.loc[training_dataset_3_important['ML Mud Temp OUT (°F)'] > 500, 'ML Mud Temp OUT (°F)'] = np.nan
training_dataset_3_important.loc[:, 'ML Mud Temp OUT (°F)'] = training_dataset_3_important.loc[:, 'ML Mud Temp OUT (°F)'].bfill()

### first test df

In [None]:
# Negative mud inlet temperatures in test set 1 are blanked out and back-filled.
test_1_important.loc[test_1_important['Mud Temp In (°F)'] < 0, 'Mud Temp In (°F)'] = np.nan
test_1_important.loc[:, 'Mud Temp In (°F)'] = test_1_important.loc[:, 'Mud Temp In (°F)'].bfill()

### Third test df

In [None]:
# Negative mud inlet temperatures in test set 3 are blanked out and back-filled.
test_3_important.loc[test_3_important['Mud Temp In (°F)'] < 0, 'Mud Temp In (°F)'] = np.nan
test_3_important.loc[:, 'Mud Temp In (°F)'] = test_3_important.loc[:, 'Mud Temp In (°F)'].bfill()

## Mud gradient count

In [None]:
def temp_preproc(df, lower_thresh = 0, upper_thresh = 100):
    """Add the two mud-temperature difference features (outlet minus inlet).

    Creates ``ML_mud_temp_grad`` from the 'ML Mud Temp IN/OUT (°F)' columns and
    ``Mud_temp_grad`` from the 'Mud Temp In/Out (°F)' columns. In each new
    column, values outside ``[lower_thresh, upper_thresh]`` are replaced by the
    next valid value, or by the previous one when no later value exists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the four temperature columns. It is not modified.
    lower_thresh, upper_thresh : float
        Inclusive bounds of the accepted difference.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the two new columns.
    """
    df = df.copy()
    df['ML_mud_temp_grad'] = df['ML Mud Temp OUT (°F)'] - df['ML Mud Temp IN (°F)']
    df['Mud_temp_grad'] = df['Mud Temp Out (°F)'] - df['Mud Temp In (°F)']

    # Each column is range-checked against ITSELF. The upper bound of
    # ML_mud_temp_grad used to be tested on Mud_temp_grad by mistake.
    for grad_col in ('ML_mud_temp_grad', 'Mud_temp_grad'):
        out_of_range = (df[grad_col] < lower_thresh) | (df[grad_col] > upper_thresh)
        df.loc[out_of_range, grad_col] = np.nan
        df[grad_col] = df[grad_col].bfill().ffill()

    return df

In [None]:
for name in datasets:
    globals()[name] = temp_preproc(globals()[name])

## Washout Factor () 

In [None]:
def washout_factor_count(df):
    """Return a copy of ``df`` where non-positive 'Washout Factor ()' values are back-filled.

    Values ``<= 0`` are treated as missing and replaced by the next valid value;
    trailing invalid values stay NaN.
    """
    column = 'Washout Factor ()'
    cleaned = df.copy()
    invalid = cleaned[column] <= 0
    cleaned.loc[invalid, column] = np.nan
    cleaned[column] = cleaned[column].bfill()
    return cleaned

In [None]:
for name in datasets:
    globals()[name] = washout_factor_count(globals()[name])

## Flow In Rate (galUS/min)

In [None]:
def flow_in_rate_count(df, value = 10):
    """Return a copy of ``df`` where flow-in rates at or below ``value`` are back-filled.

    Readings ``<= value`` are treated as missing and replaced by the next valid
    reading; trailing invalid readings stay NaN.
    """
    column = 'Flow In Rate (galUS/min)'
    cleaned = df.copy()
    too_low = cleaned[column] <= value
    cleaned.loc[too_low, column] = np.nan
    cleaned[column] = cleaned[column].bfill()
    return cleaned

In [None]:
for name in datasets:
    globals()[name] = flow_in_rate_count(globals()[name])

## ROP average correction

In [None]:
def ROP_correction(df, value = 800):
    """Return a copy of ``df`` where average-ROP readings above ``value`` are back-filled.

    Readings ``> value`` are treated as missing and replaced by the next valid
    reading; trailing invalid readings stay NaN.
    """
    column = 'ROP - Average (ft/hr)'
    cleaned = df.copy()
    too_high = cleaned[column] > value
    cleaned.loc[too_high, column] = np.nan
    cleaned[column] = cleaned[column].bfill()
    return cleaned

In [None]:
for name in datasets:
    globals()[name] = ROP_correction(globals()[name])

## Dif press

In [None]:
def dif_press_correction(df, value = -600):
    """Return a copy of ``df`` where differential-pressure readings below ``value`` are back-filled.

    Readings ``< value`` are treated as missing and replaced by the next valid
    reading; trailing invalid readings stay NaN.
    """
    column = 'Diff Press (psi)'
    cleaned = df.copy()
    too_low = cleaned[column] < value
    cleaned.loc[too_low, column] = np.nan
    cleaned[column] = cleaned[column].bfill()
    return cleaned

In [None]:
for name in datasets:
    globals()[name] = dif_press_correction(globals()[name])

# EDA after preproc

## First train df

In [None]:
EDA(training_dataset_1_important, 'Bttm Pipe Temp (°F)')

## Second train df

In [None]:
EDA(training_dataset_2_important, 'Bttm Pipe Temp (°F)')

## Third train df

In [None]:
EDA(training_dataset_3_important, 'Bttm Pipe Temp (°F)')

## First test df

In [None]:
EDA(test_1_important, 'Depth(ft)')

## Second test df

In [None]:
EDA(test_2_important, 'Depth(ft)')

## Third test df

In [None]:
EDA(test_3_important, 'Depth(ft)')

# Delete useless data

In [None]:
# Columns removed before modelling. The raw mud-temperature columns are listed here;
# the ML_mud_temp_grad / Mud_temp_grad columns derived from them are kept.
USELESS_COLUMNS = ['AD WOB SP (klb)',
                    'BHA Length (ft)',
                    'DateTime', 
                    'Flow Deviation (%)', 
                    'Flow Out Percent (%)',
                    'Gain Loss (bbl)',
                    'Gain Loss - Spare (bbl)',
                    'Hook Load (klb)',
                    'ML Mud Temp IN (°F)', 
                    'ML Mud Temp OUT (°F)', 
                    'Mud Motor Torque (ft·lbf)', 
                    'Mud Temp In (°F)', 
                    'Mud Temp Out (°F)', 
                    'Pump SPM - Total (SPM)', 
                    'Strks - Total (strokes)', 
                    'Suction 1 Pit 9 (bbl)', 
                    'Suction 2 Pit 10 (bbl)', 
                    'Suction 3 Pit 11 (bbl)', 
                    'Time On Job (hr)', 
                    'Total Rotations on DP ()'] 

In [None]:
# Drop the unused columns from every working frame.
# NOTE: re-running this cell raises KeyError, because the columns are already gone.
training_dataset_1_important = training_dataset_1_important.drop(columns=USELESS_COLUMNS, axis=1)
training_dataset_2_important = training_dataset_2_important.drop(columns=USELESS_COLUMNS, axis=1)
training_dataset_3_important = training_dataset_3_important.drop(columns=USELESS_COLUMNS, axis=1)

test_1_important = test_1_important.drop(columns=USELESS_COLUMNS, axis=1)
test_2_important = test_2_important.drop(columns=USELESS_COLUMNS, axis=1)
test_3_important = test_3_important.drop(columns=USELESS_COLUMNS, axis=1)

# Final exploration

## Circulating exp

In [None]:
def Circulating_exploration(df, df1):
    """Plot 'Circulating Hrs (hr)' against 'Depth(ft)' for two frames on the same axes.

    ``df`` is drawn first and ``df1`` second; the figure is shown afterwards.
    """
    for frame in (df, df1):
        sns.lineplot(data=frame, x='Depth(ft)', y='Circulating Hrs (hr)')
    plt.show()

In [None]:
Circulating_exploration(training_dataset_1_important, test_1_important)
Circulating_exploration(training_dataset_2_important, test_2_important)
Circulating_exploration(training_dataset_3_important, test_3_important)

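In the second dataset we prolong the circulating time: low values (< 100 hr) at depths beyond 6000 ft are shifted up by the maximum circulating time observed in the training data, and the whole test set is shifted by the same amount.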

In [None]:
# Largest circulating-hours value in training dataset 2; used in the next cell as the offset.
# NOTE: evaluate this before the next cell modifies the column, otherwise the maximum changes.
prolong_value = max(training_dataset_2_important['Circulating Hrs (hr)'])
display(prolong_value)

In [None]:
# Training rows with less than 100 circulating hours at depths beyond 6000 get the offset added.
cirk_condition = (training_dataset_2_important['Circulating Hrs (hr)'] <100) & (training_dataset_2_important['Depth(ft)'] > 6000)
training_dataset_2_important.loc[cirk_condition, 'Circulating Hrs (hr)'] = training_dataset_2_important.loc[cirk_condition, 'Circulating Hrs (hr)'] + prolong_value

# Every row of test set 2 gets the same offset.
# NOTE: not idempotent -- each re-run of this cell shifts test_2_important again.
test_2_important.loc[:, 'Circulating Hrs (hr)'] = test_2_important.loc[:, 'Circulating Hrs (hr)'] + prolong_value

In [None]:
Circulating_exploration(training_dataset_2_important, test_2_important)

## Final all data exploration

In [None]:
# One box plot per working frame: melt() stacks all columns into (variable, value) pairs
# so every column gets its own box. `df` is the frame's NAME here, looked up via globals().
for df in datasets:
    melted_df = globals()[df].melt()
    fig = px.box(melted_df, x='variable', y="value")
    fig.show()

# Correlation analysis

In [None]:
def correlation (x, num_cols):
    """Show the phik correlation matrix of ``x`` as an annotated heatmap.

    Parameters
    ----------
    x : pandas.DataFrame
        Data to analyse (uses the ``phik_matrix`` accessor registered by phik).
    num_cols : list-like of str
        Columns passed to phik as ``interval_cols``.
    """
    display('Correlation matrix  - phik Matrix')

    plt.figure(figsize=(14, 14))
    phik_corr = x.phik_matrix(interval_cols=num_cols)
    sns.heatmap(phik_corr, annot=True, cmap='cividis')
    plt.show()
    

# Unification of all train datasets

Since we merge the datasets, it makes sense to run the correlation analysis after all the training data have been combined.

For each training dataset, the last 500 rows are held out as a local test set; everything before them is used for training.

In [None]:
# Full training set (all rows of the three datasets); used later to refit the models
# before the blind prediction.
all_train_df = pd.concat([training_dataset_1_important, training_dataset_2_important, training_dataset_3_important], axis=0).reset_index(drop=True)

# Number of trailing rows of each dataset held out for local validation.
cut_value = 500

cut_1 = len(training_dataset_1_important) - cut_value
cut_2 = len(training_dataset_2_important) - cut_value
cut_3 = len(training_dataset_3_important) - cut_value


# Training part: rows whose index label is below the cut
# (assumes every frame still has its default 0..n-1 index -- TODO confirm).
cut_train_data = pd.concat([training_dataset_1_important[training_dataset_1_important.index<cut_1], 
                        training_dataset_2_important[training_dataset_2_important.index<cut_2], 
                        training_dataset_3_important[training_dataset_3_important.index<cut_3]], axis=0).reset_index(drop=True)

# Create the 3 hold-out test frames (rows from the cut onwards).

cut_test_1 = training_dataset_1_important[training_dataset_1_important.index>=cut_1]
cut_test_2 = training_dataset_2_important[training_dataset_2_important.index>=cut_2]
cut_test_3 = training_dataset_3_important[training_dataset_3_important.index>=cut_3]

In [None]:
correlation(cut_train_data, cut_train_data.columns)

**There is no Multicollinearity**

# Using ML pipeline

In [None]:
def X_y(df, name_y = 'Bttm Pipe Temp (°F)'):
    """Split ``df`` into a feature frame and a target series.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the target column.
    name_y : str
        Name of the target column.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``df`` without the target column, and the target column itself.
    """
    target = df[name_y]
    features = df.drop(columns=name_y)
    return features, target

In [None]:
X_train, y_train = X_y(cut_train_data)

X_test_1, y_test_1 = X_y(cut_test_1)
X_test_2, y_test_2 = X_y(cut_test_2)
X_test_3, y_test_3 = X_y(cut_test_3)

In [None]:
# 'Co. Man G/L (bbl)' (recoded to 0/1 by g_l_function) is passed through unscaled;
# every other feature is treated as numeric and scaled.
ord_columns = ['Co. Man G/L (bbl)']
num_columns = list(X_train.columns)
num_columns.remove('Co. Man G/L (bbl)')

In [None]:
RANDOM_STATE =4242

In [None]:
# Standardise the numeric columns and pass the flag column through unchanged.
# Output column order is: num_columns, then ord_columns, then any remaining columns.
data_preprocessor = ColumnTransformer (
    [
        ('num', StandardScaler(), num_columns),
        ('ord', 'passthrough', ord_columns)
    ],
    remainder='passthrough'
)

# Preprocessing + model. The 'model' step is a placeholder that each parameter grid replaces.
pipeline = Pipeline ([
    ('preprocessor', data_preprocessor),
    ('model', RandomForestRegressor(random_state=RANDOM_STATE))  
])

In [None]:
# One grid per model family; each grid swaps the pipeline's 'model' step and tunes a single
# hyper-parameter. Every model is seeded with RANDOM_STATE so that runs are reproducible.
param_grid_random_forest = [
    {
        'model': [RandomForestRegressor(random_state=RANDOM_STATE)],
        'model__n_estimators': [50, 100, 200, 300]
    }
]

param_grid_LGBM = [
    {
        # Seeded like the other two models (it previously had no seed).
        'model': [LGBMRegressor(random_state=RANDOM_STATE)],
        'model__max_depth': [10, 50, 100]
    }
]

param_grid_catboost = [
    {
        'model': [CatBoostRegressor(random_seed=RANDOM_STATE)],
        'model__depth': range(1, 12, 2)
    }
]

In [None]:
# Three grid searches over the same pipeline, one per model family, scored by
# negative RMSE (higher is better) with 5-fold cross-validation on all CPU cores.
# NOTE(review): an integer cv uses scikit-learn's default unshuffled K-fold split for
# regressors, while the training rows are three concatenated datasets in their original
# order -- confirm this split is what is intended.
gs_RF = GridSearchCV(estimator = pipeline,
                 param_grid = param_grid_random_forest,
                 scoring = 'neg_root_mean_squared_error',
                 cv=5,
                 n_jobs = -1)

gs_LGBM = GridSearchCV(estimator = pipeline,
                 param_grid = param_grid_LGBM,
                 scoring = 'neg_root_mean_squared_error',
                 cv=5,
                 n_jobs = -1)

gs_catboost = GridSearchCV(estimator = pipeline,
                 param_grid = param_grid_catboost,
                 scoring = 'neg_root_mean_squared_error',
                 cv=5,
                 n_jobs = -1)

In [None]:
gs_RF.fit (X_train, y_train)

In [None]:
gs_LGBM.fit (X_train, y_train)

In [None]:
gs_catboost.fit (X_train, y_train)

In [None]:
best_RF = gs_RF.best_estimator_
best_LGBM = gs_LGBM.best_estimator_
best_catboost = gs_catboost.best_estimator_

In [None]:
display(f'RandomForest - {(-1) * gs_RF.best_score_}')
display(f'LGBM - {(-1) * gs_LGBM.best_score_}')
display(f'CatBoost - {(-1) * gs_catboost.best_score_}')

In [None]:
def prediction(estimator):
    """Evaluate ``estimator`` on the three hold-out sets and plot predicted vs. real values.

    Reads the notebook globals ``X_test_1..3`` and ``y_test_1..3``.

    Parameters
    ----------
    estimator : fitted estimator
        Object exposing ``predict``.

    Returns
    -------
    tuple
        ``(y_pred_1, y_pred_2, y_pred_3, metric_data)`` where ``metric_data`` is
        the list of the three RMSE values, in hold-out order.
    """
    hold_outs = [(X_test_1, y_test_1), (X_test_2, y_test_2), (X_test_3, y_test_3)]
    predictions = [estimator.predict(features) for features, _ in hold_outs]

    metric_data = []
    for pred, (_, test) in zip(predictions, hold_outs):
        # scikit-learn metrics take (y_true, y_pred) in that order.
        metric = root_mean_squared_error(test, pred)
        print(f'RMSE test data - {metric}')
        metric_data.append(metric)
        plt.plot(pred)
        plt.plot(list(test))
        plt.legend(['predicted', 'real'])
        plt.show()

    y_pred_1, y_pred_2, y_pred_3 = predictions
    return (y_pred_1, y_pred_2, y_pred_3, metric_data)

In [None]:
RF_1, RF_2, RF_3, RF_metrics = prediction(best_RF)

In [None]:
LGBM_1, LGBM_2, LGBM_3, LGBM_metrics = prediction(best_LGBM)

In [None]:
CB_1, CB_2, CB_3, CB_metrics = prediction(best_catboost)

In [None]:
# Collect the per-hold-out RMSE lists into one table: one column per model, one row per hold-out set.
# NOTE: the *_metrics names are rebound from lists to DataFrames here.
RF_metrics = pd.DataFrame(RF_metrics, columns=['RF'])
LGBM_metrics = pd.DataFrame(LGBM_metrics, columns=['LGBM'])
CB_metrics = pd.DataFrame(CB_metrics, columns=['CB'])

all_metrics = pd.concat([RF_metrics, LGBM_metrics, CB_metrics], axis=1)
all_metrics

In [None]:
list_RF_predictions = [RF_1, RF_2, RF_3]
list_LGBM_predictions = [LGBM_1, LGBM_2, LGBM_3]
list_CB_predictions = [CB_1, CB_2, CB_3]

In [None]:
# Ensemble 1: plain average of the LGBM and CatBoost predictions, per hold-out set.
CB_LGBM_pred = [(i + j) / 2 for i, j in zip(list_LGBM_predictions, list_CB_predictions)]

# RMSE and predicted-vs-real plot for each hold-out set.
metric_CB_LGBM = []
for pred, target in zip(CB_LGBM_pred, [y_test_1, y_test_2, y_test_3]):
    metric = root_mean_squared_error(pred, target)
    print(f'RMSE test data - {metric}')
    metric_CB_LGBM.append(metric)
    plt.plot(pred)
    plt.plot(list(target))
    plt.legend(['predicted', 'real'])
    plt.show()

# Append the ensemble as a new column of the metrics table.
# NOTE: re-running this cell appends a duplicate 'LGBM_CB' column.
metric_CB_LGBM = pd.DataFrame(metric_CB_LGBM, columns=['LGBM_CB'])
all_metrics = pd.concat([all_metrics, metric_CB_LGBM], axis=1)
all_metrics

In [None]:
# Ensemble 2: plain average of the LGBM and RandomForest predictions, per hold-out set.
RF_LGBM_pred = [(i + j) / 2 for i, j in zip(list_LGBM_predictions, list_RF_predictions)]

# RMSE and predicted-vs-real plot for each hold-out set.
metric_RF_LGBM = []
for pred, target in zip(RF_LGBM_pred, [y_test_1, y_test_2, y_test_3]):
    metric = root_mean_squared_error(pred, target)
    print(f'RMSE test data - {metric}')
    metric_RF_LGBM.append(metric)
    plt.plot(pred)
    plt.plot(list(target))
    plt.legend(['predicted', 'real'])
    plt.show()

# Append the ensemble as a new column of the metrics table.
# NOTE: re-running this cell appends a duplicate 'LGBM_RF' column.
metric_RF_LGBM = pd.DataFrame(metric_RF_LGBM, columns=['LGBM_RF'])
all_metrics = pd.concat([all_metrics, metric_RF_LGBM], axis=1)
all_metrics

In [None]:
# Ensemble 3: plain average of all three models, per hold-out set.
RF_CB_LGBM_pred = [(i + j + k) / 3 for i, j, k in zip(list_LGBM_predictions, list_CB_predictions, list_RF_predictions)]

# RMSE and predicted-vs-real plot for each hold-out set.
metric_CB_LGBM_RF = []
for pred, target in zip(RF_CB_LGBM_pred, [y_test_1, y_test_2, y_test_3]):
    metric = root_mean_squared_error(pred, target)
    print(f'RMSE test data - {metric}')
    metric_CB_LGBM_RF.append(metric)
    plt.plot(pred)
    plt.plot(list(target))
    plt.legend(['predicted', 'real'])
    plt.show()

# Append the ensemble as a new column of the metrics table.
# NOTE: re-running this cell appends a duplicate 'RF_LGBM_CB' column.
metric_CB_LGBM_RF = pd.DataFrame(metric_CB_LGBM_RF, columns=['RF_LGBM_CB'])
all_metrics = pd.concat([all_metrics, metric_CB_LGBM_RF], axis=1)

In [None]:
all_metrics

In [None]:
all_metrics.mean()

# Feature importance

In [None]:
def plot_feature_importances(gs, model_name, feature_names):
    """Plot and return the feature importances of a grid search's best model.

    Parameters
    ----------
    gs : fitted GridSearchCV
        Its ``best_estimator_`` must be a pipeline with a 'model' step that
        exposes ``feature_importances_``.
    model_name : str
        Label used in the figure title.
    feature_names : sequence of str
        Feature names, in the order the model received the columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature' and 'Importance', sorted by ascending importance.
    """
    fitted_model = gs.best_estimator_.named_steps['model']
    scores = fitted_model.feature_importances_
    # Ascending order (no [::-1]): suits the horizontal bar chart.
    order = np.argsort(scores)
    ordered_names = np.array(feature_names)[order]
    ordered_scores = scores[order]

    plt.figure(figsize=(10, 6))
    plt.title(f'Feature Importances ({model_name})')
    # Horizontal bars (barh) instead of vertical ones (bar).
    plt.barh(ordered_names, ordered_scores, align='center')
    plt.xlabel('Importance')
    plt.ylabel('Features')
    plt.show()

    return pd.DataFrame({'Feature': ordered_names, 'Importance': ordered_scores})


# Names in the order produced by the ColumnTransformer: scaled numeric columns first,
# then the passed-through flag column.
feature_names = num_columns + ord_columns

rf_importances = plot_feature_importances(gs_RF, 'RandomForest', feature_names)
lgbm_importances = plot_feature_importances(gs_LGBM, 'LGBM', feature_names)
catboost_importances = plot_feature_importances(gs_catboost, 'CatBoost', feature_names)

# Blind prediction

In [None]:
# Submission templates for the three blind tests.
blind_form_1 = pd.read_csv("phase_1/phase1_blind_test_predictions_1.csv")
blind_form_2 = pd.read_csv("phase_1/phase1_blind_test_predictions_2.csv")
blind_form_3 = pd.read_csv("phase_1/phase1_blind_test_predictions_3.csv")

# NOTE(review): `columns` is not used in the cells visible here.
columns = blind_form_3.columns

# Refit the tuned pipelines on ALL training rows, including the hold-out rows.
X_train_all = all_train_df.drop(columns='Bttm Pipe Temp (°F)', axis=1)
y_train_all = all_train_df['Bttm Pipe Temp (°F)']

# NOTE: best_* are the very objects stored in gs_*.best_estimator_, so this refit overwrites
# the models evaluated above; re-running the hold-out evaluation afterwards would score
# models that have already seen the hold-out rows.
best_RF.fit(X_train_all, y_train_all)
best_LGBM.fit(X_train_all, y_train_all)
best_catboost.fit(X_train_all, y_train_all)

In [None]:
def blind_prediction(df_blind_prediction,
                    rf_model, 
                    lgbm_model,
                    catboost_model,
                    blind_form):
    """Fill a blind-test submission form with averaged model predictions.

    The three models are averaged with equal weights. Predictions are placed
    on a 0.001 ft depth grid spanning the first to the last depth of the form,
    gaps are linearly interpolated, and the grid is joined back onto the form.
    Both the raw and the interpolated curves are drawn with seaborn.

    Args:
        df_blind_prediction: Feature frame passed to every model's
            ``predict``; must contain a ``'Depth(ft)'`` column.
        rf_model, lgbm_model, catboost_model: Fitted estimators.
        blind_form: Submission template. It is used as a frame whose first
            column is ``'Depth(ft)'`` and which also has a
            ``'Bttm Pipe Temp (°F) - predicted'`` column. It is NOT modified.

    Returns:
        A new DataFrame with the rows of ``blind_form`` (depth as float) and
        the predicted temperature column filled in.
    """
    depth_col = 'Depth(ft)'
    target_col = 'Bttm Pipe Temp (°F) - predicted'

    def as_key(series):
        # Depths are joined as 3-decimal strings so float noise cannot break the merge.
        return series.map(lambda x: f"{x:.3f}")

    # Work on a copy: writing the string depth keys into the caller's frame
    # would make a second call fail on `+ 0.001` / `f"{x:.3f}"` applied to str.
    blind_form = blind_form.copy()

    ensemble = (
        rf_model.predict(df_blind_prediction)
        + lgbm_model.predict(df_blind_prediction)
        + catboost_model.predict(df_blind_prediction)
    ) / 3

    # Pair depth and prediction by position. Concatenating the depth Series
    # with a freshly indexed frame would align on index labels and produce
    # NaNs whenever df_blind_prediction does not have a 0..n-1 index.
    real_predictions = pd.DataFrame({
        depth_col: df_blind_prediction[depth_col].to_numpy(),
        target_col: np.ravel(ensemble),
    })

    start_depth = blind_form.iloc[0, 0]
    end_depth = blind_form.iloc[-1, 0] + 0.001  # make the last depth inclusive
    interpolate_columns = pd.DataFrame(np.arange(start_depth, end_depth, 0.001), columns=[depth_col])

    # Plot before the depth column is turned into string keys.
    sns.lineplot(data = real_predictions, x=depth_col, y=target_col, label = 'predicted')

    real_predictions[depth_col] = as_key(real_predictions[depth_col])
    interpolate_columns[depth_col] = as_key(interpolate_columns[depth_col])

    before_interpolation = interpolate_columns.merge(real_predictions, how='left', on=depth_col)
    before_interpolation[target_col] = before_interpolation[target_col].interpolate(method='linear')

    blind_form[depth_col] = as_key(blind_form[depth_col])
    blind_form = blind_form.drop(columns=[target_col])

    final_version = blind_form.merge(before_interpolation, how='left', on=depth_col)
    final_version[depth_col] = final_version[depth_col].astype(float)

    sns.lineplot(data = final_version, x=depth_col, y=target_col, label = 'interpolated')

    return final_version

In [None]:
blind_form_1_pred = blind_prediction(test_1_important, best_RF, best_LGBM, best_catboost, blind_form_1)

In [None]:
blind_form_2_pred = blind_prediction(test_2_important, best_RF, best_LGBM, best_catboost, blind_form_2)

In [None]:
blind_form_3_pred = blind_prediction(test_3_important, best_RF, best_LGBM, best_catboost, blind_form_3)

In [None]:
blind_form_1_pred.to_csv('phase1_blind_test_predictions_1.csv', index=False)
blind_form_2_pred.to_csv('phase1_blind_test_predictions_2.csv', index=False)
blind_form_3_pred.to_csv('phase1_blind_test_predictions_3.csv', index=False)

---

# 2nd phase

Now let's work with the 2nd-phase train/test datasets:

In [None]:
useful_columns = training_dataset_1_important.columns
display(useful_columns)

In [None]:
test_useful_columns = useful_columns.drop('Bttm Pipe Temp (°F)')
display(test_useful_columns)

In [None]:
ph2_training_dataset_1 = pd.read_csv("phase_2/FineTune_Train_dataset_1.csv")
ph2_training_dataset_2 = pd.read_csv("phase_2/FineTune_Train_dataset_2.csv")

ph2_test_1 = pd.read_csv("phase_2/FineTune_Test_dataset_1.csv")
ph2_test_2 = pd.read_csv("phase_2/FineTune_Test_dataset_2.csv")

In [None]:
ph2_training_datasets = {
    "ph2_training_dataset_1": ph2_training_dataset_1,
    "ph2_training_dataset_2": ph2_training_dataset_2
}

ph2_test_datasets = {
    "ph2_test_1": ph2_test_1,
    "ph2_test_2": ph2_test_2
}

### checking dataset's sizes

In [None]:
for name, dataset in ph2_training_datasets.items():
    print(f"Dataset: {name}")
    print(dataset.shape)

In [None]:
for name, dataset in ph2_test_datasets.items():
    print(f"Dataset: {name}")
    print(dataset.shape)

### adding tvd to all datasets

In [None]:
for name in ph2_training_datasets:
    ph2_training_datasets[name] = tvd_counting(ph2_training_datasets[name])
for name in ph2_test_datasets:
    ph2_test_datasets[name] = tvd_counting(ph2_test_datasets[name])

### preprocessing

In [None]:
for name in ph2_training_datasets:
    ph2_training_datasets[name] = temp_preproc(ph2_training_datasets[name])
for name in ph2_test_datasets:
    ph2_test_datasets[name] = temp_preproc(ph2_test_datasets[name])

In [None]:
for name, dataset in ph2_test_datasets.items():
    print(f"Dataset: {name}")
    print(dataset.shape)

### Selecting the 2nd-phase columns chosen in the 1st phase

In [None]:
for name in ph2_training_datasets:
    ph2_training_datasets[name]['DateTime'] = pd.to_datetime(ph2_training_datasets[name]['DateTime'])
    ph2_training_datasets[name] = ph2_training_datasets[name][useful_columns]
    
    print(f"Dataset: {name}")
    print(ph2_training_datasets[name].shape)
    
    

In [None]:
for name in ph2_test_datasets:
    ph2_test_datasets[name]['DateTime'] = pd.to_datetime(ph2_test_datasets[name]['DateTime'])
    ph2_test_datasets[name] = ph2_test_datasets[name][test_useful_columns]
    
    print(f"Dataset: {name}")
    print(ph2_test_datasets[name].shape)

In [None]:
display(useful_columns)

# Data exploration

In [None]:
for name, dataset in ph2_training_datasets.items():
    print(f"Dataset: {name}")
    data_exploration(dataset, name)

# Phase 2 - EDA before preproc

## 1 train

In [None]:
EDA(ph2_training_datasets["ph2_training_dataset_1"], 'Bttm Pipe Temp (°F)')

## 2 train

In [None]:
EDA(ph2_training_datasets["ph2_training_dataset_2"], 'Bttm Pipe Temp (°F)')

# Phase 2 - Data PreProc

## `AD ROP SP (ft/hr)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['AD ROP SP (ft/hr)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['AD ROP SP (ft/hr)'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets['ph2_test_1']['AD ROP SP (ft/hr)'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets['ph2_test_2']['AD ROP SP (ft/hr)'])

## `Annular Velocity (ft/min)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Annular Velocity (ft/min)'])

In [None]:
for name in ph2_training_datasets:
    ph2_training_datasets[name] = annular_velocity_correctin(ph2_training_datasets[name])

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Annular Velocity (ft/min)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['Annular Velocity (ft/min)'])

### **Test 1**

In [None]:
for name in ph2_test_datasets:
    ph2_test_datasets[name] = annular_velocity_correctin(ph2_test_datasets[name])

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['Annular Velocity (ft/min)'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['Annular Velocity (ft/min)'])

## `Bit Size (in)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Bit Size (in)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['Bit Size (in)'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['Bit Size (in)'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['Bit Size (in)'])

## `Bit Time (hr)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Bit Time (hr)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['Bit Time (hr)'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['Bit Time (hr)'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['Bit Time (hr)'])

## `Bit Weight (klb)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Bit Weight (klb)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['Bit Weight (klb)'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['Bit Weight (klb)'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['Bit Weight (klb)'])

## `Bttm Pipe Temp (°F)`

### **Train 1**

In [None]:
# df1 = ph2_training_datasets["ph2_training_dataset_1"]
# df2 = ph2_training_datasets["ph2_training_dataset_2"]

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Bttm Pipe Temp (°F)'])

In [None]:
# Clean the target column of phase-2 train set 1, in place:
#   1. readings above 101 at index labels below 1400 are set to NaN,
#   2. readings below 50 are set to NaN,
#   3. readings above 250 are set to NaN,
#   4. every NaN is then replaced by the last preceding value (forward fill).
# NOTE(review): the thresholds are hard-coded, presumably read off the line plot above — confirm.
ph2_training_datasets["ph2_training_dataset_1"].loc[(ph2_training_datasets["ph2_training_dataset_1"]['Bttm Pipe Temp (°F)'] > 101) & (ph2_training_datasets["ph2_training_dataset_1"].index < 1400), 'Bttm Pipe Temp (°F)'] = np.nan
ph2_training_datasets["ph2_training_dataset_1"].loc[(ph2_training_datasets["ph2_training_dataset_1"]['Bttm Pipe Temp (°F)'] < 50),'Bttm Pipe Temp (°F)' ] = np.nan
ph2_training_datasets["ph2_training_dataset_1"].loc[(ph2_training_datasets["ph2_training_dataset_1"]['Bttm Pipe Temp (°F)'] > 250), 'Bttm Pipe Temp (°F)']  = np.nan
ph2_training_datasets["ph2_training_dataset_1"].loc[:, 'Bttm Pipe Temp (°F)'] = ph2_training_datasets["ph2_training_dataset_1"]['Bttm Pipe Temp (°F)'].ffill()

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Bttm Pipe Temp (°F)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['Bttm Pipe Temp (°F)'])

## `Circulating Hrs (hr)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Circulating Hrs (hr)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['Circulating Hrs (hr)'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['Circulating Hrs (hr)'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['Circulating Hrs (hr)'])

## `Co. Man G/L (bbl)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Co. Man G/L (bbl)'])

## `Diff Press (psi)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Diff Press (psi)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['Diff Press (psi)'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['Diff Press (psi)'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['Diff Press (psi)'])

## `Flow In Rate (galUS/min)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Flow In Rate (galUS/min)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['Flow In Rate (galUS/min)'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['Flow In Rate (galUS/min)'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['Flow In Rate (galUS/min)'])

## `Mud Volume (bbl)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Mud Volume (bbl)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['Mud Volume (bbl)'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['Mud Volume (bbl)'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['Mud Volume (bbl)'])

## `Pump Pressure (psi)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Pump Pressure (psi)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['Pump Pressure (psi)'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['Pump Pressure (psi)'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['Pump Pressure (psi)'])

## `ROP - Average (ft/hr)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['ROP - Average (ft/hr)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['ROP - Average (ft/hr)'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['ROP - Average (ft/hr)'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['ROP - Average (ft/hr)'])

## `WC Bit Weight (klb)`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['WC Bit Weight (klb)'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['WC Bit Weight (klb)'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['WC Bit Weight (klb)'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['WC Bit Weight (klb)'])

## `Washout Factor ()`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['Washout Factor ()'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['Washout Factor ()'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['Washout Factor ()'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['Washout Factor ()'])

## `ML_mud_temp_grad`

### **Train 1**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['ML_mud_temp_grad'])

In [None]:
# In train set 1, set ML_mud_temp_grad values below 10 at index labels above 5000 to NaN,
# then forward-fill so each gap takes the last preceding value.
ph2_training_datasets["ph2_training_dataset_1"].loc[(ph2_training_datasets["ph2_training_dataset_1"]['ML_mud_temp_grad'] < 10) & (ph2_training_datasets["ph2_training_dataset_1"].index > 5000), 'ML_mud_temp_grad'] = np.nan
ph2_training_datasets["ph2_training_dataset_1"].loc[:, 'ML_mud_temp_grad'] = ph2_training_datasets["ph2_training_dataset_1"]['ML_mud_temp_grad'].ffill()

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_1"]['ML_mud_temp_grad'])

### **Train 2**

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['ML_mud_temp_grad'])

In [None]:
# In train set 2, set every ML_mud_temp_grad value below 19 to NaN,
# then forward-fill so each gap takes the last preceding value.
ph2_training_datasets["ph2_training_dataset_2"].loc[(ph2_training_datasets["ph2_training_dataset_2"]['ML_mud_temp_grad'] < 19) , 'ML_mud_temp_grad'] = np.nan
ph2_training_datasets["ph2_training_dataset_2"].loc[:, 'ML_mud_temp_grad'] = ph2_training_datasets["ph2_training_dataset_2"]['ML_mud_temp_grad'].ffill()

In [None]:
px.line(ph2_training_datasets["ph2_training_dataset_2"]['ML_mud_temp_grad'])

### **Test 1**

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['ML_mud_temp_grad'])

In [None]:
# In test set 1, set every ML_mud_temp_grad value below 9 to NaN,
# then forward-fill so each gap takes the last preceding value.
ph2_test_datasets["ph2_test_1"].loc[(ph2_test_datasets["ph2_test_1"]['ML_mud_temp_grad'] < 9) , 'ML_mud_temp_grad'] = np.nan
ph2_test_datasets["ph2_test_1"].loc[:, 'ML_mud_temp_grad'] = ph2_test_datasets["ph2_test_1"]['ML_mud_temp_grad'].ffill()

In [None]:
px.line(ph2_test_datasets["ph2_test_1"]['ML_mud_temp_grad'])

### **Test 2**

In [None]:
px.line(ph2_test_datasets["ph2_test_2"]['ML_mud_temp_grad'])

# EDA after preproc

## First train df

In [None]:
# Read the frame from the dict: the bare `ph2_training_dataset_1` variable is never
# reassigned after read_csv, so it lacks the column selection and cleaning
# that were applied to the dict entries.
EDA(ph2_training_datasets["ph2_training_dataset_1"], 'Bttm Pipe Temp (°F)')

## Second train df

In [None]:
# Read the frame from the dict: the bare `ph2_training_dataset_2` variable is never
# reassigned after read_csv, so it lacks the column selection and cleaning
# that were applied to the dict entries.
EDA(ph2_training_datasets["ph2_training_dataset_2"], 'Bttm Pipe Temp (°F)')

## First test df

In [None]:
#EDA(ph2_test_1, 'tvd')
EDA(ph2_test_datasets["ph2_test_1"], 'tvd')

## Second test df

In [None]:
EDA(ph2_test_datasets["ph2_test_2"], 'tvd')

---

# Final exploration

## Circulating exp

In [None]:
# Compare the frames stored in the dicts. The bare ph2_* variables are never
# reassigned after read_csv, so they do not carry the column selection and
# cleaning applied to the dict entries above.
Circulating_exploration(ph2_training_datasets["ph2_training_dataset_1"], ph2_test_datasets["ph2_test_1"])
Circulating_exploration(ph2_training_datasets["ph2_training_dataset_2"], ph2_test_datasets["ph2_test_2"])

## Final all data exploration

In [None]:
for name, df in ph2_training_datasets.items():
    melted_df = df.melt()
    fig = px.box(melted_df, x='variable', y="value", title=f"Boxplot for {name}")
    fig.show()

In [None]:
for name, df in ph2_test_datasets.items():
    melted_df = df.melt()
    fig = px.box(melted_df, x='variable', y="value", title=f"Boxplot for {name}")
    fig.show()

# Unification of all train datasets

Т.к. мы объединяем данные есть смысл провести корреляционный анализ после объединения всех трейновых данных 

Для каждого датасета последние 500 строк откладываем как тестовую выборку, а все предыдущие строки идут в обучение.

In [None]:
# All training frames, phase 1 first, then phase 2. The order defines the
# numbering of cut_1..cut_5 and cut_test_1..cut_test_5 below.
train_frames = [
    training_dataset_1_important,
    training_dataset_2_important,
    training_dataset_3_important,
    ph2_training_datasets["ph2_training_dataset_1"],
    ph2_training_datasets["ph2_training_dataset_2"],
]

# Concatenating all training datasets (used later to refit the final models)
all_train_df = pd.concat(train_frames, axis=0).reset_index(drop=True)

# Cut-off value: number of trailing rows of each dataset held out for testing
cut_value = 500

# Cut-off positions (kept under their original names)
cut_1, cut_2, cut_3, cut_4, cut_5 = (len(frame) - cut_value for frame in train_frames)

# Split by POSITION, not by index label: comparing `frame.index` with
# `len(frame) - cut_value` is only correct when the index is exactly 0..n-1,
# which stops being true as soon as a preprocessing step drops rows without
# resetting the index. max(..., 0) keeps a frame shorter than cut_value
# entirely in the hold-out part, as before.
split_positions = [max(len(frame) - cut_value, 0) for frame in train_frames]

# Creating training set (excluding last `cut_value` rows)
cut_train_data = pd.concat(
    [frame.iloc[:pos] for frame, pos in zip(train_frames, split_positions)],
    axis=0,
).reset_index(drop=True)

# Creating test sets (last `cut_value` rows)
cut_test_1, cut_test_2, cut_test_3, cut_test_4, cut_test_5 = (
    frame.iloc[pos:] for frame, pos in zip(train_frames, split_positions)
)


In [None]:
correlation(cut_train_data, cut_train_data.columns)

**There is no Multicollinearity**

# Using ML pipeline

In [None]:
def X_y(df, name_y = 'Bttm Pipe Temp (°F)'):
    """Split a frame into features and target.

    Args:
        df: Frame containing the target column.
        name_y: Name of the target column.

    Returns:
        Tuple ``(X, y)``: ``df`` without the target column, and the target
        column itself. ``df`` is left unchanged.
    """
    target = df[name_y]
    features = df.drop(columns=name_y)
    return features, target

In [None]:
X_train, y_train = X_y(cut_train_data)

X_test_1, y_test_1 = X_y(cut_test_1)
X_test_2, y_test_2 = X_y(cut_test_2)
X_test_3, y_test_3 = X_y(cut_test_3)
X_test_4, y_test_4 = X_y(cut_test_4)
X_test_5, y_test_5 = X_y(cut_test_5)

In [None]:
ord_columns = ['Co. Man G/L (bbl)']
num_columns = list(X_train.columns)
num_columns.remove('Co. Man G/L (bbl)')

In [None]:
RANDOM_STATE =4242

In [None]:
data_preprocessor = ColumnTransformer (
    [
        ('num', StandardScaler(), num_columns),
        ('ord', 'passthrough', ord_columns)
    ],
    remainder='passthrough'
)

pipeline = Pipeline ([
    ('preprocessor', data_preprocessor),
    ('model', RandomForestRegressor(random_state=RANDOM_STATE))  
])

In [None]:
# Search spaces for GridSearchCV. Each grid swaps the pipeline's 'model' step
# and tunes one hyper-parameter of that model.
param_grid_random_forest = [
    {
        'model': [RandomForestRegressor(random_state=RANDOM_STATE)],
        'model__n_estimators': [50, 100, 200, 300],
        # Additional hyper-parameters, currently disabled:
        # 'model__max_depth': range(2, 20, 4),
        # 'model__min_samples_split': [2, 5, 10, 15],
        # 'model__min_samples_leaf': [1, 2, 4, 8]
    }
]

param_grid_LGBM = [
    {
        # Seeded like the other two models so all three share RANDOM_STATE.
        'model': [LGBMRegressor(random_state=RANDOM_STATE)],
        'model__max_depth': [10, 50, 100],
        # Additional hyper-parameters, currently disabled:
        # 'model__num_leaves': [50, 100, 150],
        # 'model__learning_rate': [0.01, 0.1, 0.2],
        # 'model__n_estimators': [100, 500, 1000]
    }
]

param_grid_catboost = [
    {
        'model': [CatBoostRegressor(random_seed=RANDOM_STATE)],
        'model__depth': range(1, 12, 2),
        # Additional hyper-parameters, currently disabled:
        # 'model__iterations': [50, 150, 300, 500],
        # 'model__learning_rate': [0.001, 0.01, 0.05, 0.1]
    }
]

In [None]:
gs_RF = GridSearchCV(estimator = pipeline,
                 param_grid = param_grid_random_forest,
                 scoring = 'neg_root_mean_squared_error',
                 cv=5,
                 n_jobs = -1)

gs_LGBM = GridSearchCV(estimator = pipeline,
                 param_grid = param_grid_LGBM,
                 scoring = 'neg_root_mean_squared_error',
                 cv=5,
                 n_jobs = -1)

gs_catboost = GridSearchCV(estimator = pipeline,
                 param_grid = param_grid_catboost,
                 scoring = 'neg_root_mean_squared_error',
                 cv=5,
                 n_jobs = -1)

In [None]:
gs_RF.fit (X_train, y_train)

In [None]:
gs_LGBM.fit (X_train, y_train)

In [None]:
gs_catboost.fit (X_train, y_train)

In [None]:
best_RF = gs_RF.best_estimator_
best_LGBM = gs_LGBM.best_estimator_
best_catboost = gs_catboost.best_estimator_

In [None]:
display(f'RandomForest - {(-1) * gs_RF.best_score_}')
display(f'LGBM - {(-1) * gs_LGBM.best_score_}')
display(f'CatBoost - {(-1) * gs_catboost.best_score_}')

In [None]:
def prediction(estimator):
    """Predict on the five hold-out sets and report the RMSE of each.

    Reads the notebook-level ``X_test_1..5`` / ``y_test_1..5`` variables. For
    every hold-out set the RMSE is printed and the predicted and actual
    series are plotted.

    Args:
        estimator: Fitted estimator exposing ``predict``.

    Returns:
        Tuple of the five prediction arrays followed by the list of the five
        RMSE values, in hold-out order.
    """
    holdouts = [
        (X_test_1, y_test_1),
        (X_test_2, y_test_2),
        (X_test_3, y_test_3),
        (X_test_4, y_test_4),
        (X_test_5, y_test_5),
    ]

    # All predictions are produced first, then scored and plotted.
    predicted = [estimator.predict(features) for features, _ in holdouts]

    metric_data = []
    for y_hat, (_, y_real) in zip(predicted, holdouts):
        rmse = root_mean_squared_error(y_hat, y_real)
        print(f'RMSE test data - {rmse}')
        metric_data.append(rmse)
        plt.plot(y_hat)
        plt.plot(list(y_real))
        plt.show()

    return (*predicted, metric_data)

In [None]:
RF_1, RF_2, RF_3, RF_4, RF_5, RF_metrics = prediction(best_RF)

In [None]:
LGBM_1, LGBM_2, LGBM_3, LGBM_4, LGBM_5, LGBM_metrics = prediction(best_LGBM)

In [None]:
CB_1, CB_2, CB_3, CB_4, CB_5, CB_metrics = prediction(best_catboost)

In [None]:
RF_metrics = pd.DataFrame(RF_metrics, columns=['RF'])
LGBM_metrics = pd.DataFrame(LGBM_metrics, columns=['LGBM'])
CB_metrics = pd.DataFrame(CB_metrics, columns=['CB'])

all_metrics = pd.concat([RF_metrics, LGBM_metrics, CB_metrics], axis=1)
all_metrics

In [None]:
list_RF_predictions = [RF_1, RF_2, RF_3, RF_4, RF_5]
list_LGBM_predictions = [LGBM_1, LGBM_2, LGBM_3, LGBM_4, LGBM_5]
list_CB_predictions = [CB_1, CB_2, CB_3, CB_4, CB_5]

In [None]:
# Equal-weight blend of the LightGBM and CatBoost hold-out predictions.
CB_LGBM_pred = [(i + j) / 2 for i, j in zip(list_LGBM_predictions, list_CB_predictions)]

metric_CB_LGBM = []
for pred, target in zip(CB_LGBM_pred, [y_test_1, y_test_2, y_test_3, y_test_4, y_test_5]):
    metric = root_mean_squared_error(target, pred)  # (y_true, y_pred)
    print(f'RMSE test data - {metric}')
    metric_CB_LGBM.append(metric)
    plt.plot(pred)
    plt.plot(list(target))
    plt.legend(['predicted', 'real'])
    plt.show()

metric_CB_LGBM = pd.DataFrame(metric_CB_LGBM, columns=['LGBM_CB'])
# Add the column by name: re-running this cell then overwrites 'LGBM_CB'
# instead of appending a duplicate column, as concat(axis=1) would.
all_metrics = all_metrics.assign(LGBM_CB=metric_CB_LGBM['LGBM_CB'])
all_metrics

In [None]:
# Equal-weight blend of the LightGBM and RandomForest hold-out predictions.
RF_LGBM_pred = [(i + j) / 2 for i, j in zip(list_LGBM_predictions, list_RF_predictions)]

metric_RF_LGBM = []
for pred, target in zip(RF_LGBM_pred, [y_test_1, y_test_2, y_test_3, y_test_4, y_test_5]):
    metric = root_mean_squared_error(target, pred)  # (y_true, y_pred)
    print(f'RMSE test data - {metric}')
    metric_RF_LGBM.append(metric)
    plt.plot(pred)
    plt.plot(list(target))
    plt.legend(['predicted', 'real'])
    plt.show()

metric_RF_LGBM = pd.DataFrame(metric_RF_LGBM, columns=['LGBM_RF'])
# Add the column by name: re-running this cell then overwrites 'LGBM_RF'
# instead of appending a duplicate column, as concat(axis=1) would.
all_metrics = all_metrics.assign(LGBM_RF=metric_RF_LGBM['LGBM_RF'])
all_metrics

In [None]:
# Equal-weight blend of all three models' hold-out predictions.
RF_CB_LGBM_pred = [(i + j + k) / 3 for i, j, k in zip(list_LGBM_predictions, list_CB_predictions, list_RF_predictions)]

metric_CB_LGBM_RF = []
for pred, target in zip(RF_CB_LGBM_pred, [y_test_1, y_test_2, y_test_3, y_test_4, y_test_5]):
    metric = root_mean_squared_error(target, pred)  # (y_true, y_pred)
    print(f'RMSE test data - {metric}')
    metric_CB_LGBM_RF.append(metric)
    plt.plot(pred)
    plt.plot(list(target))
    plt.legend(['predicted', 'real'])
    plt.show()

metric_CB_LGBM_RF = pd.DataFrame(metric_CB_LGBM_RF, columns=['RF_LGBM_CB'])
# Add the column by name: re-running this cell then overwrites 'RF_LGBM_CB'
# instead of appending a duplicate column, as concat(axis=1) would.
all_metrics = all_metrics.assign(RF_LGBM_CB=metric_CB_LGBM_RF['RF_LGBM_CB'])

In [None]:
all_metrics

In [None]:
all_metrics.mean()

In [None]:
import pandas as pd
import numpy as np

# Scratch experiment on the raw phase-2 test file. It is loaded under its own
# name so that the phase-1 `training_dataset_1` frame is not overwritten by
# test data.
ph2_test_1_raw = pd.read_csv("phase_2/FineTune_Test_dataset_1.csv")
df_half = ph2_test_1_raw.copy()
df_half['Depth(ft)'] += 0.5  # Shift depth by 0.5 to create midpoints

# Concatenate the original rows with their shifted copies and sort by depth.
# The original rows are named explicitly; a bare `df` here would resolve to
# whatever frame an earlier loop happened to leave behind.
df_interpolated = pd.concat([ph2_test_1_raw, df_half]).sort_values('Depth(ft)').reset_index(drop=True)

In [None]:
# new_depths = np.arange(df_half['Depth(ft)'].min(), df_half['Depth(ft)'].max(), 0.5)

# # Reindex DataFrame with new depth values
# df_interpolated = new_depths.set_index('Depth(ft)').reindex(new_depths).interpolate().reset_index()

In [None]:
import numpy as np
import pandas as pd

# Create new depth values: a 0.5-step grid from the smallest shifted depth
# up to, but not including, the largest one (np.arange is half-open).
new_depths = np.arange(df_half['Depth(ft)'].min(), df_half['Depth(ft)'].max(), 0.5)

# Convert to DataFrame
new_depths_df = pd.DataFrame(new_depths, columns=['Depth(ft)'])

# Perform interpolation. This overwrites the `df_interpolated` built in the previous cell.
df_interpolated = (
    df_half.set_index('Depth(ft)')   # Index the shifted rows by depth
    .reindex(new_depths_df['Depth(ft)'])  # Keep rows whose depth is on the grid; other grid points become NaN rows
    .interpolate()  # Fill the NaN rows (pandas default method: linear)
    .reset_index()  # Turn depth back into a regular column
)

print(df_interpolated)


In [None]:
display(df_interpolated['Depth(ft)'])

# Feature Importance

In [None]:
feature_names = num_columns + ord_columns

rf_importances = plot_feature_importances(gs_RF, 'RandomForest', feature_names)
lgbm_importances = plot_feature_importances(gs_LGBM, 'LGBM', feature_names)
catboost_importances = plot_feature_importances(gs_catboost, 'CatBoost', feature_names)

# Blind Prediction

In [None]:
ph2_blind_form_1 = pd.read_csv("phase_2/phase2_blind_test_predictions_1.csv")
ph2_blind_form_2 = pd.read_csv("phase_2/phase2_blind_test_predictions_2.csv")

columns = ph2_blind_form_2.columns

X_train_all = all_train_df.drop(columns='Bttm Pipe Temp (°F)', axis=1)
y_train_all = all_train_df['Bttm Pipe Temp (°F)']

best_RF.fit(X_train_all, y_train_all)
best_LGBM.fit(X_train_all, y_train_all)
best_catboost.fit(X_train_all, y_train_all)

In [None]:
ph2_blind_form_1_pred = blind_prediction(ph2_test_datasets["ph2_test_1"], best_RF, best_LGBM, best_catboost, ph2_blind_form_1)

In [None]:
ph2_blind_form_2_pred = blind_prediction(ph2_test_datasets["ph2_test_2"], best_RF, best_LGBM, best_catboost, ph2_blind_form_2)

In [None]:
ph2_blind_form_1_pred.to_csv('ph2_blind_test_predictions_11.csv', index=False)
ph2_blind_form_2_pred.to_csv('ph2_blind_test_predictions_22.csv', index=False)
