### Cleaning and feature selection

In [None]:
# The `warnings` module reports conditions that are worth flagging but are not errors,
# for example the use of a deprecated or obsolete API.
# Unlike an unhandled exception, which stops execution, a warning lets the program continue.
# NOTE(review): the blanket 'ignore' filter below silences every warning, including
# deprecation notices raised by the libraries used in this notebook - consider narrowing it.

import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Locations of the two input files, relative to the notebook's working directory.
main_csv_path = 'ConcatenatedResults/concatenated_results1.csv'
validation_csv_path = 'ConcatenatedResults/results_influx_validation_2023-03-27_12-48-45.csv'

# `data` is later split into train/val/test; the second file is kept apart as an
# additional validation set.
data = pd.read_csv(main_csv_path)
validation_data = pd.read_csv(validation_csv_path)

# `df_validation` starts out as another name for the same frame as `validation_data`.
df_validation = validation_data

In [None]:
# Overview of the loaded dataset: column names, dtypes and non-null counts.
data.info()

In [None]:
# Number of distinct values per column; columns with a single distinct value are removed further down.
data.nunique()

In [None]:
# Keep only the numeric columns (float64 and int64); columns of any other dtype are discarded.
numeric_dtypes = ['float64', 'int64']
data_original = data.select_dtypes(include=numeric_dtypes)
df_validation_original = df_validation.select_dtypes(include=numeric_dtypes)

# Continue working under the short names.
data, df_validation = data_original, df_validation_original

### Removing constant columns

In [None]:
def remove_unique_columns(df):
    """Return `df` without its constant columns.

    A column counts as constant when `nunique()` reports exactly one distinct
    value (missing values are not counted by `nunique()`'s default). The
    columns 'GasFlowRateOut' and 'TotalInfluxMass' are always kept, even when
    constant.

    The input frame is not modified. When nothing has to be dropped, the very
    same object is returned.
    """
    always_keep = ('GasFlowRateOut', 'TotalInfluxMass')
    constant_columns = [
        name
        for name in df.columns
        if name not in always_keep and df[name].nunique() == 1
    ]
    if not constant_columns:
        return df
    return df.drop(constant_columns, axis=1)

# Strip the constant columns from the main dataset ...
data = remove_unique_columns(df=data)

# ... and, independently, from the separate validation dataset.
df_validation = remove_unique_columns(df=df_validation)

### Splitting each simulation chronologically: the first 420 seconds go to train, the next 60 seconds to val, and the last 120 seconds to test

In [None]:
# Cut every simulation (rows sharing a sim_ID) into three consecutive pieces, in row order:
# the first 70 % of its rows for training, the following 10 % for validation, the rest for testing.
unique_sim_ids = data['sim_ID'].unique()

train_data, val_data, test_data = [], [], []

for current_id in unique_sim_ids:
    sim_rows = data[data['sim_ID'] == current_id]
    row_count = len(sim_rows)

    # Row positions at which the training and validation pieces end
    train_end = int(row_count * 0.7)
    val_end = train_end + int(row_count * 0.10)

    train_data.append(sim_rows[:train_end])
    val_data.append(sim_rows[train_end:val_end])
    test_data.append(sim_rows[val_end:])

# Stack the per-simulation pieces into one dataframe per split
df_train, df_val, df_test = (
    pd.concat(pieces) for pieces in (train_data, val_data, test_data)
)

### Scaling using the MinMaxScaler before Heatmap

In [None]:
# MinMax scaling: the scaler learns its ranges from the training split only
# ('sim_ID' is left out) and is then applied unchanged to all three splits.
scaler = MinMaxScaler()
scaler.fit(df_train.drop(columns='sim_ID'))


def _scale_split(split_df):
    """Return `split_df` without 'sim_ID', transformed by `scaler`, as a new DataFrame with a fresh index."""
    feature_values = split_df.drop(columns='sim_ID')
    return pd.DataFrame(scaler.transform(feature_values), columns=feature_values.columns)


df_train_scaled = _scale_split(df_train)
df_val_scaled = _scale_split(df_val)
df_test_scaled = _scale_split(df_test)

In [None]:
# Annotated correlation heatmap of the MinMax-scaled training features
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 20))
heatmap_ax.set_title("Heatmap with MinMaxScaler")
sns.heatmap(
    df_train_scaled.corr(),
    annot=True,
    cmap="coolwarm",
    linewidths=0.5,
    ax=heatmap_ax,
)
plt.show()

### Removing features which are not relevant and are also printed in brackets below

In [None]:
# Identify Highly Correlated Features

# Pairwise correlation of all columns of the unscaled training split (this includes 'sim_ID')
corr_matrix = df_train.corr()

# Keep only the strict upper triangle (k=1 leaves out the diagonal) so every pair of
# columns is looked at once. The mask is built with the builtin `bool`: the `np.bool`
# alias was deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Columns whose correlation with at least one earlier column is above 0.9 or below -0.9
high_correlation = [column for column in upper.columns if (upper[column].abs() > 0.9).any()]

print(high_correlation)

In [None]:
# Features left out of the modelling datasets
unwanted_features = ['GasFlowRateOut','TotalInfluxMass', 'TopOfStringPosition', 'MainPitTemperature', 'ActivePitVolume', 'ActivePitTemperature', 'ActivePitDensity', 'CalculatedPressureBottomHole', 'DownholeECD', 'TopOfStringVelocity']

# Combine the train, validation, and test sets so the same columns are dropped from all of them
combined_df = pd.concat([df_train, df_val, df_test])

# df_train / df_val / df_test still hold the original, unscaled values (only the
# *_scaled copies were transformed), so no inverse_transform must be applied here:
# running it on unscaled data would distort the values instead of restoring them.
unscaled_df = combined_df.drop('sim_ID', axis=1).reset_index(drop=True)

# Drop the unwanted features
selected_features_df = unscaled_df.drop(columns=unwanted_features)

# Row counts used to separate the stacked frame back into its three splits
train_len = len(df_train)
val_len = len(df_val)

# Fit a fresh scaler on the training rows only. Fitting on the combined frame would
# leak the min/max of the validation and test rows into the scaling. As a consequence,
# validation/test values may fall slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(selected_features_df.iloc[:train_len])

# Apply the training-fitted scaling to every row
minmax_scaled_data_selected = scaler.transform(selected_features_df)

# Create a DataFrame with the selected and scaled data
minmax_scaled_df_selected = pd.DataFrame(minmax_scaled_data_selected, columns=selected_features_df.columns)

# Split the data back into train, validation, and test sets
df_train_selected = minmax_scaled_df_selected.iloc[:train_len]
df_val_selected = minmax_scaled_df_selected.iloc[train_len:train_len+val_len]
df_test_selected = minmax_scaled_df_selected.iloc[train_len+val_len:]

In [None]:
# Annotated correlation heatmap of the training split after feature selection and scaling
selected_corr = df_train_selected.corr()

selected_fig, selected_ax = plt.subplots(figsize=(20, 20))
selected_ax.set_title("Heatmap with MinMaxScaler")
sns.heatmap(selected_corr, annot=True, cmap="coolwarm", linewidths=0.5, ax=selected_ax)
plt.show()

### Removing unwanted features and scaling the other dataframes as well

In [None]:
# Feature columns (and their order) of the scaled training data; the scaler was fitted on exactly these.
model_columns = list(df_train_selected.columns)

missing_columns = [name for name in model_columns if name not in df_validation.columns]
if missing_columns:
    raise KeyError(f"df_validation is missing feature columns required by the scaler: {missing_columns}")

# Restrict the validation set to those columns. This removes the unwanted features and
# any other column that is not part of the training features.
df_validation2 = df_validation[model_columns]

# Only transform here. Calling fit_transform would refit the shared scaler on this file,
# which (a) scales the validation data with its own min/max instead of the training
# ranges and (b) overwrites the fitted state of the `scaler` object that is stored below.
scaled_data_validation = scaler.transform(df_validation2)

scaled_data_validation = pd.DataFrame(scaled_data_validation, columns=df_validation2.columns)

df_validation1 = scaled_data_validation

### Storing the datasets, the scaler and the input/target columns in the IPython store for further ML in another notebook

In [None]:
# Persist the scaled train/val/test splits and the scaled external validation set with
# IPython's %store magic, so another notebook can restore them via `%store -r <name>`.
%store df_train_selected
%store df_val_selected
%store df_test_selected
%store df_validation1

In [None]:
# Persist the scaler object as well, so another notebook can restore it with `%store -r scaler`.
%store scaler

In [None]:
# create a list of column names to include in the new DataFrame
input_col1 = [col for col in df_train_selected.columns if col != 'DownholePressure']

target_col1 = ['DownholePressure']

%store input_col1
%store target_col1

In [None]:
# Split df_train_selected into x_train and y_train
x_train1 = df_train_selected[input_col1].to_numpy()
y_train1 = df_train_selected[target_col1].to_numpy()

# Split df_val_selected into x_val and y_val
x_val1 = df_val_selected[input_col1].to_numpy()
y_val1 = df_val_selected[target_col1].to_numpy()

# Split df_test_selected into x_test and y_test
x_test1 = df_test_selected[input_col1].to_numpy()
y_test1 = df_test_selected[target_col1].to_numpy()

# Store the arrays in the IPython database
%store x_train1
%store y_train1
%store x_val1
%store y_val1
%store x_test1
%store y_test1

In [None]:
def _inputs_and_target_frames(split_df):
    """Return (inputs, target) of `split_df` as DataFrames, using input_col1 / target_col1."""
    return split_df[input_col1], split_df[target_col1]


# Training split
x_train, y_train = _inputs_and_target_frames(df_train_selected)

# Validation split
x_val, y_val = _inputs_and_target_frames(df_val_selected)

# Test split
x_test, y_test = _inputs_and_target_frames(df_test_selected)

# Separate validation dataset (loaded from its own CSV file)
x_validation, y_validation = _inputs_and_target_frames(df_validation1)

### Plots to see how the features look against timesteps for split method 3

In [None]:
# Leading rows of each split that are shown in the figure
df_train_plot = df_train_selected.head(420)
df_val_plot = df_val_selected.head(59)
df_test_plot = df_test_selected.head(119)

# Line colours: left axis (MainPitDensity) and right axis (FlowRateOut)
color1 = 'tab:blue'
color2 = 'tab:red'


def _draw_density_and_flow(left_axis, frame, title):
    """Plot MainPitDensity (left y-axis, NaN shown as 0) and FlowRateOut (right y-axis) against the frame index.

    Returns the twin axis that carries the FlowRateOut curve.
    """
    left_axis.set_xlabel('Index')
    left_axis.set_ylabel('MainPitDensity', color=color1)
    left_axis.plot(frame.index, frame['MainPitDensity'].fillna(0), color=color1)
    left_axis.tick_params(axis='y', labelcolor=color1)
    left_axis.set_title(title)

    right_axis = left_axis.twinx()
    right_axis.set_ylabel('FlowRateOut', color=color2)
    right_axis.plot(frame.index, frame['FlowRateOut'], color=color2)
    right_axis.tick_params(axis='y', labelcolor=color2)
    return right_axis


# Three stacked panels: training, validation, test
fig, axs = plt.subplots(3, figsize=(15, 10))

ax2 = _draw_density_and_flow(
    axs[0], df_train_plot,
    'MainPitDensity and FlowRateOut for First 420 Rows (Training Data)',
)
ax3 = _draw_density_and_flow(
    axs[1], df_val_plot,
    'MainPitDensity and FlowRateOut for Next 60 Rows (Validation Data)',
)
ax4 = _draw_density_and_flow(
    axs[2], df_test_plot,
    'MainPitDensity and FlowRateOut for Next 120 Rows (Test Data)',
)

# Extra vertical room so titles and x-labels of neighbouring panels do not collide
fig.subplots_adjust(hspace=0.4)

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Columns to draw, one panel each, from top to bottom
features = [
    "MainPitDensity", "FlowRateOut", "SPP", "InstantaneousROP",
    "FlowRateIn", "FluidTemperatureOut", "MainPitVolume", "HookLoad",
    "SurfaceTorque", "BitDepth", "timeStep", "DownholePressure",
]

# First 420 rows of the selected, scaled training split
plot_data = df_train_selected.head(420)

fig, axs = plt.subplots(len(features), figsize=(15, 30), sharex=True)

for panel, feature in zip(axs, features):
    panel.plot(plot_data[feature], label=feature)
    panel.set_ylabel(feature)
    panel.legend()

# The x-label is set on the current axes; the panels share their x-axis
plt.xlabel('Time Step')
plt.tight_layout()
plt.show()
