In [61]:
import os
import glob
import segyio
import numpy as np
import pandas as pd
from scipy.signal import hilbert
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold,GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import LabelEncoder

## Unknown well -1

The location of Unknown well 1 is:

X = 683447 Unit

Y = 5072765 Unit

Z = 157 Unit


The seismic trace passing through this location is LO-6-86_01._KNGD export.sgy. Let's identify the seismic trace at this location.

In [62]:
def instantaneous_frequency(seismic_data, sample_rate=4):
    # Compute the analytic signal using Hilbert Transform
    analytic_signal = hilbert(seismic_data)
    
    # Compute the instantaneous phase (angle of analytic signal)
    instantaneous_phase = np.angle(analytic_signal)
    
 
    # compute the derivative and multiply by the sample rate (in Hz)
    phase_derivative = np.diff(instantaneous_phase)  # First derivative of phase
    fs = 1000 / sample_rate  # Convert sample rate to Hz (1/sample_rate in seconds)
    instantaneous_freq = np.abs(phase_derivative) * fs / (2 * np.pi)  # Frequency in Hz
    
    return instantaneous_freq

In [63]:
# Path to SEG-Y file
path = 'C://Users/sushila/Desktop/Data_shared_with Participants/Data/2D Seismic/Unkown-1/LO-6-86_01._KNGD export.sgy'

# Coordinates to search for
target_x = 683447
target_y = 5072765
target_z = 157 

# Read and extract data
with segyio.open(path, "r", strict=False) as f:
    f.mmap()  # Memory map the file for efficient access

    # Extract coordinates from headers
    source_x = np.array(f.attributes(segyio.TraceField.SourceX))
    source_y = np.array(f.attributes(segyio.TraceField.SourceY))

    # Find the trace index that matches the given coordinates (allowing for floating-point precision)
    trace_index = None
    for i, (x, y) in enumerate(zip(source_x, source_y)):
        if np.isclose(x, target_x, atol=1e-3) and np.isclose(y, target_y, atol=1e-3):  
            print(f"Match found at index {i}: x = {x}, y = {y}")
            trace_index = i
            break

    if trace_index is None:
        print("No trace found with the given coordinates.")
        trace_data = None
    else:
        print(f"Trace found at index {trace_index}.")
        # Extract the trace data
        trace_data = f.trace[trace_index]
        
        # Retrieve sampling interval (in ms) and calculate time axis
        dt = segyio.dt(f) / 1000
        time_axis = np.arange(len(trace_data)) * dt

# Convert time from milliseconds to seconds
time_axis_seconds = time_axis / 1000  # Now the time is in seconds
        
# Create a DataFrame with trace amplitude and time
if trace_data is not None:
    df_unknown1 = pd.DataFrame({
        "Time (s)": time_axis_seconds,
        "Amplitude": trace_data
    })

    # Calculate instantaneous frequency
    instantaneous_freq = instantaneous_frequency(trace_data, sample_rate=4)

    # Add NaN to the start to align the length
    instantaneous_freq_full = np.concatenate(([np.nan], instantaneous_freq))

    # Define the desired time interval for resampling
    desired_time_interval = 0.001  # in seconds

    # Ensure the 'Time' column is sorted
    df_unknown1 = df_unknown1.sort_values(by='Time (s)').reset_index(drop=True)

    # Generate the new time values with the desired interval
    new_time_values = np.arange(
        df_unknown1['Time (s)'].min(),
        df_unknown1['Time (s)'].max() + desired_time_interval,
        desired_time_interval
    )

    # Interpolate amplitude and instantaneous frequency values to match the new time values
    df_resampled = pd.DataFrame({
        'Time (s)': new_time_values,
        'Amplitude': np.interp(new_time_values, df_unknown1['Time (s)'], df_unknown1['Amplitude']),
        'Instantaneous_freq': np.interp(new_time_values, df_unknown1['Time (s)'], instantaneous_freq_full)
    })

    # Feature engineering for resampled data
    df_resampled['Amplitude_abs'] = df_resampled['Amplitude'].abs()
    df_resampled['Amplitude_roll_mean'] = df_resampled['Amplitude'].rolling(window=5, min_periods=1).mean()
    df_resampled['Amplitude_energy'] = df_resampled['Amplitude'] ** 2

    # Find the first non-zero amplitude index
    first_non_zero_index = df_resampled[df_resampled['Amplitude'] != 0].index[0]

    # Slice the DataFrame from the first non-zero index onward
    df_filtered = df_resampled.iloc[first_non_zero_index:]

    # Save the filtered DataFrame to a CSV file
    df_filtered.to_csv('Unknown1_filtered.csv', index=False)
    print(f"Filtered data saved with {len(df_filtered)} rows.")


Match found at index 229: x = 683446, y = 5072737
Trace found at index 229.
Filtered data saved with 4176 rows.


In [64]:
df_filtered 

Unnamed: 0,Time (s),Amplitude,Instantaneous_freq,Amplitude_abs,Amplitude_roll_mean,Amplitude_energy
325,0.325,426.359863,107.265458,426.359863,85.271973,1.817827e+05
326,0.326,852.719727,89.530207,852.719727,255.815918,7.271309e+05
327,0.327,1279.079590,71.794955,1279.079590,511.631836,1.636045e+06
328,0.328,1705.439453,54.059704,1705.439453,852.719727,2.908524e+06
329,0.329,1630.199463,51.018726,1630.199463,1178.759619,2.657550e+06
...,...,...,...,...,...,...
4496,4.496,-401.281250,35.729225,401.281250,-1956.241211,1.610266e+05
4497,4.497,275.878906,39.847800,275.878906,-1198.825195,7.610917e+04
4498,4.498,953.039062,43.966375,953.039062,-461.473145,9.082835e+05
4499,4.499,1630.199219,48.084950,1630.199219,255.814941,2.657549e+06


In [65]:
# df_unknown1 = df_unknown1[df_unknown1['Time (s)'] <= 2]
# df_unknown1

## Unknown well -2

The location of Unknown well 2 is:

X = 691636 Unit

Y = 5072723 Unit

Z = 135 Unit


The seismic trace passing through this location is LO-8-87_01._KNGD export.sgy. Let's identify the seismic trace at this location.

In [66]:
# Path to SEG-Y file
path = 'C://Users/sushila/Desktop/Data_shared_with Participants/Data/2D Seismic/Unknown-2/LO-8-87_01._KNGD export.sgy'

# Coordinates to search for
target_x = 691636
target_y = 5072723
target_z = 135

# Read and extract data
with segyio.open(path, "r", strict=False) as f:
    f.mmap()  # Memory map the file for efficient access

    # Extract coordinates from headers
    source_x = np.array(f.attributes(segyio.TraceField.SourceX))
    source_y = np.array(f.attributes(segyio.TraceField.SourceY))

    # Find the trace index that matches the given coordinates (allowing for floating-point precision)
    trace_index = None
    for i, (x, y) in enumerate(zip(source_x, source_y)):
        if np.isclose(x, target_x, atol=1e-3) and np.isclose(y, target_y, atol=1e-3): 
            print(f"Match found at index {i}: x = {x}, y = {y}")
            trace_index = i
            break

    if trace_index is None:
        print("No trace found with the given coordinates.")
        trace_data = None
    else:
        print(f"Trace found at index {trace_index}.")
        # Extract the trace data
        trace_data = f.trace[trace_index]
        
        # Retrieve sampling interval (in ms) and calculate time axis
        dt = segyio.dt(f) / 1000  # Convert to milliseconds
        time_axis = np.arange(len(trace_data)) * dt  # Time in milliseconds

# Convert time from milliseconds to seconds
time_axis_seconds = time_axis / 1000  # Now the time is in seconds

# Create a DataFrame with trace amplitude and time in seconds
if trace_data is not None:
    df_unknown2 = pd.DataFrame({
        "Time (s)": time_axis_seconds,  # Time in seconds
        "Amplitude": trace_data
    })

# Calculate instantaneous frequency
instantaneous_freq = instantaneous_frequency(trace_data, sample_rate=4)
print(len(instantaneous_freq))

# Add NaN to the start to align the length
instantaneous_freq_full = np.concatenate(([np.nan], instantaneous_freq))

df_unknown2['Instantaneous_freq'] = instantaneous_freq_full
# Feature engineering for unknown data (match training pipeline)
df_unknown2['Amplitude_abs'] = df_unknown2['Amplitude'].abs()
df_unknown2['Amplitude_roll_mean'] = df_unknown2['Amplitude'].rolling(window=5, min_periods=1).mean()
df_unknown2['Amplitude_energy'] = df_unknown2['Amplitude'] ** 2
# Find the first non-zero amplitude index
first_non_zero_index = df_unknown2[df_unknown2['Amplitude'] != 0].index[0]

# Slice the DataFrame from the first non-zero index onward
df_filtered = df_unknown2.iloc[first_non_zero_index:]

# Save the filtered DataFrame to a CSV file
df_filtered.to_csv('Unknown2.csv', index=False)

print(f"Filtered data saved starting from the first non-zero amplitude. Total rows: {len(df_filtered)}.")


Match found at index 484: x = 691638, y = 5072772
Trace found at index 484.
1250
Filtered data saved starting from the first non-zero amplitude. Total rows: 1238.


In [67]:
# df_unknown2 = df_unknown2[df_unknown2['Time (s)'] <= 2]
# df_unknown2

In [68]:
litho_data = pd.read_csv('Wells_Geology_StratSimpl.csv')

litho_data.rename(columns={'WellID': 'holeid'}, inplace=True)

# Replace hyphen '-' with underscore '_'
litho_data ['holeid'] = litho_data['holeid'].str.replace('-', '_')
litho_data

Unnamed: 0,holeid,From,To,Lithology,Strat_Simplified,Strat_Simplified_Viro,Stratigraphy
0,VIROVITICA_1,0.0,15.0,"humus, sand, clay and gravel",,QUATERNARY,Quaternery layers
1,VIROVITICA_1,15.0,700.0,"clay, sand and marl",,RHOMBOIDE LAYERS,Rhomboide layers
2,VIROVITICA_1,700.0,1750.0,marls and sandstones,,ABICHI LAYERS,Abichi layers
3,VIROVITICA_1,1750.0,2903.0,marls and sandstones,,MIOCENE,Miocene layers
4,VIROVITICA_1,2903.0,2956.0,coarse-grained basaltic volcanic rock,,DOLERITE,Dolerite (Basement rock)
5,VIROVITICA_3ALFA,0.0,86.0,sandy clays,Lonja,Lonja,Lonja formation
6,VIROVITICA_3ALFA,86.0,757.0,sandy clays,Bilogora,Bilogora,Bilogora formation
7,VIROVITICA_3ALFA,757.0,860.0,marly clay and sandstones,Bilogora,Bilogora,Bilogora formation
8,VIROVITICA_3ALFA,860.0,1517.0,marls and sandstones,KLOSTAR IVANIC,KLOSTAR IVANIC,KloStar Ivanic formation (Pepelana and Poljana...
9,VIROVITICA_3ALFA,1517.0,1854.0,marls and argillaceous sandstones,IVANIC GRAD,IVANIC GRAD,Ivanic Grad formation (Okoli sandstones)


In [69]:
litho_data['Lithology'].unique()

array(['humus, sand, clay and gravel', 'clay, sand and marl',
       'marls and sandstones', 'coarse-grained basaltic volcanic rock',
       'sandy clays', 'marly clay and sandstones',
       'marls and argillaceous sandstones',
       'sandy marls and argillaceous sandstones',
       'sandy marls and qaurtz argillaceous sandstones',
       'sandy marls and dolomite',
       'calcareous marls, sandy marls, sandstones and extrusive',
       'limestone dolomite, sandstones (in traces) and extrusive',
       'dolomite, calcite, extrusive, quartz',
       'clays, gravel, clayey sands with coal',
       'clayey sandstone that transitions into sandy marls interbedded with sandstones',
       'sandy marls interbedded with sandstones',
       'argillaceous sandstones interbedded with sandy marls',
       'argillaceous sandstones interbedded with marls', 'sandy marls',
       'marls',
       'sandy marls interbedded with sandstones and  silty conglomerates',
       'marls, calcareous marls inte

In [70]:
litho_data['Lithology'].value_counts()


marls, sandstones                                                                             5
marls and sandstones                                                                          4
silty marls and quartzitic sandstones                                                         3
quartzitic sandstones and silty marls                                                         3
marls, sandstones, silty marl interbedded with quartzitic sandstones, siltstone and shales    2
gravel, clays, clayey sands and poorly carbonized coal                                        2
sandy marls interbedded with sandstones                                                       2
alternations of clay and sandstone                                                            2
clayey marls                                                                                  2
marls and sandstone                                                                           2
sandy clays                             

In [71]:
# Rename some of the columns
lithology_mapping = {
    'marls, sandstones': 'marls and sandstones',
    'marls and sandstones': 'marls and sandstones',
    'silty marls and quartzitic sandstones': 'silty marls and quartzitic sandstones',
    'quartzitic sandstones and silty marls': 'silty marls and quartzitic sandstones',
    'marls, sandstones, silty marl interbedded with quartzitic sandstones, siltstone and shales': 'marls and sandstones',
    'gravel, clays, clayey sands and poorly carbonized coal': 'gravel and clays',
    'sandy marls interbedded with sandstones': 'sandy marls and sandstones',
    'alternations of clay and sandstone': 'clay and sandstones',
    'clayey marls': 'clayey marls',
    'marls and sandstone': 'marls and sandstones',
    'sandy clays': 'clay and sand',
    'clay and sand': 'clay and sand',
    'clay, sandy clays and sand': 'clay and sand',
    'clayey and sandy marls, also with calcareous marls (sandstones in traces)': 'clayey marls',
    'clays, marly clays sandy marls interbedded with sandstones': 'marly clays and sandstones',
    'clastic breccias': 'clastic breccias',
    'silty calcareous marls and calcarenite': 'silty calcareous marls',
    'quartzitic sandstones and siltstone': 'quartzitic sandstones',
    'sandstones': 'sandstones',
    'argillaceous sandstones with silty marls': 'argillaceous sandstones',
    'marly clays, clayey marls, sands and sandstones': 'clays and sandstones',
    'siltsone and sandstone': 'siltstone and sandstone',
    'humus, sand, clay and gravel': 'humus and sand',
    'marls, calcareous marls interbedded with sandstones': 'marls and sandstones',
    'clay, sand and marl': 'clay and sand',
    'sandy marls interbedded with sandstones and silty conglomerates': 'sandy marls and sandstones',
    'coarse-grained basaltic volcanic rock': 'basaltic volcanic rock',
    'marly clay and sandstones': 'marly clays and sandstones',
    'marls and argillaceous sandstones': 'marls and sandstones',
    'sandy marls and argillaceous sandstones': 'sandy marls and argillaceous sandstones',
    'sandy marls and qaurtz argillaceous sandstones': 'sandy marls and argillaceous sandstones',
    'sandy marls and dolomite': 'sandy marls and dolomite',
    'calcareous marls, sandy marls, sandstones and extrusive': 'calcareous marls and sandstones',
    'limestone dolomite, sandstones (in traces) and extrusive': 'limestone and sandstones',
    'dolomite, calcite, extrusive, quartz': 'dolomite and quartz',
    'clays, gravel, clayey sands with coal': 'clays and gravel',
    'clayey sandstone that transitions into sandy marls interbedded with sandstones': 'clayey sandstone and sandy marls',
    'argillaceous sandstones interbedded with sandy marls': 'argillaceous sandstones and sandy marls',
    'argillaceous sandstones interbedded with marls': 'argillaceous sandstones and marls',
    'sandy marls': 'sandy marls',
    'marls': 'marls',
    'siltsone, sandstone and limestones': 'siltstone and sandstone'
}


In [72]:
# Apply the mapping to standardize lithology names
litho_data['Lithology'] = litho_data['Lithology'].replace(lithology_mapping)


In [73]:
litho_data['Strat_Simplified_Viro'].value_counts()

KLOSTAR IVANIC       15
MOSLAVACKA GORA      15
IVANIC GRAD          10
Bilogora              6
Lonja                 4
PRECEC                4
QUATERNARY            1
RHOMBOIDE LAYERS      1
ABICHI LAYERS         1
MIOCENE               1
DOLERITE              1
TERTIARY BASEMENT     1
Name: Strat_Simplified_Viro, dtype: int64

In [74]:
litho_data = litho_data[['holeid', 'From', 'To', 'Lithology','Strat_Simplified_Viro']]


In [75]:
# Initialize LabelEncoders
lithology_encoder = LabelEncoder()
strat_simplified_encoder = LabelEncoder()

# Fit encoders using unique categories from litho_data
lithology_encoder.fit(litho_data['Lithology'])
strat_simplified_encoder.fit(litho_data['Strat_Simplified_Viro'])

# Encode Lithology and Strat_Simplified_Viro for litho_data
litho_data['Lithology_label'] = lithology_encoder.transform(litho_data['Lithology'])
litho_data['Strat_Simplified_Viro_label'] = strat_simplified_encoder.transform(litho_data['Strat_Simplified_Viro'])

litho_data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  litho_data['Lithology_label'] = lithology_encoder.transform(litho_data['Lithology'])


Unnamed: 0,holeid,From,To,Lithology,Strat_Simplified_Viro,Lithology_label,Strat_Simplified_Viro_label
0,VIROVITICA_1,0.0,15.0,humus and sand,QUATERNARY,14,9
1,VIROVITICA_1,15.0,700.0,clay and sand,RHOMBOIDE LAYERS,6,10
2,VIROVITICA_1,700.0,1750.0,marls and sandstones,ABICHI LAYERS,17,0
3,VIROVITICA_1,1750.0,2903.0,marls and sandstones,MIOCENE,17,6
4,VIROVITICA_1,2903.0,2956.0,basaltic volcanic rock,DOLERITE,3,2
5,VIROVITICA_3ALFA,0.0,86.0,clay and sand,Lonja,6,5
6,VIROVITICA_3ALFA,86.0,757.0,clay and sand,Bilogora,6,1
7,VIROVITICA_3ALFA,757.0,860.0,marly clays and sandstones,Bilogora,18,1
8,VIROVITICA_3ALFA,860.0,1517.0,marls and sandstones,KLOSTAR IVANIC,17,4
9,VIROVITICA_3ALFA,1517.0,1854.0,marls and sandstones,IVANIC GRAD,17,3


In [76]:
label_mappings = {'Lithology_classes': list(lithology_encoder.classes_)}
# Convert to DataFrame and save to CSV
label_mappings_df = pd.DataFrame(label_mappings)
label_mappings_df.to_csv('lithology_mappings.csv', index=False)


label_mappings1 = {'Strat_Simplified_Viro_classes': list(strat_simplified_encoder.classes_)}
# Convert to DataFrame and save to CSV
label_mappings_df1 = pd.DataFrame(label_mappings1)
label_mappings_df1.to_csv('Strat_Simplified_Viro_mappings.csv', index=False)


In [77]:
litho_data.columns

Index(['holeid', 'From', 'To', 'Lithology', 'Strat_Simplified_Viro',
       'Lithology_label', 'Strat_Simplified_Viro_label'],
      dtype='object')

In [78]:
# Create a list to store the resampled data
resampled_data = []

# Iterate over each row in litho_data to generate depth range
depth_interval = 0.1  # Depth interval for resampling

for _, row in litho_data.iterrows():
    # Generate a depth range from 'From' to 'To' with a step size of 0.1
    depth_range = np.arange(row['From'], row['To'], depth_interval)
    
    # For each depth in the range, repeat the lithology and stratigraphy values
    for depth in depth_range:
        resampled_data.append({
            'holeid': row['holeid'],
            'Depth': depth,
            'Lithology': row['Lithology'],
            'Strat_Simplified_Viro': row['Strat_Simplified_Viro'],
            'Lithology_label': row['Lithology_label'],
            'Strat_Simplified_Viro_label': row['Strat_Simplified_Viro_label']
        })

# Create a new DataFrame with the resampled data
resampled_litho_data = pd.DataFrame(resampled_data)

resampled_litho_data['Depth'] = resampled_litho_data['Depth'].round(1)


resampled_litho_data.to_csv('resampled_litho_data.csv', index = False)


In [79]:
resampled_litho_data

Unnamed: 0,holeid,Depth,Lithology,Strat_Simplified_Viro,Lithology_label,Strat_Simplified_Viro_label
0,VIROVITICA_1,0.0,humus and sand,QUATERNARY,14,9
1,VIROVITICA_1,0.1,humus and sand,QUATERNARY,14,9
2,VIROVITICA_1,0.2,humus and sand,QUATERNARY,14,9
3,VIROVITICA_1,0.3,humus and sand,QUATERNARY,14,9
4,VIROVITICA_1,0.4,humus and sand,QUATERNARY,14,9
...,...,...,...,...,...,...
216400,REZOVACCKE_KRCCEVINE_2,4553.5,siltstone and sandstone,TERTIARY BASEMENT,26,11
216401,REZOVACCKE_KRCCEVINE_2,4553.6,siltstone and sandstone,TERTIARY BASEMENT,26,11
216402,REZOVACCKE_KRCCEVINE_2,4553.7,siltstone and sandstone,TERTIARY BASEMENT,26,11
216403,REZOVACCKE_KRCCEVINE_2,4553.8,siltstone and sandstone,TERTIARY BASEMENT,26,11


In [80]:
# Read all the csv files from well
folder_path = r'C:\Users\sushila\Desktop\Hackathon'

files = glob.glob(os.path.join(folder_path, '*ALL.csv'))

# List to store data from each file
data_frames = []

# Loop through each file and read it
for file in files:
    # Extract the part of the filename before 'ALL'
    file_name = os.path.basename(file)
    file_prefix = file_name.split('_ALL')[0]
    
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file)
    
    # Add a new column with the file prefix
    df['holeid'] = file_prefix
    
    # Append the DataFrame to the list
    data_frames.append(df)

combined_data = pd.concat(data_frames, ignore_index=True)
# Lets take only few columns
combined_data = combined_data[['Time (s)', 'Depth',  'Amplitude', 'Instantaneous_freq', 'Amplitude_abs',
       'Amplitude_roll_mean', 'Amplitude_energy', 'holeid']]
#Roundoff the depth so that it can be easily compared with depth from another dataframe
combined_data['Depth'] = combined_data['Depth'].round(1)
combined_data

Unnamed: 0,Time (s),Depth,Amplitude,Instantaneous_freq,Amplitude_abs,Amplitude_roll_mean,Amplitude_energy,holeid
0,0.072,68.6,657.93164,7.228370,657.93164,657.931641,432874.03,LONCCARICA_1
1,0.076,72.5,3289.66400,17.494930,3289.66400,1973.797852,10821890.00,LONCCARICA_1
2,0.080,76.3,6579.32800,20.437056,6579.32800,3508.974609,43287560.00,LONCCARICA_1
3,0.084,80.2,8333.81600,20.070932,8333.81600,4715.185059,69452500.00,LONCCARICA_1
4,0.088,84.1,7237.26170,20.777454,7237.26170,5219.600391,52377956.00,LONCCARICA_1
...,...,...,...,...,...,...,...,...
2400,1.740,3902.7,3813.90230,27.492617,3813.90230,3098.795312,14545851.00,VIROVITICA_3ALFA
2401,1.744,3908.3,-1668.58200,27.660799,1668.58200,3241.816406,2784166.00,VIROVITICA_3ALFA
2402,1.748,3914.5,-7151.06250,28.908798,7151.06250,1477.887500,51137696.00,VIROVITICA_3ALFA
2403,1.752,3919.9,-9296.38300,218.004944,9296.38300,-1525.559375,86422740.00,VIROVITICA_3ALFA


In [81]:
# Sort both dataframes by 'holeid' and 'Depth'
combined_data = combined_data.sort_values(by=['holeid', 'Depth'])
resampled_litho_data = resampled_litho_data.sort_values(by=['holeid', 'Depth'])

# Merge the two dataframes on 'holeid' and 'Depth' to find exact matches
exact_match_data = pd.merge(combined_data, resampled_litho_data[['holeid', 'Depth', 'Lithology', 'Strat_Simplified_Viro','Lithology_label', 'Strat_Simplified_Viro_label']], 
                            on=['holeid', 'Depth'], how='inner')

# Check how many rows had exact depth matches
print(f"Number of exact depth matches: {exact_match_data.shape[0]}")

# Display the first few rows of the exact match data
exact_match_data


Number of exact depth matches: 2401


Unnamed: 0,Time (s),Depth,Amplitude,Instantaneous_freq,Amplitude_abs,Amplitude_roll_mean,Amplitude_energy,holeid,Lithology,Strat_Simplified_Viro,Lithology_label,Strat_Simplified_Viro_label
0,0.072,68.6,657.93164,7.228370,657.93164,657.931641,432874.03,LONCCARICA_1,clays and gravel,Lonja,10,5
1,0.076,72.5,3289.66400,17.494930,3289.66400,1973.797852,10821890.00,LONCCARICA_1,clays and gravel,Lonja,10,5
2,0.080,76.3,6579.32800,20.437056,6579.32800,3508.974609,43287560.00,LONCCARICA_1,clays and gravel,Lonja,10,5
3,0.084,80.2,8333.81600,20.070932,8333.81600,4715.185059,69452500.00,LONCCARICA_1,clays and gravel,Lonja,10,5
4,0.088,84.1,7237.26170,20.777454,7237.26170,5219.600391,52377956.00,LONCCARICA_1,clays and gravel,Lonja,10,5
...,...,...,...,...,...,...,...,...,...,...,...,...
2396,1.740,3902.7,3813.90230,27.492617,3813.90230,3098.795312,14545851.00,VIROVITICA_3ALFA,limestone and sandstones,MOSLAVACKA GORA,15,7
2397,1.744,3908.3,-1668.58200,27.660799,1668.58200,3241.816406,2784166.00,VIROVITICA_3ALFA,limestone and sandstones,MOSLAVACKA GORA,15,7
2398,1.748,3914.5,-7151.06250,28.908798,7151.06250,1477.887500,51137696.00,VIROVITICA_3ALFA,limestone and sandstones,MOSLAVACKA GORA,15,7
2399,1.752,3919.9,-9296.38300,218.004944,9296.38300,-1525.559375,86422740.00,VIROVITICA_3ALFA,limestone and sandstones,MOSLAVACKA GORA,15,7


In [82]:
# Select only the importat columns
exact_match_data = exact_match_data[['Time (s)', 'Amplitude', 'Instantaneous_freq', 'Amplitude_abs',
       'Amplitude_roll_mean', 'Amplitude_energy','Lithology', 'Strat_Simplified_Viro', 'Lithology_label', 'Strat_Simplified_Viro_label']]
exact_match_data

Unnamed: 0,Time (s),Amplitude,Instantaneous_freq,Amplitude_abs,Amplitude_roll_mean,Amplitude_energy,Lithology,Strat_Simplified_Viro,Lithology_label,Strat_Simplified_Viro_label
0,0.072,657.93164,7.228370,657.93164,657.931641,432874.03,clays and gravel,Lonja,10,5
1,0.076,3289.66400,17.494930,3289.66400,1973.797852,10821890.00,clays and gravel,Lonja,10,5
2,0.080,6579.32800,20.437056,6579.32800,3508.974609,43287560.00,clays and gravel,Lonja,10,5
3,0.084,8333.81600,20.070932,8333.81600,4715.185059,69452500.00,clays and gravel,Lonja,10,5
4,0.088,7237.26170,20.777454,7237.26170,5219.600391,52377956.00,clays and gravel,Lonja,10,5
...,...,...,...,...,...,...,...,...,...,...
2396,1.740,3813.90230,27.492617,3813.90230,3098.795312,14545851.00,limestone and sandstones,MOSLAVACKA GORA,15,7
2397,1.744,-1668.58200,27.660799,1668.58200,3241.816406,2784166.00,limestone and sandstones,MOSLAVACKA GORA,15,7
2398,1.748,-7151.06250,28.908798,7151.06250,1477.887500,51137696.00,limestone and sandstones,MOSLAVACKA GORA,15,7
2399,1.752,-9296.38300,218.004944,9296.38300,-1525.559375,86422740.00,limestone and sandstones,MOSLAVACKA GORA,15,7


In [83]:
litho_data.columns

Index(['holeid', 'From', 'To', 'Lithology', 'Strat_Simplified_Viro',
       'Lithology_label', 'Strat_Simplified_Viro_label'],
      dtype='object')

In [84]:
# Define the feature columns
X = exact_match_data[['Time (s)', 'Amplitude', 'Instantaneous_freq'
                       , 'Amplitude_energy']]

# Define the target variables for separate models
y_lithology = exact_match_data['Lithology_label']
y_strat_simplified = exact_match_data['Strat_Simplified_Viro_label']


In [85]:
# Check the size of X (features)
print("Size of X:", X.shape)  # (number of rows, number of columns)

# Check the number of unique classes in y_lithology
print("Number of unique classes in y_lithology:", y_lithology.nunique())

# Check the number of unique classes in y_strat_simplified
print("Number of unique classes in y_strat_simplified:", y_strat_simplified.nunique())


Size of X: (2401, 4)
Number of unique classes in y_lithology: 26
Number of unique classes in y_strat_simplified: 9


In [86]:
# Check class distribution for Lithology_label
print("Lithology_label class distribution:")
print(y_lithology.value_counts())

# Check class distribution for Strat_Simplified_Viro_label
print("\nStrat_Simplified_Viro_label class distribution:")
print(y_strat_simplified.value_counts())


Lithology_label class distribution:
17    714
11    302
22    180
13    152
28    131
6     127
0     122
18     98
8      93
9      80
10     79
15     51
19     50
26     38
7      37
20     31
2      28
25     21
24     21
1      11
23     10
4       9
16      7
27      4
5       3
21      2
Name: Lithology_label, dtype: int64

Strat_Simplified_Viro_label class distribution:
4     731
7     636
1     385
3     333
5     113
8      93
6      67
0      28
11     15
Name: Strat_Simplified_Viro_label, dtype: int64


In [87]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
import numpy as np

# List of classifiers to evaluate
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Split data into training and test sets (80% train, 20% test)
X_train, X_test, y_lithology_train, y_lithology_test = train_test_split(X, y_lithology, test_size=0.2, random_state=42)
_, _, y_strat_simplified_train, y_strat_simplified_test = train_test_split(X, y_strat_simplified, test_size=0.2, random_state=42)

# Reshape y to make sure it's 2D (required by MultiOutputClassifier)
y_lithology_train = y_lithology_train.values.reshape(-1, 1)
y_lithology_test = y_lithology_test.values.reshape(-1, 1)
y_strat_simplified_train = y_strat_simplified_train.values.reshape(-1, 1)
y_strat_simplified_test = y_strat_simplified_test.values.reshape(-1, 1)

# Loop through the classifiers and evaluate them
for name, clf in classifiers.items():
    print(f"\nEvaluating model: {name}")
    
    # For Lithology model
    lithology_model = MultiOutputClassifier(clf)  # MultiOutput for both targets
    lithology_model.fit(X_train, y_lithology_train)
    y_lithology_pred = lithology_model.predict(X_test)
    print("Lithology Model Evaluation:")
    print(classification_report(y_lithology_test, y_lithology_pred[:, 0]))  # Lithology is the first column
    
    # For Strat_Simplified_Viro model
    strat_simplified_model = MultiOutputClassifier(clf)  # MultiOutput for both targets
    strat_simplified_model.fit(X_train, y_strat_simplified_train)
    y_strat_simplified_pred = strat_simplified_model.predict(X_test)
    print("Strat_Simplified_Viro Model Evaluation:")
    print(classification_report(y_strat_simplified_test, y_strat_simplified_pred[:, 0]))  # Strat_Simplified_Viro is the second column



Evaluating model: Random Forest
Lithology Model Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        26
           1       0.00      0.00      0.00         5
           2       1.00      0.38      0.55         8
           6       0.63      0.83      0.72        29
           7       0.20      0.11      0.14         9
           8       0.53      0.47      0.50        19
           9       0.41      0.58      0.48        12
          10       0.85      0.79      0.81        14
          11       0.53      0.45      0.49        64
          13       0.44      0.23      0.30        31
          15       0.75      0.30      0.43        10
          16       0.00      0.00      0.00         0
          17       0.57      0.81      0.67       137
          18       0.77      0.50      0.61        20
          19       0.92      1.00      0.96        11
          20       0.38      0.50      0.43         6
          21       0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Strat_Simplified_Viro Model Evaluation:
              precision    recall  f1-score   support

           0       0.50      0.17      0.25         6
           1       0.75      0.70      0.72        82
           3       0.90      0.81      0.85        74
           4       0.62      0.66      0.64       155
           5       0.64      0.45      0.53        20
           6       0.27      0.40      0.32        10
           7       0.64      0.75      0.69       108
           8       0.62      0.36      0.46        22
          11       0.00      0.00      0.00         4

    accuracy                           0.67       481
   macro avg       0.55      0.48      0.50       481
weighted avg       0.68      0.67      0.67       481


Evaluating model: Decision Tree
Lithology Model Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        26
           1       1.00      0.60      0.75         5
           2       0.80      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [88]:
unknown1 = pd.read_csv('unknown1.csv')
unknown2 = pd.read_csv('unknown2.csv')

In [89]:
unknown1 

Unnamed: 0,Time (s),Amplitude,Instantaneous_freq,Amplitude_abs,Amplitude_roll_mean,Amplitude_energy
0,0.328,1705.439500,54.059704,1705.439500,341.087891,2.908524e+06
1,0.332,1404.479500,41.895794,1404.479500,621.983789,1.972563e+06
2,0.336,100.319336,27.184258,100.319336,642.047656,1.006397e+04
3,0.340,-1203.840800,24.052597,1203.840800,401.279492,1.449233e+06
4,0.344,-1805.760700,20.421970,1805.760700,40.127344,3.260772e+06
...,...,...,...,...,...,...
1039,4.484,-3611.521500,30.679041,3611.521500,3390.814453,1.304309e+07
1040,4.488,-5016.000000,31.741348,5016.000000,1344.287109,2.516026e+07
1041,4.492,-3511.201200,222.052399,3511.201200,-1023.264844,1.232853e+07
1042,4.496,-401.281250,35.729225,401.281250,-2327.425000,1.610266e+05


In [90]:

# Prepare the feature set for unknown1 (same features as during training)
X_unknown1 = unknown1[['Time (s)', 'Amplitude', 'Instantaneous_freq',
                        'Amplitude_energy']]

# Prepare the feature set for unknown2 (same features as during training)
X_unknown2 = unknown2[['Time (s)', 'Amplitude', 'Instantaneous_freq', 
                        'Amplitude_energy']]

# Predict Lithology and Strat_Simplified_Viro for unknown1
unknown1_pred = lithology_model.predict(X_unknown1)
unknown1_strat_pred = strat_simplified_model.predict(X_unknown1)

# Predict Lithology and Strat_Simplified_Viro for unknown2
unknown2_pred = lithology_model.predict(X_unknown2)
unknown2_strat_pred = strat_simplified_model.predict(X_unknown2)

# Add the predictions for both Lithology and Strat_Simplified_Viro to the datasets
unknown1['Lithology_label'] = unknown1_pred[:, 0]  # Lithology predictions for unknown1
unknown1['Strat_Simplified_Viro_label'] = unknown1_strat_pred[:, 0]  # Strat predictions for unknown1

unknown1.to_csv('unknown1_litho.csv', index = False)

unknown2['Lithology_label'] = unknown2_pred[:, 0]  # Lithology predictions for unknown2
unknown2['Strat_Simplified_Viro_label'] = unknown2_strat_pred[:, 0]  # Strat predictions for unknown2

unknown2.to_csv('unknown2_litho.csv', index = False)
