In [None]:
import pandas as pd

import matplotlib.pyplot as plt
# Global matplotlib styling: applies to every figure drawn after this cell runs.
plt.rcParams.update({
    'font.family':'Times New Roman', 
    'font.size': 14,  # base text size; titles, labels, ticks and legend are set explicitly below
    'axes.titlesize': 16,
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 14,
})
import seaborn as sns
import plotly.express as px
import numpy as np
from sklearn.model_selection import train_test_split 

import warnings
import pickle
warnings.filterwarnings("ignore")

# 1. Load Data

## I. Coal mine A

In [None]:
# Module-level accumulators filled by read_data_A(): list_A[i] is the frame
# loaded for the support whose number is id_A[i]. Re-running this cell resets both.
list_A = []
id_A = []

In [None]:
def read_data_A(num):
    """Load one coal-mine-A support CSV and register it in the notebook state.

    Reads ``origin data/22-3 coal mine A hp{num}.csv``, skips its first row and
    labels the first three columns ``tunnel1``, ``tunnel2``, ``time`` (in that
    order). The ``time`` column is parsed to datetimes and becomes the index.

    NOTE(review): ``time`` is taken from the *third* column here, whereas
    ``read_data_B`` takes it from the first. Presumably the two mines export
    different layouts; confirm against the raw files.

    Parameters
    ----------
    num
        Identifier interpolated into the file name (the ``hp{num}`` part).

    Side effects
    ------------
    Appends the frame to the module-level ``list_A`` and ``num`` to ``id_A``.
    Returns nothing. Calling it again with the same ``num`` appends a
    duplicate entry.
    """
    csv_path = f'origin data/22-3 coal mine A hp{num}.csv'
    column_names = ['tunnel1', 'tunnel2', 'time']

    frame = pd.read_csv(
        csv_path,
        usecols=[0, 1, 2],
        names=column_names,
        skiprows=1,
    ).set_index("time")
    frame.index = pd.to_datetime(frame.index)

    list_A.append(frame)
    id_A.append(num)

def show_data(data, name):
    """Plot every column of ``data`` as a line, save the figure, and summarise the frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are drawn against its index. Two line colours are
        supplied, matching the two pressure columns produced by the loaders in
        this notebook.
    name : str
        Label inserted into the plot title and used as the stem of the saved
        file (``{name}.svg`` in the current working directory).

    Side effects
    ------------
    Writes ``{name}.svg``, displays the figure, and prints the
    ``DataFrame.info()`` report to stdout. Returns nothing.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    colors = ['#FF6B6B', '#4ECDC4']
    data.plot(ax=ax, color=colors)
    ax.set_xlabel('Time')
    ax.set_ylabel('Pressure (MPa)')
    ax.set_title(f'Coal Mine {name}# Hydraulic Support Pressure Data')
    ax.legend(loc='upper right')
    # Save before showing: some backends release the figure once it is shown.
    fig.savefig(f'{name}.svg', format='svg', bbox_inches='tight')
    plt.show()
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() emitted an extra "None" line after every summary.
    data.info()

In [None]:
# Load and plot every mine-A support in one pass instead of one copy-pasted
# cell pair per support.
for hp_id in (30, 35, 40, 55, 75, 90):
    read_data_A(hp_id)
    # list_A[-1] is the frame appended by the call just above. A hard-coded
    # position (list_A[0], list_A[1], ...) silently pairs a plot title with the
    # wrong support once any load cell has been re-run; the mine-B section
    # already uses the [-1] convention.
    show_data(list_A[-1], f'A {hp_id}')

## II. Coal mine B

In [None]:
# Module-level accumulators filled by read_data_B(): list_B[i] is the frame
# loaded for the support whose number is id_B[i]. Re-running this cell resets both.
list_B = []
id_B = []
def read_data_B(num):
    """Load one coal-mine-B support CSV and register it in the notebook state.

    Reads ``origin data/22-4 coal mine B hp{num}.csv``, skips its first row and
    labels the first three columns ``time``, ``tunnel1``, ``tunnel2`` (in that
    order). The ``time`` column is parsed to datetimes and becomes the index.

    Parameters
    ----------
    num
        Identifier interpolated into the file name (the ``hp{num}`` part).

    Side effects
    ------------
    Appends the frame to the module-level ``list_B`` and ``num`` to ``id_B``.
    Returns nothing. Calling it again with the same ``num`` appends a
    duplicate entry.
    """
    csv_path = f'origin data/22-4 coal mine B hp{num}.csv'
    column_names = ['time', 'tunnel1', 'tunnel2']

    frame = pd.read_csv(
        csv_path,
        usecols=[0, 1, 2],
        names=column_names,
        skiprows=1,
    ).set_index("time")
    frame.index = pd.to_datetime(frame.index)

    list_B.append(frame)
    id_B.append(num)

In [None]:
# Load each mine-B support and plot it immediately after loading; list_B[-1]
# is always the frame appended by the read_data_B call on the line above.
for hp_id in (1, 5, 9, 15, 20, 25, 30, 35):
    read_data_B(hp_id)
    show_data(list_B[-1], f'B {hp_id}')


# 2. Preprocess

In [None]:
def resample_and_average(dataframes_list, resample_freq='5min'):
    """Resample each frame onto a regular time grid and add a row-wise mean column.

    For every input frame the rows are binned by ``resample_freq`` (mean of
    each bin), an ``average`` column holding the mean across the resampled
    columns is added, and remaining gaps are filled with time-weighted linear
    interpolation.

    Parameters
    ----------
    dataframes_list : iterable of pandas.DataFrame
        Frames indexed by timestamps, or by values ``pd.to_datetime`` can
        parse. The input frames are not modified.
    resample_freq : str, default '5min'
        Pandas offset alias for the bin width. ``'5min'`` is the same
        five-minute bin the notebook previously spelled ``'5T'``; the ``T``
        alias is deprecated in recent pandas releases.

    Returns
    -------
    list of pandas.DataFrame
        One resampled frame per input, in input order. An empty input yields
        an empty list.

    Notes
    -----
    ``average`` is computed *before* interpolation and the row-wise mean skips
    NaN, so a bin in which only one column has data takes that column's value
    as its average. Interpolation fills forward in time only, so NaN rows at
    the very start of a frame (if any) are left as NaN.
    """
    resampled_list = []
    for df in dataframes_list:
        if not pd.api.types.is_datetime64_any_dtype(df.index):
            # Convert on a shallow copy so the caller's frame keeps its own
            # index instead of being silently rewritten in place.
            df = df.copy(deep=False)
            df.index = pd.to_datetime(df.index)

        resampled_df = df.resample(resample_freq).mean()
        resampled_df['average'] = resampled_df.mean(axis=1)
        resampled_df = resampled_df.interpolate(method='time')

        resampled_list.append(resampled_df)

    return resampled_list

In [None]:
# Mine A: resample every loaded frame with the function's default bin width.
resam_list_A = resample_and_average(list_A)

In [None]:
# Mine B: resample every loaded frame with the function's default bin width.
resam_list_B = resample_and_average(list_B)

In [None]:
# Interactive view of both resampled pressure channels for one mine-B support.
support_pos = 4  # position of the support within resam_list_B / id_B

# Build the title from the id recorded at load time. The previous hard-coded
# '90#' did not describe the plotted series: resam_list_B[4] is the fifth
# mine-B support loaded (20 when the load cells run in order), and 90 is a
# mine-A support number.
# NOTE(review): if mine-A support 90 was the intended subject, plot
# resam_list_A instead and take the id from id_A.
title = f'{id_B[support_pos]}# Coal Mining Hydraulic Support'
fig = px.line(resam_list_B[support_pos], y=['tunnel1', 'tunnel2'], title=title)

fig.update_layout(
    template='plotly_white',
    title={'text': title, 'x': 0.5, 'xanchor': 'center'},  # centre the title
    xaxis_title="Time",
    yaxis_title="Value",
    legend_title="Tunnel",
    font=dict(
        family="Times New Roman, Times, serif",
        size=12,
        color="Black"
    ),
    # Place the boxed legend just outside the top-right corner of the plot area.
    legend=dict(
        x=1.01,
        y=1.01,
        bordercolor="Black",
        borderwidth=1
    ),
    width=800,
    height=400,
)

fig.show()

In [None]:
# Mine A only: drop every row stamped 15-22 April (inclusive, in any year)
# before extracting the series.
# NOTE(review): the reason for excluding this window is not recorded in the
# notebook; presumably a known bad-data period. Confirm.
processed_df_list_A = []
for df in resam_list_A:
    in_excluded_window = (
        (df.index.month == 4)
        & (df.index.day >= 15)
        & (df.index.day <= 22)
    )
    df = df.loc[~in_excluded_window]
    processed_df_list_A.append(df)

# Keep only the row-wise mean pressure of each support, as plain 1-D arrays.
ndarray_list_A = [frame['average'].to_numpy() for frame in processed_df_list_A]

# Mine B is used unfiltered.
ndarray_list_B = [frame['average'].to_numpy() for frame in resam_list_B]

In [None]:
# Quick visual check of one averaged mine-B series (position 3 in ndarray_list_B,
# i.e. the support recorded as id_B[3]); the x axis is the sample position.
px.line(ndarray_list_B[3])

In [None]:
# Function to perform min-max normalization on a numpy array
def min_max_normalize(arr):
    """Linearly rescale ``arr`` so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    arr : array-like of numbers

    Returns
    -------
    numpy.ndarray
        ``(arr - min) / (max - min)``, same shape as ``arr``.

        * A constant array (``max == min``) returns all zeros rather than the
          NaNs produced by 0/0. Warnings are silenced notebook-wide, so the
          NaNs would otherwise go unnoticed.
        * An empty array is returned as an empty float array instead of
          raising inside ``np.min``.
        * NaN values are not skipped: any NaN in ``arr`` makes the min and max
          NaN, and therefore the whole result NaN.
    """
    arr = np.asarray(arr)
    if arr.size == 0:
        return arr.astype(float)

    lowest = np.min(arr)
    span = np.max(arr) - lowest
    if span == 0:
        return np.zeros_like(arr, dtype=float)
    return (arr - lowest) / span

def split_normal(ndarray_list):
    """Cut every series into 60 % / 20 % / 20 % chronological parts and scale each part.

    Parameters
    ----------
    ndarray_list : iterable of 1-D arrays
        One series per support.

    Returns
    -------
    (train, valid, test) : tuple of three lists
        Each list holds one array per input series, in input order: the first
        60 %, the next 20 %, and the final 20 % of the samples (cut points are
        ``int(n * 0.6)`` and ``int(n * 0.8)``).

    NOTE(review): every part is passed through ``min_max_normalize`` on its
    own, so train, validation and test are each scaled by their *own* min and
    max rather than by the training statistics. Confirm that this is intended
    before comparing values across the three parts.
    """
    train, valid, test = [], [], []

    for series in ndarray_list:
        total = len(series)
        train_end = int(total * 0.6)
        valid_end = int(total * 0.8)

        train.append(min_max_normalize(series[:train_end]))
        valid.append(min_max_normalize(series[train_end:valid_end]))
        test.append(min_max_normalize(series[valid_end:]))

    return train, valid, test

In [None]:
# Mine A: chronological 60/20/20 split, each part min-max scaled (see split_normal).
train_A, valid_A, test_A = split_normal(ndarray_list_A)

In [None]:
# Sanity check: sizes of the three parts for the second mine-A series.
train_A[1].shape, valid_A[1].shape, test_A[1].shape

In [None]:
# Mine B: chronological 60/20/20 split, each part min-max scaled (see split_normal).
train_B, valid_B, test_B = split_normal(ndarray_list_B)

In [None]:
# Sanity check: sizes of the three parts for the second mine-B series.
train_B[1].shape, valid_B[1].shape, test_B[1].shape

# 3. Dataset

In [None]:
def generate_subsequences(data_list, sub_length=288, ratio=0.5):
    """Cut every series into fixed-length, overlapping windows.

    Parameters
    ----------
    data_list : iterable of 1-D arrays
        The series to window. Series shorter than ``sub_length`` contribute
        no windows.
    sub_length : int, default 288
        Number of samples in each window.
    ratio : float, default 0.5
        Fraction of a window shared with the next one. Consecutive windows
        start ``floor(sub_length * (1 - ratio))`` samples apart.

    Returns
    -------
    numpy.ndarray
        All windows from all series stacked in order, shape
        ``(n_windows, sub_length)``. When no window fits, an empty array of
        shape ``(0, sub_length)`` is returned so callers can always rely on
        the second dimension.

    Raises
    ------
    ValueError
        If the resulting step is smaller than one sample (for example
        ``ratio >= 1``), which previously surfaced as ``ZeroDivisionError`` or
        as a negative step.
    """
    # Round away binary floating-point noise before truncating: without this,
    # 10 * (1 - 0.8) evaluates to 1.9999999999999996 and the step becomes 1
    # instead of the intended 2.
    step = int(round(sub_length * (1 - ratio), 9))
    if step < 1:
        raise ValueError(
            f"sub_length={sub_length} with ratio={ratio} gives a step of {step}; "
            "the step between windows must be at least 1 sample"
        )

    all_subsequences = []
    for time_series in data_list:
        # Largest start index at which a full window still fits.
        last_start = len(time_series) - sub_length
        for start_index in range(0, last_start + 1, step):
            all_subsequences.append(time_series[start_index:start_index + sub_length])

    if not all_subsequences:
        return np.empty((0, sub_length))
    return np.array(all_subsequences)

In [None]:
# Window the mine-B training series with the default window length and 75 %
# overlap, then show the shape of the stacked result.
train_B_dataset = generate_subsequences(train_B, ratio=0.75)
train_B_dataset.shape

In [None]:
# Window the mine-A training series with the default window length and 75 %
# overlap, then show the shape of the stacked result.
train_A_dataset = generate_subsequences(train_A, ratio=0.75)
train_A_dataset.shape

# 4. Save Data

In [None]:
def save_to_file(data, filename):
    """Pickle ``data`` to ``filename``, creating the parent directory if needed.

    Parameters
    ----------
    data : any picklable object
    filename : str or path-like
        Destination path. An existing file is overwritten.

    Without the directory creation, a fresh checkout that has no ``dataset/``
    folder fails with ``FileNotFoundError`` on the first save.
    """
    import os  # local import: the notebook's import cell does not bring in os

    parent_dir = os.path.dirname(filename)
    if parent_dir:  # empty when filename has no directory component
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'wb') as f:
        pickle.dump(data, f)

# Destination path -> object to pickle. Each value is the list of per-support
# arrays returned by split_normal for that mine and split.
# NOTE(review): the windowed arrays train_A_dataset / train_B_dataset built in
# the previous section are not written here. Confirm that is intended.
datasets = {
    'dataset/train_A.pkl': train_A,
    'dataset/valid_A.pkl': valid_A,
    'dataset/test_A.pkl': test_A,
    'dataset/train_B.pkl': train_B,
    'dataset/valid_B.pkl': valid_B,
    'dataset/test_B.pkl': test_B
}

# Write every entry to its own pickle file.
for filename, data in datasets.items():
    save_to_file(data, filename)