In [33]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import numpy as np
from sklearn.model_selection import train_test_split 

import warnings
warnings.filterwarnings("ignore")

# 1 获取数据

## 1.1 读取数据

In [34]:
# Registries filled by read_data(); the two lists are kept parallel.
datalist = []  # raw DataFrames, one per loaded workbook
dataid = []    # workbook numbers, in the order they were loaded


def read_data(num, data_dir='origin data'):
    """Load one '<num>#.xlsx' workbook and register it in datalist / dataid.

    Only the columns at positions 0, 1, 2 and 8 are read; they are renamed to
    'hsid', 'tunnel1', 'tunnel2' and 'time'. A summary of the loaded frame is
    printed via DataFrame.info().

    Args:
        num: Workbook number; the file read is '<data_dir>/<num>#.xlsx'.
        data_dir: Directory that holds the workbooks.

    Returns:
        None. The frame is stored in the module-level ``datalist`` and its
        number in ``dataid``.
    """
    data = pd.read_excel(
        f'{data_dir}/{num}#.xlsx',
        usecols=[0, 1, 2, 8],
        names=['hsid', 'tunnel1', 'tunnel2', 'time'],
    )

    # info() writes the summary to stdout itself and returns None, so wrapping
    # it in print() only added a stray "None" line to the cell output.
    data.info()

    # Re-running a load cell replaces the earlier entry instead of appending a
    # duplicate, so downstream loops never see the same workbook twice.
    if num in dataid:
        datalist[dataid.index(num)] = data
    else:
        datalist.append(data)
        dataid.append(num)

In [35]:
# Load workbook 30# and register it in datalist / dataid.
read_data(30)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65535 entries, 0 to 65534
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   hsid     65535 non-null  int64  
 1   tunnel1  65535 non-null  float64
 2   tunnel2  65535 non-null  float64
 3   time     65535 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 2.0+ MB
None


In [36]:
# Load workbook 35# and register it in datalist / dataid.
read_data(35)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118834 entries, 0 to 118833
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   hsid     118834 non-null  int64  
 1   tunnel1  118834 non-null  float64
 2   tunnel2  118834 non-null  float64
 3   time     118834 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 3.6+ MB
None


In [37]:
# Load workbook 40# and register it in datalist / dataid.
read_data(40)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118819 entries, 0 to 118818
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   hsid     118819 non-null  int64  
 1   tunnel1  118819 non-null  float64
 2   tunnel2  118819 non-null  float64
 3   time     118819 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 3.6+ MB
None


In [38]:
# Load workbook 55# and register it in datalist / dataid.
read_data(55)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118830 entries, 0 to 118829
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   hsid     118830 non-null  int64  
 1   tunnel1  118830 non-null  float64
 2   tunnel2  118830 non-null  float64
 3   time     118830 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 3.6+ MB
None


In [39]:
# Load workbook 75# and register it in datalist / dataid.
read_data(75)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118806 entries, 0 to 118805
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   hsid     118806 non-null  int64  
 1   tunnel1  118806 non-null  float64
 2   tunnel2  118806 non-null  float64
 3   time     118806 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 3.6+ MB
None


In [40]:
# Load workbook 90# and register it in datalist / dataid.
read_data(90)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118809 entries, 0 to 118808
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   hsid     118809 non-null  int64  
 1   tunnel1  118809 non-null  float64
 2   tunnel2  118809 non-null  float64
 3   time     118809 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 3.6+ MB
None


## 1.2 预处理

In [41]:
def preprocess_support_data(data, freq='5min'):
    """Resample one raw table onto a regular time grid and add the tunnel mean.

    Steps: parse the 'time' column, average all columns per ``freq`` bin,
    fill empty bins by time-weighted interpolation, then add
    'tunnel_average' as the mean of 'tunnel1' and 'tunnel2'.

    Args:
        data: Raw frame with 'time', 'tunnel1' and 'tunnel2' columns.
        freq: Resampling frequency. '5min' is the non-deprecated spelling of
            the former '5T' alias.

    Returns:
        A new DataFrame indexed by time; ``data`` itself is left untouched.
    """
    # assign() builds a copy, so the raw frame keeps its original 'time' column.
    indexed = data.assign(time=pd.to_datetime(data['time'])).set_index('time')
    # Resample to a fixed grid.
    resampled = indexed.resample(freq).mean()
    # Fill the bins that received no samples.
    filled = resampled.interpolate(method='time')
    # Average of the two tunnel columns.
    filled['tunnel_average'] = (filled.tunnel1 + filled.tunnel2) / 2
    return filled


prep_data = [preprocess_support_data(data) for data in datalist]

In [61]:
# 创建线图
# Select the support by its id rather than by list position, so the title and
# the plotted frame cannot drift apart. prep_data follows the load order in
# dataid, and position 4 was not the 90# workbook in the load order used above.
support_id = 90
support_frame = prep_data[dataid.index(support_id)]
title = f'{support_id}# Coal Mining Hydraulic Support'

# Line chart of both tunnel columns over time.
fig = px.line(support_frame, y=['tunnel1', 'tunnel2'], title=title)

# Layout and styling, applied in a single call.
fig.update_layout(
    template='plotly_white',  # white-background template
    title={'text': title, 'x': 0.5, 'xanchor': 'center'},  # centred title
    xaxis_title="Time",
    yaxis_title="Value",
    legend_title="Tunnel",
    font=dict(
        family="Times New Roman, Times, serif",  # serif font for academic figures
        size=12,
        color="Black"
    ),
    # Legend position and border.
    legend=dict(
        x=1.01,
        y=1.01,
        bordercolor="Black",
        borderwidth=1
    ),
    # Figure size; adjust or drop as needed.
    width=800,
    height=400,
)

# Render the figure.
fig.show()

## 1.3 归一化（Min-Max）

In [43]:
# Per-frame [min, max] of the averaged tunnel series, in the same order as prep_data.
min_max = [
    [frame.tunnel_average.min(), frame.tunnel_average.max()]
    for frame in prep_data
]

In [44]:
# Min-max scale each frame's averaged series with its own bounds from min_max.
norm_data = []
for frame, (lower, upper) in zip(prep_data, min_max):
    scaled = (frame.tunnel_average - lower) / (upper - lower)
    norm_data.append(scaled)

In [45]:
# Quick visual check of one normalised series (the third entry of norm_data).
normalized_series = norm_data[2]
fig = px.line(normalized_series)
fig.show()

# 2 生成数据集

In [55]:
def generate_subsequences(time_series, sub_length=288, step=12):
    """Cut a time series into fixed-length sliding windows.

    Window ``k`` covers ``time_series[k * step : k * step + sub_length]``.
    Only complete windows are produced, so trailing samples that cannot fill
    a whole window are dropped.

    Args:
        time_series: Sequence or array sliced along its first axis.
        sub_length: Number of samples per window.
        step: Offset between the starts of consecutive windows.

    Returns:
        np.ndarray whose first axis indexes the windows and whose second axis
        has length ``sub_length``. If the series is shorter than
        ``sub_length`` the result is an empty array that still has
        ``sub_length`` as its second dimension, so it can be concatenated
        with the window arrays of other series.

    Raises:
        ValueError: If ``sub_length`` or ``step`` is not positive.
    """
    if sub_length <= 0 or step <= 0:
        raise ValueError(
            f"sub_length and step must be positive, got "
            f"sub_length={sub_length}, step={step}"
        )

    series = np.asarray(time_series)
    # Largest start index that still leaves room for a full window.
    last_start = len(series) - sub_length
    if last_start < 0:
        # Keep the window dimension so np.concatenate with non-empty results
        # works; np.array([]) would have shape (0,) and fail there.
        return np.empty((0, sub_length) + series.shape[1:], dtype=series.dtype)

    # Every start in this range yields a complete window, so no bounds check
    # is needed inside the loop.
    return np.array(
        [series[start:start + sub_length] for start in range(0, last_start + 1, step)]
    )

def create_dataset(step):
    """Window every normalised series and save the stacked result.

    Each series in ``norm_data`` is cut into 288-sample windows whose starts
    are ``step`` samples apart. The windows of all series are concatenated
    along the first axis, the resulting shape is printed, and the array is
    written to 'dataset/dataset_<step>.npy'.

    Args:
        step: Offset between the starts of consecutive windows.

    Returns:
        None.
    """
    from pathlib import Path

    # Named so it does not shadow the notebook-level ``datalist`` of raw frames.
    windows_per_series = [
        generate_subsequences(series.values, 288, step=step)
        for series in norm_data
    ]
    # A series too short for a single window contributes nothing; dropping it
    # keeps np.concatenate from failing on a mismatched empty array.
    windows_per_series = [w for w in windows_per_series if len(w)]
    dataset = np.concatenate(windows_per_series)
    print(dataset.shape)

    # np.save does not create missing directories, so make sure the target exists.
    out_dir = Path('dataset')
    out_dir.mkdir(parents=True, exist_ok=True)
    np.save(out_dir / f'dataset_{step}.npy', dataset)

In [56]:
# Windows start every 12 samples (1 hour at the 5-minute resampling grid).
create_dataset(12)

(8164, 288)


In [57]:
# Windows start at every sample (maximum overlap between consecutive windows).
create_dataset(1)

(97913, 288)


In [58]:
# Step equals the 288-sample window length, so the windows do not overlap.
create_dataset(288)

(345, 288)


In [59]:
# Step is half the 288-sample window length, so consecutive windows overlap by 50%.
create_dataset(144)

(685, 288)


In [60]:
# Step is a quarter of the 288-sample window length, so consecutive windows overlap by 75%.
create_dataset(72)

(1365, 288)
