# Create CSV database

This converts the `.dat` and `.hea` files in the `ctu-data/` folder into `.csv` files stored in the `ctu-data_csv/` folder.

Code adapted from: https://github.com/fabiom91/CTU-CHB_Physionet.org/blob/master/create_csv_database.ipynb

## Import required pacakges

In [2]:
import math
from os import listdir
import pandas as pd
from tqdm import tqdm
import wfdb

import warnings
warnings.filterwarnings("ignore")

## Define functions

In [3]:
def get_all_records():
    '''
    Get list of record names e.g. ['1347', '2040', '1054', ...]
    '''
    rec_list = []
    for file in listdir("ctu-data/"):
        rec = file[:file.find('.')] # 获取文件名的前缀，即文件名中 . 前面的部分
        try:
            rec = int(rec) # 将前缀转换为整数，如果成功则添加到 rec_list 列表中
            rec_list.append(rec)
        except:
            # 如果转换失败，忽略该记录
            pass
    rec_list = [str(i) for i in rec_list] # 将所有记录转换为字符串列表
    return rec_list


def consec_repeat(repeats, max_value, len):
    '''
    Creates list with consecutive numbers, where each number repeats a specified
    amount of times
    Inputs:
    - repeats - int, number of times to repeat each number
    - max_value - int, maximum number to reach
    - len - int, length of list (as may be to a quarter second so not 4 end rep)
    '''
    # Round up max_value to nearest integer if required
    max_value = math.ceil(max_value) # 将 max_value 向上取整
    # Create list of repeated consecutive numbers
    list = [x//repeats for x in range((max_value+1)*repeats)] # 创建一个列表，其中每个数字重复 repeats 次
    # Trim list to desired length (to deal with odd quarter seconds and with
    # the zero-based indexing (whilst max_value will be one higher))
    list = list[:len] # 将列表裁剪到指定的长度 len
    return(list)


def create_signals_database(rec):
    '''
    Read the signal files (with FHR and UC), add tme in seconds and minutes,
    and save to csv file.
    Inputs:
    - rec - name of record (e.g. '1347')
    '''
    # Read signal file and save to dataframe
    sample = wfdb.rdsamp("ctu-data/%s" % rec) # 读取指定记录的信号文件
    df = pd.DataFrame(sample[0], columns=['FHR','UC']) # 将信号数据转换为 DataFrame，并设置列名
    df.index.name = 'quarter_second' # 将索引名设置为 "quarter_second"
    # Find the length of the record in quarter seconds, seconds and minutes
    q_sec = len(df.index) # 计算记录的长度（以四分之一秒为单位）
    sec = q_sec/4
    min = sec/60
    # Add column with time in seconds and minutes
    df['second'] = consec_repeat(repeats=4, max_value=int(sec), len=q_sec) # 添加表示时间（秒）的列
    df['minute'] = consec_repeat(repeats=4*60, max_value=int(min), len=q_sec) # 添加表示时间（分钟）的列
    # Save to csv file
    df.to_csv('ctu-data_csv/%s.csv' % rec) # 将 DataFrame 保存为 CSV 文件


def create_ann_dataframe(rec):
    '''
    Read the metadata and save as dataframe
    Inputs:
    - rec - name of record (e.g. '1347')
    '''
    sample = wfdb.rdsamp("ctu-data/%s" % rec)
    ann = sample[1]['comments'][1:] # 获取信号文件中的注释
    ann_dic = {} # 初始化一个空字典，用于存储注释
    for a in ann:
        if '--' in a: # 移除包含 '--' 的注释
            ann.remove(a)

    for a in ann:
        key = a[:a.find('  ')]
        if a.find('  ') == -1:
            key = a[:a.find(' ')]
        inv = a[::-1]
        value = inv[:inv.find(' ')][::-1]
        value = float(value)
        ann_dic[key] = [value]
        
    df1 = pd.DataFrame.from_dict(ann_dic).T
    df1 = df1.rename(columns={0:rec})
    return df1


def append_ann_dataframes(df,df1):
    '''
    Join together the meta data from each record into a single file
    Inputs:
    df - dataframe, to be combined
    df1 - dataframe, to be combined
    '''
    rec = df1.columns[0]
    df[rec] = df1[rec] # 将 df1 的数据添加到 df 中，列名为记录名
    return df

## Convert the files

In [4]:
df = pd.DataFrame()
for rec in tqdm(get_all_records()):
    create_signals_database(rec)
    df = append_ann_dataframes(df,create_ann_dataframe(rec))
df.to_csv('ctu-data_csv/metadata.csv', index_label='parameter')

print('DONE!')

100%|██████████| 1104/1104 [00:29<00:00, 37.43it/s]

DONE!



