In [1]:
# --- 匯入套件 ---
import os
import glob
import pandas as pd
import numpy as np
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# --- Step 1: 讀取多個 CSV ---
csv_folder = "./dance_csv/"  # 你的 CSV 資料夾
csv_files = glob.glob(os.path.join(csv_folder, "*.csv"))
print(f"找到 {len(csv_files)} 個 CSV 檔案")

找到 251 個 CSV 檔案


In [3]:
def parse_point(s):
    s = s.strip("()")
    parts = [float(p.strip(" '")) for p in s.split(",")]
    return parts

def load_pose_csv(path):
    df = pd.read_csv(path)
    pose_cols = [c for c in df.columns if c != "frame"]
    poses = []
    for i, row in df.iterrows():
        pose = []
        for c in pose_cols:
            pose += parse_point(row[c])
        poses.append(pose)
    poses = np.array(poses)
    delta_poses = np.diff(poses, axis=0)
    delta_poses = np.vstack([delta_poses, np.zeros_like(delta_poses[0])])
    return delta_poses

In [4]:
# --- Step 2: 收集所有資料以便標準化 ---
all_data = []
file_data = {}
for path in csv_files:
    delta = load_pose_csv(path)
    if len(delta) < 10:
        print(f"⚠️ 檔案太短，略過：{os.path.basename(path)}（只有 {len(delta)} 幀）")
        continue
    all_data.append(delta)
    file_data[path] = delta

all_data = np.concatenate(all_data, axis=0)
print("總資料 shape:", all_data.shape)

scaler = StandardScaler()
scaler.fit(all_data)

⚠️ 檔案太短，略過：Ballet_12.csv（只有 9 幀）
⚠️ 檔案太短，略過：Ballet_15.csv（只有 3 幀）
⚠️ 檔案太短，略過：TaiwaneseAboriginalDance_237.csv（只有 4 幀）
總資料 shape: (271178, 99)


0,1,2
,copy,True
,with_mean,True
,with_std,True


In [5]:
# --- Step 3: Dataset ---
class PoseDataset(Dataset):
    def __init__(self, data, seq_len=10):
        self.data = torch.tensor(data, dtype=torch.float32)
        self.seq_len = seq_len
    def __len__(self):
        return max(0, len(self.data) - self.seq_len)
    def __getitem__(self, idx):
        seq = self.data[idx:idx+self.seq_len]
        return seq, seq

In [6]:
# --- Step 4: LSTM Autoencoder ---
class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim=99, hidden_dim=128, latent_dim=32):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc_enc = nn.Linear(hidden_dim, latent_dim)
        self.fc_dec = nn.Linear(latent_dim, hidden_dim)
        self.decoder = nn.LSTM(hidden_dim, input_dim, batch_first=True)
    def forward(self, x):
        enc_out, (h, c) = self.encoder(x)
        latent = self.fc_enc(enc_out[:, -1, :])
        dec_input = self.fc_dec(latent).unsqueeze(1).repeat(1, x.size(1), 1)
        dec_out, _ = self.decoder(dec_input)
        return dec_out, latent

In [7]:
# --- Step 5: 訓練模型 ---
device = "cuda" if torch.cuda.is_available() else "cpu"
model = LSTMAutoencoder().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

In [8]:
seq_len = 10
dataset = PoseDataset(all_data, seq_len)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [9]:
epochs = 30
for epoch in range(epochs):
    total_loss = 0
    for batch, _ in dataloader:
        batch = batch.to(device)
        optimizer.zero_grad()
        recon, latent = model(batch)
        loss = loss_fn(recon, batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}")

Epoch 1/30, Loss: 0.0312
Epoch 2/30, Loss: 0.0238
Epoch 3/30, Loss: 0.0212
Epoch 4/30, Loss: 0.0188
Epoch 5/30, Loss: 0.0177
Epoch 6/30, Loss: 0.0171
Epoch 7/30, Loss: 0.0168
Epoch 8/30, Loss: 0.0165
Epoch 9/30, Loss: 0.0163
Epoch 10/30, Loss: 0.0160
Epoch 11/30, Loss: 0.0157
Epoch 12/30, Loss: 0.0155
Epoch 13/30, Loss: 0.0154
Epoch 14/30, Loss: 0.0153
Epoch 15/30, Loss: 0.0153
Epoch 16/30, Loss: 0.0152
Epoch 17/30, Loss: 0.0151
Epoch 18/30, Loss: 0.0150
Epoch 19/30, Loss: 0.0154
Epoch 20/30, Loss: 0.0166
Epoch 21/30, Loss: 0.0154
Epoch 22/30, Loss: 0.0149
Epoch 23/30, Loss: 0.0150
Epoch 24/30, Loss: 0.0146
Epoch 25/30, Loss: 0.0146
Epoch 26/30, Loss: 0.0149
Epoch 27/30, Loss: 0.0146
Epoch 28/30, Loss: 0.0144
Epoch 29/30, Loss: 0.0143
Epoch 30/30, Loss: 0.0142


In [10]:
# --- Step 6: 為每個 CSV 生成符號 ---
model.eval()
symbol_dict = {}

for path, data in file_data.items():
    if len(data) < seq_len:
        continue
    data = scaler.transform(data)
    ds = PoseDataset(data, seq_len)
    if len(ds) == 0:
        print(f"⚠️ {os.path.basename(path)} 長度不足，跳過")
        continue
    dl = DataLoader(ds, batch_size=64, shuffle=False)

    latents = []
    with torch.no_grad():
        for batch, _ in dl:
            batch = batch.to(device)
            _, latent = model(batch)
            latents.append(latent.cpu().numpy())

    latents = np.concatenate(latents, axis=0)
    print(f"{os.path.basename(path)} latent 數量:", len(latents))

    num_symbols = min(26, len(latents))
    if num_symbols < 2:
        print(f"⚠️ {os.path.basename(path)} 樣本太少，跳過此檔案")
        continue

    kmeans = KMeans(n_clusters=num_symbols, random_state=42, n_init=10)
    labels = kmeans.fit_predict(latents)
    symbols = [chr(65 + l) for l in labels]
    symbol_seq = "".join(symbols)
    print(f"{os.path.basename(path)} → 符號長度 {len(symbol_seq)}，前50個：{symbol_seq[:50]}")

    # 🔹 加入到字典中
    symbol_dict[os.path.basename(path)] = symbol_seq

Ballet_1.csv latent 數量: 231
Ballet_1.csv → 符號長度 231，前50個：QNEJAPGTRXCVOKHVLOYXYMPOYBLXOMPOYBYXMPILPQKJMFDWBX
Ballet_10.csv latent 數量: 60
Ballet_10.csv → 符號長度 60，前50個：MCMCWMEYRPUXSLNZDVFHQJGKIOMEEETTETEETBEBBABABABABA
Ballet_11.csv latent 數量: 822
Ballet_11.csv → 符號長度 822，前50個：MBLXGXNLNBMYMLHBBBBBXBGLHJDNINBBBBLXPXOZTYUCANVNBN
Ballet_13.csv latent 數量: 8
Ballet_13.csv → 符號長度 8，前50個：HBAEGDFC
Ballet_14.csv latent 數量: 1157
Ballet_14.csv → 符號長度 1157，前50個：BQJJKJYBBBBBBBBBBBBBKQBCXQBBBBBZXBBBYJBQYJXJJJJKEY
Ballet_16.csv latent 數量: 38
Ballet_16.csv → 符號長度 38，前50個：BYYYBHBYYYYYVHQHZXCKRDWDFOMILJAPTUSEGN
Ballet_17.csv latent 數量: 471
Ballet_17.csv → 符號長度 471，前50個：WVVBPETXGQUCYMHOSFNDLWVBJAZBPETXGQUCYMHOSISSNDLDLS
Ballet_18.csv latent 數量: 51
Ballet_18.csv → 符號長度 51，前50個：HMMHMMHHMHCYBHHCYMZMHHXMQSWPOJVTIUKRHCMBCYBAGMEDFN
Ballet_19.csv latent 數量: 219
Ballet_19.csv → 符號長度 219，前50個：GBBBFFVVWRHHGTFVFWRVVWQYYVVDRUHNTZNXHACBTPARWRWTWN
Ballet_2.csv latent 數量: 5
Ballet_2.csv → 符號長度 5，前50個：CAEDB
Ballet_20.csv l

In [11]:
# --- Step 7: 儲存 JSON ---
with open("symbol_sequences08.json", "w", encoding="utf-8") as f:
    json.dump(symbol_dict, f, indent=2, ensure_ascii=False)

print(f"✅ 已儲存 symbol_sequences08.json，共 {len(symbol_dict)} 段舞")

✅ 已儲存 symbol_sequences08.json，共 246 段舞
