## 1. Data Loader

In [1]:
import os
import pandas as pd

# 연주 feature 및 점수 파일이 있는 폴더 경로
midi_feature_base_path = "midi_data/bach"

# 연주자 폴더 목록
performers = ["hyunsung", "jeo", "jihyun", "jinhee", "jungwook"]

# 청크 길이 정의 (100개 요소 = 10초)
segment_length = 100

# 데이터 로드 및 청크 분할
for performer in performers:
    # 각 연주자 폴더 경로
    feature_folder = os.path.join(midi_feature_base_path, performer)
    print(f"Checking files in performer folder: {feature_folder}")
    
    # 각 폴더에서 파일 읽기
    for file_name in os.listdir(feature_folder):
        # target 파일 제외 및 feature 파일 확인
        if "target" not in file_name and file_name.endswith(".csv") and not file_name.endswith("_score.csv"):
            # Feature 파일 경로
            feature_file_path = os.path.join(feature_folder, file_name)
            
            # 해당 Feature의 Score 파일명 생성
            base_name = file_name.replace(".csv", "")
            score_file_name = f"{base_name}_score.csv"
            score_file_path = os.path.join(feature_folder, score_file_name)

            if os.path.exists(score_file_path):
                print(f"\nProcessing {file_name} and {score_file_name}")
                
                # Feature와 Score 파일 읽기
                features = pd.read_csv(feature_file_path)
                scores = pd.read_csv(score_file_path).iloc[:, 1:].values.flatten().tolist()

                # 문제 해결: `note`와 `velocity` 값 검증 및 변환
                features["note"] = features["note"].apply(
                    lambda x: 0 if pd.isna(x) or x == "[]" else int(str(x).split(",")[0].replace("[", "").replace("]", "").strip())
                )
                features["velocity"] = features["velocity"].apply(
                    lambda x: 0 if pd.isna(x) or x == "[]" else int(str(x).split(",")[0].replace("[", "").replace("]", "").strip())
                )
                features["dynamic"] = features["dynamic"].fillna("unknown")
                features["accent"] = features["accent"].fillna(0).astype(int)
                features["pedal"] = features["pedal"].fillna(0).apply(
                    lambda x: int(str(x).split(",")[0].replace("[", "").replace("]", "").strip())
                )

                # MIDI 데이터를 10초씩 나누기 위한 문자열 생성
                midi_features = features.apply(
                    lambda row: (
                        f"time_{row['sec']:.1f}: note_{row['note']}_velocity_{row['velocity']}_"
                        f"dynamic_{row['dynamic']}_accent_{row['accent']}_pedal_{row['pedal']}"
                    ),
                    axis=1
                ).tolist()

                # 10초씩 나누기 (각 시퀀스는 100개의 요소)
                midi_segments = [
                    "; ".join(midi_features[i:i + segment_length]) 
                    for i in range(0, len(midi_features), segment_length)
                ]

                # 세그먼트와 스코어 길이 맞추기
                if len(midi_segments) > len(scores):
                    # 세그먼트가 스코어보다 많으면 잘라냄
                    midi_segments = midi_segments[:len(scores)]
                elif len(midi_segments) < len(scores):
                    # 세그먼트가 스코어보다 적으면 패딩 추가
                    padding = "; ".join([
                        f"time_{i * 0.1:.1f}: note_0_velocity_0_dynamic_unknown_accent_0_pedal_0"
                        for i in range(segment_length)
                    ])
                    while len(midi_segments) < len(scores):
                        midi_segments.append(padding)

                # 파일별 데이터 출력
                print(f"\n--- File: {file_name} ---")
                print(f"Number of segments: {len(midi_segments)}")
                print(f"Number of scores: {len(scores)}")
                print("\nSegments:")
                for idx, segment in enumerate(midi_segments[:5]):  # 상위 5개만 출력
                    print(f"  Segment {idx + 1}: {segment[:100]}...")  # 세그먼트 일부 출력
                print("\nScores:")
                print(scores[:5])  # 상위 5개 점수 출력
            else:
                print(f"Score file not found for: {file_name}")

Checking files in performer folder: midi_data/bach/hyunsung

Processing bach_pedal_bad.csv and bach_pedal_bad_score.csv

--- File: bach_pedal_bad.csv ---
Number of segments: 55
Number of scores: 55

Segments:
  Segment 1: time_0.0: note_0_velocity_0_dynamic_unknown_accent_0_pedal_0; time_0.1: note_0_velocity_0_dynamic_un...
  Segment 2: time_10.0: note_0_velocity_0_dynamic_unknown_accent_0_pedal_106; time_10.1: note_72_velocity_55_dyna...
  Segment 3: time_20.0: note_74_velocity_65_dynamic_mp_accent_0_pedal_0; time_20.1: note_0_velocity_0_dynamic_unk...
  Segment 4: time_30.0: note_0_velocity_0_dynamic_unknown_accent_0_pedal_0; time_30.1: note_59_velocity_53_dynami...
  Segment 5: time_40.0: note_0_velocity_0_dynamic_unknown_accent_0_pedal_0; time_40.1: note_65_velocity_58_dynami...

Scores:
[96.5, 81.0, 99.12, 100.0, 85.6]

Processing bach_dynamic_perfect1.csv and bach_dynamic_perfect1_score.csv

--- File: bach_dynamic_perfect1.csv ---
Number of segments: 55
Number of scores: 55

Segm


--- File: bach_note_perfect1.csv ---
Number of segments: 55
Number of scores: 55

Segments:
  Segment 1: time_0.0: note_60_velocity_74_dynamic_mp_accent_0_pedal_126; time_0.1: note_0_velocity_0_dynamic_unk...
  Segment 2: time_10.0: note_0_velocity_0_dynamic_unknown_accent_0_pedal_0; time_10.1: note_72_velocity_69_dynami...
  Segment 3: time_20.0: note_74_velocity_94_dynamic_f_accent_1_pedal_0; time_20.1: note_0_velocity_0_dynamic_unkn...
  Segment 4: time_30.0: note_0_velocity_0_dynamic_unknown_accent_0_pedal_126; time_30.1: note_59_velocity_54_dyna...
  Segment 5: time_40.0: note_0_velocity_0_dynamic_unknown_accent_0_pedal_0; time_40.1: note_65_velocity_52_dynami...

Scores:
[97.5, 81.0, 99.12, 44.0, 89.2]

Processing bach_dynamic_bad1.csv and bach_dynamic_bad1_score.csv

--- File: bach_dynamic_bad1.csv ---
Number of segments: 55
Number of scores: 55

Segments:
  Segment 1: time_0.0: note_60_velocity_87_dynamic_mf_accent_1_pedal_0; time_0.1: note_64_velocity_87_dynamic_mf_...
  Segm


--- File: bach_dynamic_perfect1.csv ---
Number of segments: 55
Number of scores: 55

Segments:
  Segment 1: time_0.0: note_60_velocity_69_dynamic_mp_accent_0_pedal_0; time_0.1: note_64_velocity_67_dynamic_mp_...
  Segment 2: time_9.5: note_0_velocity_0_dynamic_unknown_accent_0_pedal_64; time_9.6: note_64_velocity_62_dynamic...
  Segment 3: time_19.5: note_67_velocity_69_dynamic_mp_accent_0_pedal_0; time_19.6: note_0_velocity_0_dynamic_unk...
  Segment 4: time_30.0: note_0_velocity_0_dynamic_unknown_accent_0_pedal_0; time_30.1: note_0_velocity_0_dynamic_...
  Segment 5: time_40.0: note_53_velocity_64_dynamic_mp_accent_0_pedal_86; time_40.1: note_0_velocity_0_dynamic_un...

Scores:
[91.0, 90.0, 94.46, 76.0, 91.6]

Processing bach_dynamic_perfect2.csv and bach_dynamic_perfect2_score.csv

--- File: bach_dynamic_perfect2.csv ---
Number of segments: 55
Number of scores: 55

Segments:
  Segment 1: time_0.0: note_60_velocity_58_dynamic_p_accent_0_pedal_92; time_0.1: note_0_velocity_0_dynamic_


--- File: bach_dynamic_bad2.csv ---
Number of segments: 55
Number of scores: 55

Segments:
  Segment 1: time_0.0: note_0_velocity_0_dynamic_unknown_accent_0_pedal_0; time_0.1: note_60_velocity_90_dynamic_...
  Segment 2: time_10.0: note_0_velocity_0_dynamic_unknown_accent_0_pedal_0; time_10.1: note_72_velocity_72_dynami...
  Segment 3: time_20.0: note_74_velocity_58_dynamic_p_accent_0_pedal_0; time_20.1: note_0_velocity_0_dynamic_unkn...
  Segment 4: time_30.0: note_0_velocity_0_dynamic_unknown_accent_0_pedal_100; time_30.1: note_59_velocity_94_dyna...
  Segment 5: time_40.0: note_0_velocity_0_dynamic_unknown_accent_0_pedal_0; time_40.1: note_65_velocity_67_dynami...

Scores:
[98.5, 65.0, 99.07, 0.0, 89.2]

Processing bach_note_perfect1.csv and bach_note_perfect1_score.csv

--- File: bach_note_perfect1.csv ---
Number of segments: 55
Number of scores: 55

Segments:
  Segment 1: time_0.0: note_60_velocity_57_dynamic_p_accent_0_pedal_110; time_0.1: note_0_velocity_0_dynamic_unkn...
  Seg

In [7]:
import random
import pandas as pd
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.metrics import mean_squared_error

class BertForMultipleRegression(nn.Module):
    def __init__(self):
        super(BertForMultipleRegression, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.regressor = nn.Linear(self.bert.config.hidden_size, 5)  # 5 outputs for 5 scores

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        predictions = self.regressor(pooled_output)

        if labels is not None:
            loss_fn = nn.MSELoss()
            loss = loss_fn(predictions, labels)
            return loss, predictions
        return predictions

# Instantiate the model
model = BertForMultipleRegression()

In [12]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# 연주 feature 및 점수 파일이 있는 폴더 경로
midi_feature_base_path = "midi_data/bach"

# 연주자 폴더 목록
performers = ["hyunsung", "jeo", "jihyun", "jinhee", "jungwook"]

# 청크 길이 정의 (100개 요소 = 10초)
segment_length = 100

# 전체 데이터 저장
all_segments = []
all_scores = []

# 데이터 로드 및 청크 분할
for performer in performers:
    # 각 연주자 폴더 경로
    feature_folder = os.path.join(midi_feature_base_path, performer)
    print(f"Checking files in performer folder: {feature_folder}")
    
    # 각 폴더에서 파일 읽기
    for file_name in os.listdir(feature_folder):
        # target 파일 제외 및 feature 파일 확인
        if "target" not in file_name and file_name.endswith(".csv") and not file_name.endswith("_score.csv"):
            # Feature 파일 경로
            feature_file_path = os.path.join(feature_folder, file_name)
            
            # 해당 Feature의 Score 파일명 생성
            base_name = file_name.replace(".csv", "")
            score_file_name = f"{base_name}_score.csv"
            score_file_path = os.path.join(feature_folder, score_file_name)

            if os.path.exists(score_file_path):
                print(f"\nProcessing {file_name} and {score_file_name}")
                
                # Feature와 Score 파일 읽기
                features = pd.read_csv(feature_file_path)
                scores = pd.read_csv(score_file_path).iloc[:, 1:].values.flatten().tolist()

                # 문제 해결: `note`와 `velocity` 값 검증 및 변환
                features["note"] = features["note"].apply(
                    lambda x: 0 if pd.isna(x) or x == "[]" else int(str(x).split(",")[0].replace("[", "").replace("]", "").strip())
                )
                features["velocity"] = features["velocity"].apply(
                    lambda x: 0 if pd.isna(x) or x == "[]" else int(str(x).split(",")[0].replace("[", "").replace("]", "").strip())
                )
                features["dynamic"] = features["dynamic"].fillna("unknown")
                features["accent"] = features["accent"].fillna(0).astype(int)
                features["pedal"] = features["pedal"].fillna(0).apply(
                    lambda x: int(str(x).split(",")[0].replace("[", "").replace("]", "").strip())
                )

                # MIDI 데이터를 10초씩 나누기 위한 문자열 생성
                midi_features = features.apply(
                    lambda row: (
                        f"time_{row['sec']:.1f}: note_{row['note']}_velocity_{row['velocity']}_"
                        f"dynamic_{row['dynamic']}_accent_{row['accent']}_pedal_{row['pedal']}"
                    ),
                    axis=1
                ).tolist()

                # 10초씩 나누기 (각 시퀀스는 100개의 요소)
                midi_segments = [
                    "; ".join(midi_features[i:i + segment_length]) 
                    for i in range(0, len(midi_features), segment_length)
                ]

                # 세그먼트와 스코어 길이 맞추기
                if len(midi_segments) > len(scores):
                    # 세그먼트가 스코어보다 많으면 잘라냄
                    midi_segments = midi_segments[:len(scores)]
                elif len(midi_segments) < len(scores):
                    # 세그먼트가 스코어보다 적으면 패딩 추가
                    padding = "; ".join([
                        f"time_{i * 0.1:.1f}: note_0_velocity_0_dynamic_unknown_accent_0_pedal_0"
                        for i in range(segment_length)
                    ])
                    while len(midi_segments) < len(scores):
                        midi_segments.append(padding)

                # 전체 데이터 저장
                all_segments.extend(midi_segments)
                all_scores.extend(scores)
            else:
                print(f"Score file not found for: {file_name}")

# 데이터셋 분리 (8:2 비율로 나누기)
train_segments, test_segments, train_scores, test_scores = train_test_split(
    all_segments, all_scores, test_size=0.2, random_state=42
)

print(f"\nTotal Data: {len(all_segments)} segments")
print(f"Training Data: {len(train_segments)} segments")
print(f"Testing Data: {len(test_segments)} segments")

Checking files in performer folder: midi_data/bach/hyunsung

Processing bach_pedal_bad.csv and bach_pedal_bad_score.csv

Processing bach_dynamic_perfect1.csv and bach_dynamic_perfect1_score.csv

Processing bach_dynamic_perfect2.csv and bach_dynamic_perfect2_score.csv

Processing bach_pedal_perfect.csv and bach_pedal_perfect_score.csv

Processing bach_note_bad1.csv and bach_note_bad1_score.csv

Processing bach_note_bad2.csv and bach_note_bad2_score.csv

Processing bach_dynamic_bad2.csv and bach_dynamic_bad2_score.csv

Processing bach_note_perfect1.csv and bach_note_perfect1_score.csv

Processing bach_dynamic_bad1.csv and bach_dynamic_bad1_score.csv

Processing bach_note_perfect2.csv and bach_note_perfect2_score.csv
Checking files in performer folder: midi_data/bach/jeo

Processing bach_pedal_bad.csv and bach_pedal_bad_score.csv

Processing bach_dynamic_perfect1.csv and bach_dynamic_perfect1_score.csv

Processing bach_dynamic_perfect2.csv and bach_dynamic_perfect2_score.csv

Processing b

In [None]:
# 커스텀 Dataset 클래스 정의
class MidiDataset(Dataset):
    def __init__(self, segments, scores, tokenizer, max_length=512):
        self.segments = segments
        self.scores = scores
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.segments)

    def __getitem__(self, idx):
        segment = self.segments[idx]
        score = self.scores[idx]
        
        encoding = self.tokenizer(
            segment,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(score, dtype=torch.float)
        }

# BERT 모델 기반 회귀 모델 정의
class BertForMultipleRegression(nn.Module):
    def __init__(self):
        super(BertForMultipleRegression, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)  # 1개의 점수 예측

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        predictions = self.regressor(pooled_output).squeeze(-1)

        if labels is not None:
            loss_fn = nn.MSELoss()
            loss = loss_fn(predictions, labels)
            return loss, predictions
        return predictions

# 데이터 준비
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Dataset 생성
train_dataset = MidiDataset(train_segments, train_scores, tokenizer)
test_dataset = MidiDataset(test_segments, test_scores, tokenizer)

# 모델 초기화
model = BertForMultipleRegression()

# MSE 평가 함수 정의
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    mse = mean_squared_error(labels, predictions)
    return {"mse": mse}

# TrainingArguments 설정
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="mse"
)

# Trainer 객체 생성
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# 학습 실행
trainer.train()

# 평가
eval_results = trainer.evaluate()
print("\nEvaluation Results:", eval_results)



### Backup Code

In [9]:
import random
import pandas as pd
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.metrics import mean_squared_error

# 임의의 MIDI 데이터 생성 함수 (초 단위)
def generate_midi_data(seconds):
    data = []
    for i in range(seconds * 10):  # 초 x 10 (0.1초 간격)
        time = f"time_{i * 0.1:.1f}"
        notes = [f"note_{random.randint(40, 60)}_velocity_{random.randint(50, 70)}" for _ in range(random.randint(1, 3))]
        entry = f"{time}: " + ", ".join(notes)
        data.append(entry)
    return data  # 데이터 목록 반환

# 94초 MIDI 데이터 생성
midi_data_94s = generate_midi_data(94)

# 10초씩 나누기 (각 시퀀스는 100개의 요소, 즉 10초 데이터)
segment_length = 100
midi_segments = ["; ".join(midi_data_94s[i:i + segment_length]) for i in range(0, len(midi_data_94s), segment_length)]

# 마지막 시퀀스가 부족할 경우 패딩 추가
if len(midi_segments[-1].split('; ')) < segment_length:
    last_segment = midi_segments.pop()
    padding = "; ".join([f"time_{(len(last_segment.split('; ')) + i) * 0.1:.1f}: note_0_velocity_0" for i in range(segment_length - len(last_segment.split('; ')))])
    midi_segments.append(last_segment + "; " + padding)

# BERT tokenizer 초기화
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 점수 데이터 생성 및 정규화 (0에서 1 사이로)
scores = [[random.uniform(0, 100) for _ in range(3)] for _ in range(len(midi_segments))]
normalized_scores = [[score / 100 for score in s] for s in scores]  # 정규화

# 커스텀 BERT 모델 정의 (회귀를 위한 FFN 추가)
class BertForMultipleRegression(nn.Module):
    def __init__(self):
        super(BertForMultipleRegression, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.regressor = nn.Linear(self.bert.config.hidden_size, 3)  # 3개의 점수 예측

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        predictions = self.regressor(pooled_output)

        if labels is not None:
            loss_fn = nn.MSELoss()
            loss = loss_fn(predictions, labels)
            return loss, predictions
        return predictions

model = BertForMultipleRegression()

# PyTorch Dataset 클래스 정의
class MidiDataset(Dataset):
    def __init__(self, midi_segments, scores):
        self.midi_segments = midi_segments
        self.scores = scores

    def __len__(self):
        return len(self.midi_segments)

    def __getitem__(self, idx):
        midi_data = self.midi_segments[idx]
        score = self.scores[idx]
        inputs = tokenizer(midi_data, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
        item = {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": torch.tensor(score, dtype=torch.float)  # 다중 점수 예측을 위한 레이블
        }
        return item

# Dataset 생성
train_dataset = MidiDataset(midi_segments, normalized_scores)

# MSE 손실 계산 함수
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    mse = mean_squared_error(labels, preds)
    return {"mse": mse}

# training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=50,
    per_device_train_batch_size=2,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# 입력 데이터와 예측할 데이터 시각화
input_data = pd.DataFrame({
    'Segment': [f'Segment {i + 1}' for i in range(len(midi_segments))],
    'MIDI Data': midi_segments,
    'Scores': scores  # 원래 점수 표시
})

print("\nInput Data and Predicted Scores:")
print(input_data)

# 예측 결과 출력 (역정규화 포함)
for i, data in enumerate(midi_segments, start=1):
    inputs = tokenizer(data, return_tensors="pt", truncation=True, padding="max_length", max_length=100) # 512
    inputs = {key: value.to(training_args.device) for key, value in inputs.items() if key != "token_type_ids"}  # token_type_ids 제거
    with torch.no_grad():
        outputs = model(**inputs)  # 하나의 출력만 받음
        predicted_scores = outputs.squeeze().tolist()  # 직접 예측값 추출
        # 예측 결과를 원래 점수 범위(0-100)로 역정규화
        original_predicted_scores = [score * 100 for score in predicted_scores]
        print(f"Segment {i} Predicted Evaluation Scores: {original_predicted_scores}")

Step,Training Loss


KeyboardInterrupt: 