In [1]:
import pandas as pd
import numpy as np
from numpy.fft import fft
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from scipy.interpolate import interp1d
from scipy.signal import resample
from scipy.stats import skew, kurtosis, entropy, uniform
from scipy.signal import find_peaks, stft
from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score
from skorch import NeuralNetClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from collections import Counter

In [2]:
# 1. Data loading and preprocessing

# Load the gyro recording and parse RegisterDate into datetime values.
gyro_data = pd.read_csv('./원본 데이터/자이로 데이터.csv')
gyro_data['RegisterDate'] = pd.to_datetime(gyro_data['RegisterDate'])

# Load the UCI-HAR body-gyro training signals (whitespace separated, no header).
# The separator is a raw string: '\s' inside a normal string literal is an
# invalid escape sequence and triggers a SyntaxWarning on recent Python versions.
uci_har_path = './원본 데이터/UCI HAR Dataset/'
gyro_x_train = pd.read_csv(uci_har_path + 'train/Inertial Signals/body_gyro_x_train.txt', sep=r'\s+', header=None).values
gyro_y_train = pd.read_csv(uci_har_path + 'train/Inertial Signals/body_gyro_y_train.txt', sep=r'\s+', header=None).values
gyro_z_train = pd.read_csv(uci_har_path + 'train/Inertial Signals/body_gyro_z_train.txt', sep=r'\s+', header=None).values

# Collapse every row of each signal file to its mean and combine the three
# axes into one DataFrame (one row per input row, columns X / Y / Z).
uci_har_gyro_df = pd.DataFrame({
    'X': np.mean(gyro_x_train, axis=1),
    'Y': np.mean(gyro_y_train, axis=1),
    'Z': np.mean(gyro_z_train, axis=1)
})

In [3]:
# 2. Frequency upsampling (UCI-HAR 50 Hz -> 100 Hz, the gyro sampling rate)
current_freq = 50   # sampling rate of the UCI-HAR data (Hz)
desired_freq = 100  # sampling rate of the gyro data (Hz)

# Linear interpolation onto a grid with desired_freq / current_freq times as
# many points (2x for 50 -> 100 Hz) spanning the same time range.
n_samples_uci = len(uci_har_gyro_df)
n_samples_upsampled = n_samples_uci * desired_freq // current_freq
t_current = np.linspace(0, n_samples_uci / current_freq, num=n_samples_uci)
t_new = np.linspace(0, n_samples_uci / current_freq, num=n_samples_upsampled)

# Store the upsampled signal in a new DataFrame.
uci_har_gyro_df_upsampled = pd.DataFrame({
    axis_name: np.interp(t_new, t_current, uci_har_gyro_df[axis_name])
    for axis_name in ('X', 'Y', 'Z')
})

# Slice the gyro data to the length of the (non-upsampled) UCI-HAR data.
gyro_sliced = gyro_data.iloc[:n_samples_uci].copy()

# Min-max normalise each dataset independently.
# The upsampled frame is the one the feature cells consume, so it has to be
# normalised as well; previously only `uci_har_gyro_df` was scaled, leaving the
# training signal raw while the gyro signal was scaled to [0, 1].
scaler = MinMaxScaler()
gyro_sliced[['X', 'Y', 'Z']] = scaler.fit_transform(gyro_sliced[['X', 'Y', 'Z']])
uci_har_gyro_df_upsampled[['X', 'Y', 'Z']] = scaler.fit_transform(uci_har_gyro_df_upsampled[['X', 'Y', 'Z']])
uci_har_gyro_df[['X', 'Y', 'Z']] = scaler.fit_transform(uci_har_gyro_df[['X', 'Y', 'Z']])

In [4]:
def rms(values):
    """Return the root-mean-square of ``values``.

    Parameters
    ----------
    values : array-like
        Numeric samples (list, NumPy array, pandas Series, ...).

    Returns
    -------
    numpy.float64
        ``sqrt(mean(values ** 2))``.
    """
    # Cast to float first: this accepts plain Python lists and prevents the
    # square from overflowing when the input has a narrow integer dtype.
    samples = np.asarray(values, dtype=float)
    return np.sqrt(np.mean(np.square(samples)))

def calc_entropy(values):
    """Return the Shannon entropy of a 30-bin histogram of ``values``.

    The histogram is computed as a density, a small constant (1e-6) is added
    to every bin, and the result is passed to ``scipy.stats.entropy``.
    """
    density, _bin_edges = np.histogram(values, bins=30, density=True)
    smoothed_density = density + 1e-6
    return entropy(smoothed_density)

def fft_features(values, n=10):
    """Return the mean magnitude of the first ``n`` FFT bins of ``values``.

    The slice starts at bin 0, so the DC component is part of the average.
    """
    spectrum_magnitude = np.abs(fft(values))
    leading_bins = spectrum_magnitude[:n]
    return leading_bins.mean()

def stft_features(values, n=10):
    """Return the mean magnitude over all STFT coefficients of ``values``.

    ``n`` is accepted for signature parity with ``fft_features`` but is not
    used by the computation.
    """
    _freqs, _times, spectrogram = stft(values)
    magnitudes = np.abs(spectrogram).reshape(-1)  # 1-D view of every coefficient
    return np.mean(magnitudes)

def calculate_features(df, axis):
    """Add RMS, skewness, kurtosis, entropy and peak-flag columns for one axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the signal column ``axis``. It is modified in place.
    axis : str
        Name of the signal column (e.g. ``'X'``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with these columns added: ``rms_<axis>``,
        ``skew_<axis>``, ``kurtosis_<axis>``, ``entropy_<axis>`` (one scalar
        each, broadcast to every row) and ``peaks_<axis>`` (1 where
        ``find_peaks(..., height=0)`` reports a peak, otherwise 0).
    """
    signal = df[axis]
    df[f'rms_{axis}'] = rms(signal)
    df[f'skew_{axis}'] = skew(signal)
    df[f'kurtosis_{axis}'] = kurtosis(signal)
    df[f'entropy_{axis}'] = calc_entropy(signal)

    # find_peaks returns *positions*. Build the flag column positionally rather
    # than through df.loc, which would interpret the positions as index labels
    # and mark the wrong rows (or fail) when the index is not 0..n-1.
    peak_positions, _ = find_peaks(signal.to_numpy(), height=0)
    peak_flags = np.zeros(len(df), dtype=np.int64)
    peak_flags[peak_positions] = 1
    df[f'peaks_{axis}'] = peak_flags

    return df

# Compute the per-axis statistical features for both datasets
# (upsampled UCI-HAR frame first, then the sliced gyro frame).
for axis in ('X', 'Y', 'Z'):
    uci_har_gyro_df_upsampled, gyro_sliced = (
        calculate_features(uci_har_gyro_df_upsampled, axis),
        calculate_features(gyro_sliced, axis),
    )

def calculate_fft_stft(df, axis, n=10):
    """Add ``fft_<axis>`` and ``stft_<axis>`` columns to ``df`` and return it.

    Both values are scalars computed from the whole ``axis`` column
    (``fft_features`` with the given ``n``, ``stft_features`` with its
    default), so each new column holds one value repeated on every row.
    ``df`` is modified in place.
    """
    signal = df[axis].values
    spectral_columns = {
        f'fft_{axis}': fft_features(signal, n=n),
        f'stft_{axis}': stft_features(signal),
    }
    for column_name, value in spectral_columns.items():
        df[column_name] = value
    return df

# Compute the FFT and STFT summary features per axis for both datasets
# (upsampled UCI-HAR frame first, then the sliced gyro frame).
for axis in ('X', 'Y', 'Z'):
    uci_har_gyro_df_upsampled, gyro_sliced = (
        calculate_fft_stft(uci_har_gyro_df_upsampled, axis),
        calculate_fft_stft(gyro_sliced, axis),
    )

# Min-max normalise the FFT / STFT feature columns of each dataset.
# NOTE(review): calculate_fft_stft assigns a single scalar to each of these
# columns, so every column is constant within a dataset; min-max scaling a
# constant column presumably leaves no usable signal. Consider windowed
# features instead - confirm intent.
scaler = MinMaxScaler()
columns_to_scale = ['fft_X', 'fft_Y', 'fft_Z', 'stft_X', 'stft_Y', 'stft_Z']
uci_har_gyro_df_upsampled[columns_to_scale] = scaler.fit_transform(uci_har_gyro_df_upsampled[columns_to_scale])
gyro_sliced[columns_to_scale] = scaler.fit_transform(gyro_sliced[columns_to_scale])

# Final feature set. One shared column list keeps the training and inference
# matrices in the same column order (previously two hand-copied column stacks).
feature_columns = ['X', 'Y', 'Z'] + [
    f'{feature_name}_{axis_name}'
    for feature_name in ('rms', 'skew', 'entropy', 'fft', 'peaks', 'stft')
    for axis_name in ('X', 'Y', 'Z')
]
X_train_features = uci_har_gyro_df_upsampled[feature_columns].to_numpy()
X_gyro_features = gyro_sliced[feature_columns].to_numpy()

# Labels (the UCI-HAR training set ships with activity labels).
y_train = pd.read_csv(uci_har_path + 'train/y_train.txt', header=None).values.flatten()

# Repeat every label so that the label vector matches the upsampled rows.
# The factor is derived from the data instead of being hard-coded to 2.
repeat_factor, leftover_rows = divmod(len(X_train_features), len(y_train))
assert repeat_factor >= 1 and leftover_rows == 0, (
    "upsampled row count must be a whole multiple of the label count"
)
y_train_upsampled = np.repeat(y_train, repeat_factor)

# Verify that features and labels line up.
assert len(X_train_features) == len(y_train_upsampled)
print(f"X_train_features shape: {X_train_features.shape}")
print(f"y_train_upsampled shape: {y_train_upsampled.shape}")

X_train_features shape: (14704, 21)
y_train_upsampled shape: (14704,)


In [5]:
# Split into training and validation sets.
# NOTE: this cell overwrites X_train_features / y_train_upsampled, so it must
# be run exactly once after the feature cell (re-run the feature cell first).
X_train_features, X_val, y_train_upsampled, y_val = train_test_split(
    X_train_features,
    y_train_upsampled,
    test_size=0.2,
    random_state=42,
)

# Add a sequence axis: (batch_size, sequence_length=1, input_size).
X_train_features = X_train_features.reshape(X_train_features.shape[0], 1, X_train_features.shape[1])
X_val = X_val.reshape(X_val.shape[0], 1, X_val.shape[1])

# Labels need no extra axis. They are kept in their original 1-based form
# here because a later cell shifts y_train / y_val itself.
y_train = y_train_upsampled.astype(np.int64)
y_val = y_val.astype(np.int64)

# Datasets and loaders for the manual PyTorch training loop.
# nn.CrossEntropyLoss expects class indices in [0, num_classes - 1], so the
# tensors store the labels shifted by one (the labels start at 1); feeding the
# raw labels makes the highest class an out-of-range target.
train_dataset = TensorDataset(
    torch.tensor(X_train_features, dtype=torch.float32),
    torch.tensor(y_train - 1, dtype=torch.long),
)
val_dataset = TensorDataset(
    torch.tensor(X_val, dtype=torch.float32),
    torch.tensor(y_val - 1, dtype=torch.long),
)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

In [6]:
# Model definitions (LSTM, GRU, CNN-LSTM, BiLSTM, Transformer)
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear classifier on the last time step.

    Input:  tensor of shape (batch, seq_len, input_size).
    Output: logits of shape (batch, num_classes).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        # nn.LSTM starts from all-zero hidden and cell states when none are
        # passed, which is exactly the initial state used by this model.
        sequence_out, _final_state = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

class GRUModel(nn.Module):
    """Stacked GRU followed by a linear classifier on the last time step.

    Input:  tensor of shape (batch, seq_len, input_size).
    Output: logits of shape (batch, num_classes).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        # nn.GRU starts from an all-zero hidden state when none is passed,
        # which is exactly the initial state used by this model.
        sequence_out, _final_hidden = self.gru(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

class CNNLSTMModel(nn.Module):
    """1-D convolution over time followed by an LSTM and a linear classifier.

    Input:  tensor of shape (batch, seq_len, input_size); a 2-D tensor of
            shape (batch, input_size) is treated as a sequence of length 1.
    Output: logits of shape (batch, num_classes).

    The previous forward pass unsqueezed the input to four dimensions and then
    called a three-axis ``permute``, which fails for (batch, seq_len,
    input_size) input; the convolution also hard-coded ``in_channels=1`` and
    ignored ``input_size``. The convolution now takes the feature axis as its
    channels and slides along the time axis.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.conv = nn.Conv1d(in_channels=input_size, out_channels=64, kernel_size=3, padding=1)
        self.lstm = nn.LSTM(64, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        if x.dim() == 2:
            # (batch, input_size) -> (batch, 1, input_size)
            x = x.unsqueeze(1)
        # Conv1d expects (batch, channels, length): features become channels.
        conv_out = self.conv(x.permute(0, 2, 1))
        # Back to (batch, seq_len, 64) for the batch-first LSTM.
        out, _ = self.lstm(conv_out.permute(0, 2, 1))
        return self.fc(out[:, -1, :])

class BiLSTMModel(nn.Module):
    """Bidirectional stacked LSTM followed by a linear classifier.

    Input:  tensor of shape (batch, seq_len, input_size).
    Output: logits of shape (batch, num_classes).

    The classifier input is the concatenation of the top layer's final forward
    and final backward hidden states. Reading ``out[:, -1, :]`` instead would
    take the backward direction at the last time step, where it has processed
    only one element. For ``seq_len == 1`` both choices are identical.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        # Zero initial states are nn.LSTM's default, so none are passed.
        _, (h_n, _c_n) = self.bilstm(x)
        # h_n is (num_layers * 2, batch, hidden); the last two entries belong
        # to the top layer: forward direction, then backward direction.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(summary)

class TransformerModel(nn.Module):
    """Linear embedding, Transformer encoder stack and a linear classifier.

    Input:  tensor of shape (batch, seq_len, input_size).
    Output: logits of shape (batch, num_classes), read from the last time step.

    ``hidden_size`` must be divisible by ``num_heads``.

    An encoder-only stack is used: ``nn.Transformer`` is an encoder-decoder
    whose ``forward`` requires both ``src`` and ``tgt``, so calling it with a
    single tensor raised a TypeError.
    """

    def __init__(self, input_size, hidden_size, num_heads, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Linear(input_size, hidden_size)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        encoded = self.transformer(embedded)
        return self.fc(encoded[:, -1, :])

In [None]:
# Number of distinct classes in the training labels.
num_classes = len(np.unique(y_train))

# Shift the labels so that they start at 0. Using the observed minimum keeps
# the cell idempotent: on a re-run the labels already start at 0 and the shift
# is a no-op instead of producing negative labels.
label_offset = int(y_train.min())
y_train = y_train - label_offset
y_val = y_val - label_offset

input_size = X_train_features.shape[2]

# Candidate models (these instances are also used by the manual training loop).
models = [
    LSTMModel(input_size=input_size, hidden_size=128, num_layers=2, num_classes=num_classes),
    GRUModel(input_size=input_size, hidden_size=128, num_layers=2, num_classes=num_classes),
    CNNLSTMModel(input_size=input_size, hidden_size=128, num_layers=2, num_classes=num_classes),
    BiLSTMModel(input_size=input_size, hidden_size=128, num_layers=2, num_classes=num_classes),
    TransformerModel(input_size=input_size, hidden_size=128, num_heads=4, num_layers=2, num_classes=num_classes)
]

# Search space for RandomizedSearchCV.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
params = {
    'lr': uniform(0.0001, 0.01),
    'module__hidden_size': [64, 128, 256],
    'module__num_layers': [1, 2, 3],
    'batch_size': [16, 32, 64]
}

# NumPy inputs for skorch / scikit-learn (training and validation).
X_train_np = X_train_features.astype(np.float32)
y_train_np = y_train.astype(np.int64)
X_val_np = X_val.astype(np.float32)
y_val_np = y_val.astype(np.int64)

# Collected results.
best_params_per_model = {}
best_scores_per_model = {}

# Run RandomizedSearchCV for every model.
for model in models:
    model_name = model.__class__.__name__
    print(f"Optimizing model: {model_name}")

    # Hand skorch the module *class* plus a complete set of constructor
    # arguments, so a fresh module can be built for every fit. The Transformer
    # additionally requires num_heads, which the search space never supplies.
    module_kwargs = {
        'module__input_size': input_size,
        'module__hidden_size': 128,
        'module__num_layers': 2,
        'module__num_classes': num_classes,
    }
    if isinstance(model, TransformerModel):
        module_kwargs['module__num_heads'] = 4

    net = NeuralNetClassifier(
        module=type(model),
        criterion=nn.CrossEntropyLoss,
        optimizer=optim.Adam,
        max_epochs=20,
        iterator_train__shuffle=True,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        verbose=1,  # log every epoch
        **module_kwargs,
    )

    rs = RandomizedSearchCV(
        net,
        params,
        refit=True,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        scoring='accuracy',
        n_iter=10,
        verbose=2,
        random_state=42
    )

    rs.fit(X_train_np, y_train_np)

    # Store the best hyper-parameters and score.
    best_params_per_model[model_name] = rs.best_params_
    best_scores_per_model[model_name] = rs.best_score_

    print(f"Best parameters for {model_name}: {rs.best_params_}")
    print(f"Best cross-validation accuracy for {model_name}: {rs.best_score_:.4f}")

    # Score the refitted best estimator. `net` itself is never fitted (the
    # search fits clones), and the validation arrays used to be undefined.
    best_net = rs.best_estimator_
    train_acc = best_net.score(X_train_np, y_train_np)
    val_acc = best_net.score(X_val_np, y_val_np)

    print(f"Training Accuracy: {train_acc:.4f}")
    print(f"Validation Accuracy: {val_acc:.4f}")

# Final summary.
print("Best parameters for each model:")
print(best_params_per_model)
print("Best cross-validation accuracy for each model:")
print(best_scores_per_model)

Optimizing model: LSTMModel
Fitting 5 folds for each of 10 candidates, totalling 50 fits
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m1.7012[0m       [32m0.3438[0m        [35m1.4605[0m  0.6054
      2        [36m1.4610[0m       0.3379        [35m1.4598[0m  0.5545
      3        [36m1.4249[0m       0.3294        [35m1.4317[0m  0.5565
      4        [36m1.4173[0m       [32m0.3512[0m        [35m1.3877[0m  0.5575
      5        [36m1.3935[0m       [32m0.3634[0m        [35m1.3852[0m  0.5565
      6        [36m1.3594[0m       [32m0.3858[0m        [35m1.3443[0m  0.5585
      7        [36m1.3412[0m       [32m0.3996[0m        1.3595  0.5555
      8        [36m1.3202[0m       0.3889        1.4124  0.5565
      9        [36m1.2975[0m       [32m0.4230[0m        [35m1.2715[0m  0.5545
     10        [36m1.2837[0m       [32m0.4299[0m        [35m1.2548[0m  0.5555
  

In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``; train if ``optimizer`` is given, else evaluate.

    Returns ``(mean_batch_loss, accuracy_percent, true_labels, predicted_labels)``
    where the two label collections are plain Python lists.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    true_labels, predicted_labels = [], []
    with torch.set_grad_enabled(is_training):
        for features, labels in loader:
            features, labels = features.to(device), labels.to(device)
            if is_training:
                optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            true_labels.extend(labels.tolist())
            predicted_labels.extend(predicted.tolist())

    n_batches = len(loader)
    mean_loss = loss_sum / n_batches if n_batches else 0.0
    n_correct = sum(p == t for p, t in zip(predicted_labels, true_labels))
    accuracy_pct = 100 * n_correct / len(true_labels) if true_labels else 0.0
    return mean_loss, accuracy_pct, true_labels, predicted_labels


def train_and_evaluate_model(model, train_loader, val_loader, num_epochs, learning_rate):
    """Train ``model`` with Adam + cross-entropy and evaluate it after every epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Classifier returning logits of shape (batch, num_classes).
    train_loader, val_loader : iterable of (features, labels) batches
        Labels must be class indices accepted by ``nn.CrossEntropyLoss``.
    num_epochs : int
        Number of training epochs.
    learning_rate : float
        Adam learning rate.

    Returns
    -------
    tuple
        ``(accuracy, f1, train_losses, val_losses, train_accuracies,
        val_accuracies)``. ``accuracy`` (a fraction) and the weighted ``f1``
        are computed on the validation predictions of the final epoch; the
        per-epoch accuracy lists are in percent.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # Batches are moved to wherever the model's parameters live.
    device = next(model.parameters()).device

    train_losses, val_losses = [], []
    # Fixed: `a, b = []` raised "not enough values to unpack".
    train_accuracies, val_accuracies = [], []
    val_true, val_pred = [], []

    for epoch in range(num_epochs):
        train_loss, train_acc, _, _ = _run_epoch(model, train_loader, criterion, device, optimizer)
        val_loss, val_acc, val_true, val_pred = _run_epoch(model, val_loader, criterion, device)

        train_losses.append(train_loss)
        train_accuracies.append(train_acc)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)

        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')

    if not val_losses:
        # No epoch ran (num_epochs <= 0): still evaluate the untrained model.
        _, _, val_true, val_pred = _run_epoch(model, val_loader, criterion, device)

    # Final metrics need the label arrays, not the scalar correct/total
    # counters that were passed before.
    accuracy = accuracy_score(val_true, val_pred)
    f1 = f1_score(val_true, val_pred, average='weighted')

    return accuracy, f1, train_losses, val_losses, train_accuracies, val_accuracies

In [None]:
# Learning-curve visualisation
def plot_learning_curves(train_losses, val_losses, train_accuracies, val_accuracies):
    """Plot loss (left) and accuracy (right) per epoch for training and validation.

    All four arguments are per-epoch sequences of equal length; the x axis
    runs from 1 to ``len(train_losses)``.
    """
    epochs = range(1, len(train_losses) + 1)

    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: loss
    loss_ax.plot(epochs, train_losses, label='Train Loss')
    loss_ax.plot(epochs, val_losses, label='Validation Loss')
    loss_ax.set_title('Loss Over Epochs')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    # Right panel: accuracy
    acc_ax.plot(epochs, train_accuracies, label='Train Accuracy')
    acc_ax.plot(epochs, val_accuracies, label='Validation Accuracy')
    acc_ax.set_title('Accuracy Over Epochs')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy (%)')
    acc_ax.legend()

    fig.tight_layout()
    plt.show()

In [None]:
# Weighted ensemble prediction
def weighted_ensemble_predict(models, features, model_f1_scores):
    """Combine the models' hard predictions by weighted voting.

    Every model is switched to eval mode and predicts a class per sample
    (index of its largest output). Each vote counts with the model's weight
    ``model_f1_scores[i]``; the class with the largest weight sum wins, and on
    a tie the class voted for by the earliest model in ``models`` wins.

    Returns a 1-D integer NumPy array with one label per sample.
    """
    per_model_labels = []
    vote_weights = []
    for position, net in enumerate(models):
        net.eval()
        with torch.no_grad():
            logits = net(features)
            winners = torch.max(logits.data, 1)[1]
            per_model_labels.append(winners.cpu().numpy())
            vote_weights.append(model_f1_scores[position])

    model_predictions = np.array(per_model_labels)
    weights = np.array(vote_weights)

    n_samples = model_predictions.shape[1]
    ensemble_labels = np.zeros(n_samples, dtype=int)
    for sample_idx in range(n_samples):
        tally = Counter()
        for label, weight in zip(model_predictions[:, sample_idx], weights):
            tally[label] += weight
        # First entry with the largest weight sum (insertion order breaks ties).
        best_label, _best_weight = max(tally.items(), key=lambda item: item[1])
        ensemble_labels[sample_idx] = best_label

    return ensemble_labels

In [None]:


# Training settings for the manual training loop. These names were previously
# undefined, so the cell failed with a NameError.
# NOTE(review): the values are assumptions (20 epochs mirrors max_epochs of the
# hyper-parameter search, 1e-3 is a common Adam step size) - adjust as needed.
num_epochs = 20
learning_rate = 1e-3

# Train every candidate model, record its validation accuracy / weighted F1
# and plot its learning curves.
results = {}
for model in models:
    accuracy, f1, train_losses, val_losses, train_accuracies, val_accuracies = train_and_evaluate_model(
        model, train_loader, val_loader, num_epochs=num_epochs, learning_rate=learning_rate
    )
    results[model.__class__.__name__] = {'accuracy': accuracy, 'f1': f1}
    plot_learning_curves(train_losses, val_losses, train_accuracies, val_accuracies)

# Keep the model with the highest weighted F1.
best_model_name = max(results, key=lambda name: results[name]['f1'])
best_model = next(
    candidate for candidate in models if candidate.__class__.__name__ == best_model_name
)

In [None]:
# 5. Saving and loading models
def save_model(model, filename):
    """Write ``model.state_dict()`` to ``filename``.

    The parent directory is created when it does not exist yet, so saving
    into a fresh output folder no longer fails.
    """
    import os  # local import keeps this block self-contained

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.save(model.state_dict(), filename)

def load_model(model, filename):
    """Load a saved state dict from ``filename`` into ``model`` and switch it to eval mode.

    The checkpoint is read onto the CPU first, so weights saved on a GPU can
    be restored on a machine without one; ``load_state_dict`` then copies the
    values into the model's existing parameters.

    Returns the same ``model`` instance for convenience (it used to return
    ``None``, so existing callers that ignore the result are unaffected).
    """
    state_dict = torch.load(filename, map_location='cpu')
    model.load_state_dict(state_dict)
    model.eval()
    return model

# Persist the weights of `best_model` (the model with the highest F1 in `results`).
save_model(best_model, './모델/best_model.pth')

In [None]:
# 6. Predict labels for the gyro data and attach them to the frame
gyro_tensor = torch.tensor(X_gyro_features, dtype=torch.float32)
if gyro_tensor.dim() == 2:
    # The models were fed (batch, sequence_length=1, input_size) tensors during
    # training, so the 2-D feature matrix needs the same sequence axis.
    gyro_tensor = gyro_tensor.unsqueeze(1)

# Vote weights: each model's validation F1 from `results`, in `models` order.
# (`model_f1_scores` was previously never defined.)
model_f1_scores = [results[candidate.__class__.__name__]['f1'] for candidate in models]

# NOTE(review): the predictions are the models' output indices; if training
# used labels shifted to start at 0, add the offset back to recover the
# original activity ids - confirm before publishing the file.
predicted_labels = weighted_ensemble_predict(models, gyro_tensor, model_f1_scores)
gyro_sliced['predicted_label'] = predicted_labels
gyro_sliced.to_csv('./원본 데이터/자이로 데이터_라벨링.csv', index=False)

In [None]:
# 7. Visualise the predictions

# Bar chart: how often each label was predicted.
fig, ax = plt.subplots(figsize=(10, 6))
gyro_sliced['predicted_label'].value_counts().sort_index().plot(kind='bar', ax=ax)
ax.set_title('Predicted Activity Distribution')
ax.set_xlabel('Activity Label')
ax.set_ylabel('Count')
plt.show()

# Time series of the predicted label.
# sort=True makes the integer codes follow the label order; without it the
# codes follow first appearance, so the y position carried no meaning.
gyro_sliced['predicted_label_numeric'] = gyro_sliced['predicted_label'].factorize(sort=True)[0]

fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(gyro_sliced['RegisterDate'], gyro_sliced['predicted_label_numeric'], label='Predicted Label')
ax.set_title('Activity Prediction Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Predicted Activity (Numeric)')
ax.tick_params(axis='x', rotation=45)
ax.legend()
plt.show()

print("자이로 데이터에 예측된 라벨을 추가한 결과 파일이 저장되었습니다.")