In [1]:
import pandas as pd
import numpy as np
from scipy.signal import find_peaks
from scipy.fft import fft
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# 데이터 불러오기
training_data = pd.read_csv('./원본 데이터/pml-training.csv')
testing_data = pd.read_csv('./원본 데이터/pml-testing.csv')

# 데이터 샘플링 (샘플 데이터의 크기와 컬럼 확인)
print(training_data.head())
print(training_data.columns)

   Unnamed: 0 user_name  raw_timestamp_part_1  raw_timestamp_part_2  \
0           1  carlitos            1323084231                788290   
1           2  carlitos            1323084231                808298   
2           3  carlitos            1323084231                820366   
3           4  carlitos            1323084232                120339   
4           5  carlitos            1323084232                196328   

     cvtd_timestamp new_window  num_window  roll_belt  pitch_belt  yaw_belt  \
0  05/12/2011 11:23         no          11       1.41        8.07     -94.4   
1  05/12/2011 11:23         no          11       1.41        8.07     -94.4   
2  05/12/2011 11:23         no          11       1.42        8.07     -94.4   
3  05/12/2011 11:23         no          12       1.48        8.05     -94.4   
4  05/12/2011 11:23         no          12       1.48        8.07     -94.4   

   ...  gyros_forearm_x gyros_forearm_y gyros_forearm_z accel_forearm_x  \
0  ...             0.03

  training_data = pd.read_csv('./원본 데이터/pml-training.csv')


In [3]:
print(training_data.describe())

         Unnamed: 0  raw_timestamp_part_1  raw_timestamp_part_2    num_window  \
count  19622.000000          1.962200e+04          19622.000000  19622.000000   
mean    9811.500000          1.322827e+09         500656.144277    430.640047   
std     5664.527827          2.049277e+05         288222.879958    247.909554   
min        1.000000          1.322490e+09            294.000000      1.000000   
25%     4906.250000          1.322673e+09         252912.250000    222.000000   
50%     9811.500000          1.322833e+09         496380.000000    424.000000   
75%    14716.750000          1.323084e+09         751890.750000    644.000000   
max    19622.000000          1.323095e+09         998801.000000    864.000000   

          roll_belt    pitch_belt      yaw_belt  total_accel_belt  \
count  19622.000000  19622.000000  19622.000000      19622.000000   
mean      64.407197      0.305283    -11.205061         11.312608   
std       62.750255     22.351242     95.193926          7.7423

In [4]:
print(f"Training data length: {len(training_data)}")
print(f"Testing data length: {len(testing_data)}")

Training data length: 19622
Testing data length: 20


## **특징 추출 함수 정의**

In [5]:
def extract_features(df, window_size=3, overlap=0.8, is_training=False):
    features = []
    labels = []
    step = int(window_size * (1 - overlap))

    for start in range(0, len(df) - window_size, step):
        end = start + window_size
        window = df.iloc[start:end]

        feature_vector = []

        for axis in ['gyros_forearm_x', 'gyros_forearm_y', 'gyros_forearm_z']:
            gyro_data = window[axis].values

            # 각속도 (1차 미분)
            gyro_diff = np.diff(gyro_data, axis=0)

            # 가속도 (2차 미분)
            gyro_acc = np.diff(gyro_diff, axis=0)

            # 피크 탐지
            peaks, _ = find_peaks(gyro_data)
            num_peaks = len(peaks)

            # 이동 평균
            moving_avg = window[axis].rolling(window=5).mean().mean()

            # 이동 표준 편차
            moving_std = window[axis].rolling(window=5).std().mean()

            # 변화율
            rate_of_change = (gyro_data[-1] - gyro_data[0]) / window_size

            # Fourier 변환 (주파수 영역 특징)
            fft_vals = fft(gyro_data)
            fft_mean = np.mean(np.abs(fft_vals))

            # Wavelet 변환, 에너지 등 추가 기능

            # 특징 벡터
            feature_vector.extend([
                gyro_diff.mean(), # 평균 속도
                gyro_acc.mean(),  # 평균 가속도
                num_peaks,        # 피크 개수
                moving_avg,       # 이동 평균
                moving_std,       # 이동 표준 편차
                rate_of_change,   # 변화율
                fft_mean          # Fourier 평균
            ])

        features.append(feature_vector)

        # 훈련 데이터일 경우 레이블을 추가
        if is_training:
            labels.append(window['classe'].iloc[-1])

    if is_training:
        return np.array(features), np.array(labels)
    else:
        return np.array(features)

# 훈련 데이터에서 특징 추출
X_train, y_train = extract_features(training_data, window_size=10, overlap=0.5, is_training=True)

# 테스트 데이터에서 특징 추출 (레이블 없이)
X_test = extract_features(testing_data, window_size=10, overlap=0.5, is_training=False)

# 표준화
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
print(f"testing data로부터 추출된 특징들 수: {X_test.shape}")

testing data로부터 추출된 특징들 수: (2, 21)


In [7]:
# 모델 초기화 및 훈련
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [8]:
# 테스트 데이터에 대한 예측
y_pred = model.predict(X_test)
print(f'Predicted classes: {y_pred}')

Predicted classes: ['E' 'E']


In [9]:
# 훈련 데이터에 대해 교차 검증 수행
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print(f"교차 검증 점수: {cv_scores}")
print(f"평균 교차 검증 점수: {cv_scores.mean():.2f}")

# 훈련 데이터로 예측 및 혼동 행렬 생성
y_train_pred = model.predict(X_train)
conf_matrix = confusion_matrix(y_train, y_train_pred)
print("Confusion Matrix:")
print(conf_matrix)

# 분류 보고서 생성
class_report = classification_report(y_train, y_train_pred)
print("Classification Report:")
print(class_report)

교차 검증 점수: [0.36050955 0.22802548 0.30318471 0.23469388 0.3494898 ]
평균 교차 검증 점수: 0.30
Confusion Matrix:
[[1115    0    0    0    0]
 [   0  759    0    0    0]
 [   0    0  684    0    0]
 [   0    0    0  644    0]
 [   0    0    0    0  721]]
Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      1115
           B       1.00      1.00      1.00       759
           C       1.00      1.00      1.00       684
           D       1.00      1.00      1.00       644
           E       1.00      1.00      1.00       721

    accuracy                           1.00      3923
   macro avg       1.00      1.00      1.00      3923
weighted avg       1.00      1.00      1.00      3923

