In [151]:
# 알고리즘 4개

#**스마트폰 센서 데이터 기반 모션 분류**
# 단계3 : 단계별 모델링


## 0.미션

단계별로 나눠서 모델링을 수행하고자 합니다.  

* 단계1 : 정적(0), 동적(1) 행동 분류 모델 생성
* 단계2 : 세부 동작에 대한 분류모델 생성
    * 단계1 모델에서 0으로 예측 -> 정적 행동 3가지 분류 모델링
    * 단계1 모델에서 1으로 예측 -> 동적 행동 3가지 분류 모델링 
* 모델 통합
    * 두 단계 모델을 통합하고, 새로운 데이터에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
* 성능 비교
    * 기본 모델링의 성능과 비교
    * 모든 모델링은 [다양한 알고리즘 + 성능 튜닝]을 수행해야 합니다.
    * 성능 가이드
            * Accuracy : 0.980~1.00


## 1.환경설정

* 세부 요구사항
    - 경로 설정 : 다음의 두가지 방법 중 하나를 선택하여 폴더를 준비하고 데이터를 로딩하시오.
        * 1) 로컬 수행(Ananconda)
            * 제공된 압축파일을 다운받아 압축을 풀고
            * anaconda의 root directory(보통 C:/Users/< ID > 에 project 폴더를 만들고, 복사해 넣습니다.
        * 2) 구글콜랩
            * 구글 드라이브 바로 밑에 project 폴더를 만들고, 
            * 데이터 파일을 복사해 넣습니다.
    
    - 기본적으로 필요한 라이브러리를 import 하도록 코드가 작성되어 있습니다. 
        * 필요하다고 판단되는 라이브러리를 추가하세요.


### (1) 경로 설정

#### 1) 로컬 수행(Anaconda)
* project 폴더에 필요한 파일들을 넣고, 본 파일을 열었다면, 별도 경로 지정이 필요하지 않습니다.

In [152]:
path = 'C:/Users/wl/Desktop/본수업 시작/미니프로젝트/3차 - 240404/1일차/'

#### 2) 구글 콜랩 수행

* 구글 드라이브 연결

In [153]:
# from google.colab import drive
# drive.mount('/content/drive')

In [154]:
# path = '/content/drive/MyDrive/project/'

### (2) 라이브러리 불러오기

#### 1) 라이브러리 로딩

In [155]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

warnings.filterwarnings(action='ignore')
%config InlineBackend.figure_format='retina'
# 필요하다고 판단되는 라이브러리를 추가하세요.

#### 2) 제공 함수 생성
* 변수 중요도를 시각화할 수 있는 함수를 제공합니다.
* 입력 : 
    * importance : 트리모델의 변수 중요도(예: model.feature_importances_)
    * names : 변수 이름 목록(예 : x_train.columns
    * result_only  : 변수 중요도 순으로 데이터프레임만 return할지, 그래프도 포함할지 결정. False이면 결과 데이터프레임 + 그래프
    * topn : 중요도 상위 n개만 표시. all 이면 전체.
* 출력 : 
    * 중요도 그래프 : 중요도 내림차순으로 정렬
    * 중요도 데이터프레임 : 중요도 내림차순으로 정렬

In [156]:
# 변수의 특성 중요도 계산하기
def plot_feature_importance(importance, names, result_only = False, topn = 'all'):
    feature_importance = np.array(importance)
    feature_name = np.array(names)

    data={'feature_name':feature_name,'feature_importance':feature_importance}
    fi_temp = pd.DataFrame(data)

    #변수의 특성 중요도 순으로 정렬하기
    fi_temp.sort_values(by=['feature_importance'], ascending=False,inplace=True)
    fi_temp.reset_index(drop=True, inplace = True)

    if topn == 'all' :
        fi_df = fi_temp.copy()
    else :
        fi_df = fi_temp.iloc[:topn]

    #변수의 특성 중요도 그래프로 그리기
    if result_only == False :
        plt.figure(figsize=(10,20))
        sns.barplot(x='feature_importance', y='feature_name', data = fi_df)

        plt.xlabel('importance')
        plt.ylabel('feature name')
        plt.grid()

    return fi_df

### (3) 데이터 불러오기

* 주어진 데이터셋
    * data01_train.csv : 학습 및 검증용
    * data01_test.csv : 테스트용
    * feature.csv : feature 이름을 계층구조로 정리한 데이터

* 세부 요구사항
    * 칼럼 삭제 : data01_train.csv와 data01_test.csv 에서 'subject' 칼럼은 불필요하므로 삭제합니다.

#### 1) 데이터로딩

In [157]:
df_train = pd.read_csv(path +'data01_train.csv')
df_test = pd.read_csv(path +'data01_test.csv')
df_feature = pd.read_csv(path +'features.csv')
df_train.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,21,STANDING
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,15,LAYING
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,11,STANDING
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,17,WALKING
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,17,WALKING_DOWNSTAIRS


#### 2) 기본 정보 조회

In [158]:
df_train.shape

(5881, 563)

In [159]:
df_test.shape

(1471, 563)

In [160]:
df_train.describe()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject
count,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,...,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0
mean,0.274811,-0.017799,-0.109396,-0.603138,-0.509815,-0.604058,-0.628151,-0.525944,-0.605374,-0.46549,...,-0.305883,-0.623548,0.008524,-0.001185,0.00934,-0.007099,-0.491501,0.059299,-0.054594,17.381568
std,0.067614,0.039422,0.058373,0.448807,0.501815,0.417319,0.424345,0.485115,0.413043,0.544995,...,0.322808,0.310371,0.33973,0.447197,0.60819,0.476738,0.509069,0.29734,0.278479,8.938316
min,-0.503823,-0.684893,-1.0,-1.0,-0.999844,-0.999667,-1.0,-0.999419,-1.0,-1.0,...,-0.979261,-0.999765,-0.97658,-1.0,-1.0,-1.0,-1.0,-1.0,-0.980143,1.0
25%,0.262919,-0.024877,-0.121051,-0.992774,-0.97768,-0.980127,-0.993602,-0.977865,-0.980112,-0.936067,...,-0.541969,-0.845985,-0.122361,-0.294369,-0.481718,-0.373345,-0.811397,-0.018203,-0.141555,8.0
50%,0.277154,-0.017221,-0.108781,-0.943933,-0.844575,-0.856352,-0.948501,-0.849266,-0.849896,-0.878729,...,-0.342923,-0.712677,0.010278,0.005146,0.011448,-0.000847,-0.709441,0.182893,0.003951,19.0
75%,0.288526,-0.01092,-0.098163,-0.24213,-0.034499,-0.26269,-0.291138,-0.068857,-0.268539,-0.01369,...,-0.127371,-0.501158,0.154985,0.28503,0.499857,0.356236,-0.51133,0.248435,0.111932,26.0
max,1.0,1.0,1.0,1.0,0.916238,1.0,1.0,0.967664,1.0,1.0,...,0.989538,0.956845,1.0,1.0,0.998702,0.996078,0.977344,0.478157,1.0,30.0


In [161]:
df_test.describe()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject
count,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,...,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0,1471.0
mean,0.273198,-0.017281,-0.108123,-0.614634,-0.515427,-0.607533,-0.63995,-0.53076,-0.609256,-0.481055,...,-0.311513,-0.632273,0.009324,0.015666,0.006272,-0.00151,-0.481737,0.055771,-0.064194,17.539089
std,0.079989,0.045957,0.049082,0.44848,0.506094,0.424243,0.422994,0.489381,0.418536,0.542756,...,0.313792,0.296179,0.324864,0.452616,0.608954,0.483028,0.522714,0.298124,0.281645,9.122876
min,-1.0,-1.0,-0.418354,-0.999717,-0.999873,-1.0,-0.999867,-1.0,-0.999879,-0.948723,...,-0.995357,-0.994664,-0.937468,-0.990492,-0.995222,-0.969066,-0.99938,-0.995073,-1.0,1.0
25%,0.263787,-0.024792,-0.120733,-0.992669,-0.979082,-0.98107,-0.993498,-0.979214,-0.980659,-0.936791,...,-0.546342,-0.844547,-0.119166,-0.265533,-0.485998,-0.3803,-0.81406,-0.017413,-0.148445,8.0
50%,0.277322,-0.017187,-0.108124,-0.952426,-0.867309,-0.86989,-0.958705,-0.873891,-0.863451,-0.890491,...,-0.347433,-0.706699,0.005049,0.023421,-0.005036,0.002408,-0.708911,0.178814,-0.002243,19.0
75%,0.288058,-0.010238,-0.096606,-0.245405,-0.030639,-0.260223,-0.29726,-0.058824,-0.256657,-0.030692,...,-0.125796,-0.51691,0.135698,0.31269,0.518184,0.374583,-0.486534,0.248126,0.096674,26.0
max,0.63151,0.359587,0.543939,0.899922,0.78259,0.931308,0.950758,0.602458,0.784041,0.821218,...,0.941113,0.89421,0.980889,0.991899,0.994366,0.979522,1.0,0.432496,0.992766,30.0


In [162]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5881 entries, 0 to 5880
Columns: 563 entries, tBodyAcc-mean()-X to Activity
dtypes: float64(561), int64(1), object(1)
memory usage: 25.3+ MB


In [163]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1471 entries, 0 to 1470
Columns: 563 entries, tBodyAcc-mean()-X to Activity
dtypes: float64(561), int64(1), object(1)
memory usage: 6.3+ MB


## 2.데이터 전처리

* 세부 요구사항
    - Label 추가 : data 에 Activity_dynamic 를 추가합니다. Activity_dynamic은 과제1에서 is_dynamic과 동일한 값입니다.
    - x와 y1, y2로 분할하시오.
        * y1 : Activity
        * y2 : Activity_dynamic
    - train : val = 8 : 2 혹은 7 : 3
    - random_state 옵션을 사용하여 다른 모델과 비교를 위해 성능이 재현되도록 합니다.

In [164]:
df_train['Activity']

0                 STANDING
1                   LAYING
2                 STANDING
3                  WALKING
4       WALKING_DOWNSTAIRS
               ...        
5876               SITTING
5877      WALKING_UPSTAIRS
5878                LAYING
5879      WALKING_UPSTAIRS
5880               SITTING
Name: Activity, Length: 5881, dtype: object

In [165]:
df_train['Activity_dynamic'] = df_train['Activity'].replace({'STANDING': 0, 'SITTING': 0, 'LAYING': 0, 'WALKING': 1, 'WALKING_UPSTAIRS': 1, 'WALKING_DOWNSTAIRS': 1})
df_train['Activity_dynamic'].value_counts()

0    3234
1    2647
Name: Activity_dynamic, dtype: int64

In [166]:
x = df_train.drop(['Activity', 'Activity_dynamic'], axis=1)
y1 = df_train['Activity']
y2 = df_train['Activity_dynamic']

In [167]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x, y1, test_size=0.3, shuffle=True, random_state=1)

# 3-1에 추가 모델링시 y2 = df_train['Activity_dynamic'] 적용
# x_train2, x_val2, y_train2, y_val2 = train_test_split(x, y2, test_size=0.3, shuffle=True, random_state=1)

## **3.단계별 모델링**

![](https://github.com/DA4BAM/image/blob/main/step%20by%20step.png?raw=true)

### (1) 단계1 : 정적/동적 행동 분류 모델

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)과 동적 행동(동적 : Walking, Walking-Up, Walking-Down)을 구분하는 모델 생성.
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

#### 1) Logistic Regression

In [168]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# model_lr = LogisticRegression()
model_lr = LogisticRegression()
model_lr.fit(x_train, y_train)
y_pred = model_lr.predict(x_val)


print('Accuracy :',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy : 0.9807365439093484
[[327   2   0   0   1   1]
 [  0 284   8   0   0   0]
 [  0  16 315   0   0   0]
 [  0   0   0 297   0   0]
 [  0   0   0   3 234   2]
 [  0   0   0   1   0 274]]
                    precision    recall  f1-score   support

            LAYING       1.00      0.99      0.99       331
           SITTING       0.94      0.97      0.96       292
          STANDING       0.98      0.95      0.96       331
           WALKING       0.99      1.00      0.99       297
WALKING_DOWNSTAIRS       1.00      0.98      0.99       239
  WALKING_UPSTAIRS       0.99      1.00      0.99       275

          accuracy                           0.98      1765
         macro avg       0.98      0.98      0.98      1765
      weighted avg       0.98      0.98      0.98      1765



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### 2) RandomForest

In [169]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# model_lr = LogisticRegression()
model_lr = RandomForestClassifier()
model_lr.fit(x_train, y_train)
y_pred = model_lr.predict(x_val)


print('Accuracy :',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy : 0.9779036827195468
[[330   0   0   0   0   1]
 [  0 280  12   0   0   0]
 [  0  16 315   0   0   0]
 [  0   0   0 294   2   1]
 [  0   0   0   1 234   4]
 [  0   0   0   0   2 273]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       331
           SITTING       0.95      0.96      0.95       292
          STANDING       0.96      0.95      0.96       331
           WALKING       1.00      0.99      0.99       297
WALKING_DOWNSTAIRS       0.98      0.98      0.98       239
  WALKING_UPSTAIRS       0.98      0.99      0.99       275

          accuracy                           0.98      1765
         macro avg       0.98      0.98      0.98      1765
      weighted avg       0.98      0.98      0.98      1765



### (2) 단계2-1 : 정적 동작 세부 분류

* 세부 요구사항
    * 정적 행동(Laying, Sitting, Standing)인 데이터 추출
    * Laying, Sitting, Standing 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [170]:
df_train['Activity'].value_counts()

LAYING                1115
STANDING              1087
SITTING               1032
WALKING                998
WALKING_UPSTAIRS       858
WALKING_DOWNSTAIRS     791
Name: Activity, dtype: int64

In [171]:
df1 = df_train[ (df_train['Activity'] == 'STANDING') | (df_train['Activity'] == 'LAYING') | (df_train['Activity'] == 'SITTING') ]
df1

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity,Activity_dynamic
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989000,-0.962596,-0.965650,-0.929747,...,-0.042494,-0.044218,0.307873,0.072790,-0.601120,0.331298,0.165163,21,STANDING,0
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.993870,-0.987558,-0.937337,...,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,15,LAYING,0
2,0.278709,-0.014511,-0.108717,-0.997720,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,11,STANDING,0
7,0.272026,-0.001329,-0.125491,-0.992068,-0.912985,-0.972451,-0.994752,-0.943141,-0.976428,-0.925446,...,-0.024442,0.076332,0.741277,0.729812,-0.817201,0.037746,0.136129,16,STANDING,0
8,0.284338,0.021956,-0.006925,-0.980153,-0.838394,-0.782357,-0.983683,-0.816199,-0.743923,-0.914011,...,0.021212,-0.009465,-0.282762,0.563343,-0.782072,0.242834,-0.025285,22,STANDING,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5874,0.257476,-0.413865,0.017374,-0.919666,0.075259,-0.630716,-0.937986,0.176796,-0.613510,-0.866296,...,-0.000793,0.267189,-0.186202,0.098099,0.792970,-0.034020,-0.928148,1,LAYING,0
5875,0.277378,-0.013298,-0.104322,-0.996596,-0.987491,-0.973345,-0.996372,-0.987746,-0.973512,-0.942156,...,0.122320,0.136275,-0.708377,-0.507788,-0.818263,0.222620,0.035430,27,STANDING,0
5876,0.277194,-0.012389,-0.131974,-0.994046,-0.940578,-0.917337,-0.994261,-0.932830,-0.908088,-0.936219,...,-0.034888,-0.261437,-0.391477,-0.877612,-0.912365,0.114009,0.080146,21,SITTING,0
5878,0.267981,-0.018348,-0.107440,-0.991303,-0.989881,-0.990313,-0.992386,-0.988852,-0.991237,-0.936099,...,0.060173,0.228739,0.684400,-0.216665,0.620363,-0.437247,-0.571840,19,LAYING,0


In [172]:
# Activity_dynamic는 모든 정적행동에 동일하여 제거
df1.drop('Activity_dynamic', axis=1, inplace=True)
df1.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Index(['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z',
       'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z',
       'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z',
       'tBodyAcc-max()-X',
       ...
       'fBodyBodyGyroJerkMag-kurtosis()', 'angle(tBodyAccMean,gravity)',
       'angle(tBodyAccJerkMean),gravityMean)',
       'angle(tBodyGyroMean,gravityMean)',
       'angle(tBodyGyroJerkMean,gravityMean)', 'angle(X,gravityMean)',
       'angle(Y,gravityMean)', 'angle(Z,gravityMean)', 'subject', 'Activity'],
      dtype='object', length=563)

In [126]:
# train/test분리
x = df1.drop(['Activity'], axis=1)
y = df1['Activity']

from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, shuffle=True, random_state=1)

In [127]:
# 모델구축1 - Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_lr = LogisticRegression()
model_lr.fit(x_train, y_train)
y_pred = model_lr.predict(x_val)


print('Accuracy :',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy : 0.9660144181256437
[[320   0   0]
 [  1 303  13]
 [  0  19 315]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       320
     SITTING       0.94      0.96      0.95       317
    STANDING       0.96      0.94      0.95       334

    accuracy                           0.97       971
   macro avg       0.97      0.97      0.97       971
weighted avg       0.97      0.97      0.97       971



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [128]:
accuracy_all = {}
accuracy_all['Logistic Regression'] = round(accuracy_score(y_val, y_pred), 3)
accuracy_all

{'Logistic Regression': 0.966}

In [129]:
# 모델구축2 - Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_rf = RandomForestClassifier()
model_rf.fit(x_train, y_train)
y_pred = model_rf.predict(x_val)


print('Accuracy :',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy : 0.964984552008239
[[320   0   0]
 [  0 298  19]
 [  0  15 319]]
              precision    recall  f1-score   support

      LAYING       1.00      1.00      1.00       320
     SITTING       0.95      0.94      0.95       317
    STANDING       0.94      0.96      0.95       334

    accuracy                           0.96       971
   macro avg       0.97      0.97      0.97       971
weighted avg       0.97      0.96      0.96       971



In [130]:
accuracy_all['Random Forest'] = round(accuracy_score(y_val, y_pred), 3)
accuracy_all

{'Logistic Regression': 0.966, 'Random Forest': 0.965}

In [131]:
# 모델구축3 - SVM

In [132]:
# 라벨 인코딩
from sklearn.preprocessing import LabelEncoder

# 라벨 인코더 초기화
label_encoder = LabelEncoder()

# 라벨 인코더를 훈련 데이터의 타깃 라벨에 맞춰 학습시키고 변환합니다.
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.fit_transform(y_val)

In [133]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_svm = SVC(C=1, gamma=0.01, kernel = 'rbf', random_state=1)
model_svm.fit(x_train, y_train_encoded)
y_pred = model_svm.predict(x_val)


print('SVM :',accuracy_score(y_val_encoded, y_pred))
print(confusion_matrix(y_val_encoded, y_pred))
print(classification_report(y_val_encoded, y_pred))

SVM : 0.9495365602471678
[[320   0   0]
 [  2 291  24]
 [  0  23 311]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       320
           1       0.93      0.92      0.92       317
           2       0.93      0.93      0.93       334

    accuracy                           0.95       971
   macro avg       0.95      0.95      0.95       971
weighted avg       0.95      0.95      0.95       971



In [134]:
accuracy_all['SVM'] = round(accuracy_score(y_val_encoded, y_pred), 3)
accuracy_all

{'Logistic Regression': 0.966, 'Random Forest': 0.965, 'SVM': 0.95}

In [135]:
# 모델구축4 - XGBoost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_xgb = XGBClassifier()
model_xgb.fit(x_train, y_train_encoded)
y_pred = model_xgb.predict(x_val)


print('XGBoost :',accuracy_score(y_val_encoded, y_pred))
print(confusion_matrix(y_val_encoded, y_pred))
print(classification_report(y_val_encoded, y_pred))

XGBoost : 0.9794026776519053
[[320   0   0]
 [  0 304  13]
 [  0   7 327]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       320
           1       0.98      0.96      0.97       317
           2       0.96      0.98      0.97       334

    accuracy                           0.98       971
   macro avg       0.98      0.98      0.98       971
weighted avg       0.98      0.98      0.98       971



In [137]:
accuracy_all['XGBoost'] = round(accuracy_score(y_val_encoded, y_pred), 3)
accuracy_all

{'Logistic Regression': 0.966,
 'Random Forest': 0.965,
 'SVM': 0.95,
 'XGBoost': 0.979}

### (3) 단계2-2 : 동적 동작 세부 분류

* 세부 요구사항
    * 동적 행동(Walking, Walking Upstairs, Walking Downstairs)인 데이터 추출
    * Walking, Walking Upstairs, Walking Downstairs 를 분류하는 모델을 생성
    * 몇가지 모델을 만들고 가장 성능이 좋은 모델을 선정하시오.

In [138]:
df_train['Activity'].value_counts()

LAYING                1115
STANDING              1087
SITTING               1032
WALKING                998
WALKING_UPSTAIRS       858
WALKING_DOWNSTAIRS     791
Name: Activity, dtype: int64

In [139]:
df2 = df_train[ (df_train['Activity'] == 'WALKING') | (df_train['Activity'] == 'WALKING_UPSTAIRS') | (df_train['Activity'] == 'WALKING_DOWNSTAIRS') ]

# Activity_dynamic는 모든 동적행동에 동일하여 제거
df2.drop(['Activity_dynamic'], axis=1, inplace=True)
df2.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Index(['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z',
       'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z',
       'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z',
       'tBodyAcc-max()-X',
       ...
       'fBodyBodyGyroJerkMag-kurtosis()', 'angle(tBodyAccMean,gravity)',
       'angle(tBodyAccJerkMean),gravityMean)',
       'angle(tBodyGyroMean,gravityMean)',
       'angle(tBodyGyroJerkMean,gravityMean)', 'angle(X,gravityMean)',
       'angle(Y,gravityMean)', 'angle(Z,gravityMean)', 'subject', 'Activity'],
      dtype='object', length=563)

In [140]:
# train/test분리
x = df2.drop(['Activity'], axis=1)
y = df2['Activity']

from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, shuffle=True, random_state=1)

In [203]:
# 모델구축1 - Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_lr = LogisticRegression(solver='lbfgs', max_iter=300)
model_lr.fit(x_train, y_train)
y_pred = model_lr.predict(x_val)


print('Accuracy :',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy : 0.9841359773371104
[[331   0   0   0   0   0]
 [  0 284   8   0   0   0]
 [  0  16 315   0   0   0]
 [  0   0   0 297   0   0]
 [  0   0   0   2 235   2]
 [  0   0   0   0   0 275]]
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       331
           SITTING       0.95      0.97      0.96       292
          STANDING       0.98      0.95      0.96       331
           WALKING       0.99      1.00      1.00       297
WALKING_DOWNSTAIRS       1.00      0.98      0.99       239
  WALKING_UPSTAIRS       0.99      1.00      1.00       275

          accuracy                           0.98      1765
         macro avg       0.98      0.98      0.98      1765
      weighted avg       0.98      0.98      0.98      1765



In [142]:
accuracy_all = {}
accuracy_all['Logistic Regression'] = round(accuracy_score(y_val, y_pred), 3)
accuracy_all

{'Logistic Regression': 0.995}

In [143]:
# 모델구축2 - Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_rf = RandomForestClassifier()
model_rf.fit(x_train, y_train)
y_pred = model_rf.predict(x_val)


print('Accuracy :',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

Accuracy : 0.9849056603773585
[[304   2   1]
 [  2 236   5]
 [  0   2 243]]
                    precision    recall  f1-score   support

           WALKING       0.99      0.99      0.99       307
WALKING_DOWNSTAIRS       0.98      0.97      0.98       243
  WALKING_UPSTAIRS       0.98      0.99      0.98       245

          accuracy                           0.98       795
         macro avg       0.98      0.98      0.98       795
      weighted avg       0.98      0.98      0.98       795



In [144]:
accuracy_all['Random Forest'] = round(accuracy_score(y_val, y_pred), 3)
accuracy_all

{'Logistic Regression': 0.995, 'Random Forest': 0.985}

In [145]:
# 라벨 인코딩
from sklearn.preprocessing import LabelEncoder

# 라벨 인코더 초기화
label_encoder = LabelEncoder()

# 라벨 인코더를 훈련 데이터의 타깃 라벨에 맞춰 학습시키고 변환합니다.
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.fit_transform(y_val)

In [146]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_svm = SVC(C=1, gamma=0.01, kernel = 'rbf', random_state=1)
model_svm.fit(x_train, y_train_encoded)
y_pred = model_svm.predict(x_val)


print('SVM :',accuracy_score(y_val_encoded, y_pred))
print(confusion_matrix(y_val_encoded, y_pred))
print(classification_report(y_val_encoded, y_pred))

SVM : 1.0
[[307   0   0]
 [  0 243   0]
 [  0   0 245]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       307
           1       1.00      1.00      1.00       243
           2       1.00      1.00      1.00       245

    accuracy                           1.00       795
   macro avg       1.00      1.00      1.00       795
weighted avg       1.00      1.00      1.00       795



In [147]:
accuracy_all['SVM'] = round(accuracy_score(y_val_encoded, y_pred), 3)
accuracy_all

{'Logistic Regression': 0.995, 'Random Forest': 0.985, 'SVM': 1.0}

In [148]:
# 모델구축4 - XGBoost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_xgb = XGBClassifier()
model_xgb.fit(x_train, y_train_encoded)
y_pred = model_xgb.predict(x_val)


print('XGBoost :',accuracy_score(y_val_encoded, y_pred))
print(confusion_matrix(y_val_encoded, y_pred))
print(classification_report(y_val_encoded, y_pred))

XGBoost : 0.9949685534591195
[[306   0   1]
 [  1 242   0]
 [  1   1 243]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       307
           1       1.00      1.00      1.00       243
           2       1.00      0.99      0.99       245

    accuracy                           0.99       795
   macro avg       1.00      0.99      0.99       795
weighted avg       0.99      0.99      0.99       795



In [149]:
accuracy_all['XGBoost'] = round(accuracy_score(y_val_encoded, y_pred), 3)
accuracy_all

{'Logistic Regression': 0.995,
 'Random Forest': 0.985,
 'SVM': 1.0,
 'XGBoost': 0.995}

### (4) 분류 모델 합치기


* 세부 요구사항
    * 두 단계 모델을 통합하고, 새로운 데이터(test)에 대해서 최종 예측결과와 성능평가가 나오도록 함수로 만들기
    * 데이터 파이프라인 구축 : test데이터가 로딩되어 전처리 과정을 거치고, 예측 및 성능 평가 수행

![](https://github.com/DA4BAM/image/blob/main/pipeline%20function.png?raw=true)

#### 1) 함수 만들어서 분류모델 합치기

In [26]:
def model_classication(df_train):    

In [199]:


#######################################################################################################################################
df1 = df_train[ (df_train['Activity'] == 'STANDING') | (df_train['Activity'] == 'LAYING') | (df_train['Activity'] == 'SITTING') ]
df2 = df_train[ (df_train['Activity'] == 'WALKING') | (df_train['Activity'] == 'WALKING_UPSTAIRS') | (df_train['Activity'] == 'WALKING_DOWNSTAIRS') ]

# Activity_dynamic는 모든 동적행동에 동일하여 제거
df1.drop(['Activity_dynamic'], axis=1, inplace=True)
df2.drop(['Activity_dynamic'], axis=1, inplace=True)

# train/test분리
x1 = df1.drop(['Activity'], axis=1)
y1 = df1['Activity']

x2 = df2.drop(['Activity'], axis=1)
y2 = df2['Activity']

#######################################################################################################################################
from sklearn.model_selection import train_test_split
x_train1, x_val1, y_train1, y_val1 = train_test_split(x1, y1, test_size=0.3, shuffle=True, random_state=1)
x_train2, x_val2, y_train2, y_val2 = train_test_split(x2, y2, test_size=0.3, shuffle=True, random_state=1)


#######################################################################################################################################
# 모델구축1 - Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_lr = LogisticRegression()

# 정적모델
model_lr.fit(x_train1, y_train1)
y_pred = model_lr.predict(x_val1)
accuracy_all_1 = []
accuracy_all_1.append(['logistic regression', round(accuracy_score(y_val1, y_pred), 3)])


# 동적모델
model_lr.fit(x_train2, y_train2)
y_pred = model_lr.predict(x_val2)
accuracy_all_2 = []
accuracy_all_2.append(['logistic regression', round(accuracy_score(y_val2, y_pred), 3)])



#######################################################################################################################################
# 모델구축2 - Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_rf = RandomForestClassifier()

# 정적모델
model_rf.fit(x_train1, y_train1)
y_pred = model_rf.predict(x_val1)
accuracy_all_1.append(['Random Forest', round(accuracy_score(y_val1, y_pred), 3)])

# 동적모델
model_rf.fit(x_train2, y_train2)
y_pred = model_rf.predict(x_val2)
accuracy_all_2.append(['Random Forest', round(accuracy_score(y_val2, y_pred), 3)])


#######################################################################################################################################
# 모델구축3 - SVM

# 라벨 인코딩
from sklearn.preprocessing import LabelEncoder

# 라벨 인코더 초기화
label_encoder = LabelEncoder()

# 라벨 인코더를 훈련 데이터의 타깃 라벨에 맞춰 학습시키고 변환합니다.
y_train1_encoded = label_encoder.fit_transform(y_train1)
y_train2_encoded = label_encoder.fit_transform(y_train2)

y_val1_encoded = label_encoder.fit_transform(y_val1)
y_val2_encoded = label_encoder.fit_transform(y_val2)
#####################################################################

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_svm = SVC(C=1, gamma=0.01, kernel = 'rbf', random_state=1)

# 정적모델
model_svm.fit(x_train1, y_train1_encoded)
y_pred = model_svm.predict(x_val1)
accuracy_all_1.append(['SVM', round(accuracy_score(y_val1_encoded, y_pred), 3)])

# 동적모델
model_svm.fit(x_train2, y_train2_encoded)
y_pred = model_svm.predict(x_val2)
accuracy_all_2.append(['SVM', round(accuracy_score(y_val2_encoded, y_pred), 3)])

#######################################################################################################################################
# 모델구축4 - XGBoost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_xgb = XGBClassifier()

# 정적모델
model_xgb.fit(x_train1, y_train1_encoded)
y_pred = model_xgb.predict(x_val1)
accuracy_all_1.append(['XGBoostt', round(accuracy_score(y_val1_encoded, y_pred), 3)])

# 동적모델
model_xgb.fit(x_train2, y_train2_encoded)
y_pred = model_xgb.predict(x_val2)
accuracy_all_2.append(['XGBoost', round(accuracy_score(y_val2_encoded, y_pred), 3)])

#######################################################################################################################################
display('model1 : ', accuracy_all_1)
print()
display('model2 : ', accuracy_all_2)
print()

avg_total=[]
for i in range(len(accuracy_all_1)):
    avg = (accuracy_all_1[i][1] + accuracy_all_2[i][1])/2
    avg = round(avg, 3)
    avg_total.append( [ accuracy_all_1[i][0], avg ] )
print('avg_total : ', avg_total)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


'model1 : '

[['logistic regression', 0.966],
 ['Random Forest', 0.96],
 ['SVM', 0.95],
 ['XGBoostt', 0.979]]




'model2 : '

[['logistic regression', 0.995],
 ['Random Forest', 0.982],
 ['SVM', 1.0],
 ['XGBoost', 0.995]]


avg_total :  [['logistic regression', 0.98], ['Random Forest', 0.971], ['SVM', 0.975], ['XGBoostt', 0.987]]


In [197]:
# x = df_train.drop(['Activity', 'Activity_dynamic'], axis=1)
# y = df_train['Activity']


# # 정적+동적모델만 합칠경우 / 정적,동적으로 분류모델 통합안함
# x_train_all, x_val_all, y_train_all, y_val_all = train_test_split(x, y, test_size=0.3, shuffle=True, random_state=1)

# x_train_stop = x_train_all[ (x_train_all['Activity'] == 'STANDING') | (x_train_all['Activity'] == 'LAYING') | (x_train_all['Activity'] == 'SITTING') ]
# x_train_act = x_train_all[ (x_train_all['Activity'] == 'WALKING') | (x_train_all['Activity'] == 'WALKING_UPSTAIRS') | (x_train_all['Activity'] == 'WALKING_DOWNSTAIRS') ]




In [None]:


# 모델구축2 - Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_rf = RandomForestClassifier()
model_rf.fit(x_train, y_train)
y_pred = model_rf.predict(x_val)


print('Accuracy :',accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

accuracy_all['Random Forest'] = round(accuracy_score(y_val, y_pred), 3)
accuracy_all

# 라벨 인코딩
from sklearn.preprocessing import LabelEncoder

# 라벨 인코더 초기화
label_encoder = LabelEncoder()

# 라벨 인코더를 훈련 데이터의 타깃 라벨에 맞춰 학습시키고 변환합니다.
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.fit_transform(y_val)

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_svm = SVC(C=1, gamma=0.01, kernel = 'rbf', random_state=1)
model_svm.fit(x_train, y_train_encoded)
y_pred = model_svm.predict(x_val)


print('SVM :',accuracy_score(y_val_encoded, y_pred))
print(confusion_matrix(y_val_encoded, y_pred))
print(classification_report(y_val_encoded, y_pred))

accuracy_all['SVM'] = round(accuracy_score(y_val_encoded, y_pred), 3)
accuracy_all

# 모델구축4 - XGBoost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
model_xgb = XGBClassifier()
model_xgb.fit(x_train, y_train_encoded)
y_pred = model_xgb.predict(x_val)


print('XGBoost :',accuracy_score(y_val_encoded, y_pred))
print(confusion_matrix(y_val_encoded, y_pred))
print(classification_report(y_val_encoded, y_pred))

accuracy_all['XGBoost'] = round(accuracy_score(y_val_encoded, y_pred), 3)
accuracy_all

#### 2) test 셋으로 예측하고 평가하기

* 성능 평가