In [2]:
import pandas as pd
from pathlib import Path

# Load the product-inspection CSV and convert its 'date' column to datetime
# in a single chained expression.
data_path = Path('../../data_analysis_lect/datasets/product_inspection/product_inspection.csv')
df = pd.read_csv(data_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
# Preview the first rows.
df.head()

Unnamed: 0,date,inspection_step,value,upper_spec,target,lower_spec
0,2022-01-01,A,21.2,22.0,21.3,20.6
1,2022-01-02,A,21.7,22.0,21.3,20.6
2,2022-01-03,A,21.4,22.0,21.3,20.6
3,2022-01-04,A,21.5,22.0,21.3,20.6
4,2022-01-05,A,21.5,22.0,21.3,20.6


In [3]:
# Per-group average of one variable: mean of `value` within each inspection_step.
df.groupby('inspection_step')['value'].agg('mean')

inspection_step
A    21.295105
B    31.628671
C    28.792308
Name: value, dtype: float64

In [7]:
# Standard deviation of `value` per inspection_step, broadcast back to every
# row: transform() returns one entry per original row rather than one per group.
df['value'].groupby(df['inspection_step']).transform('std')

0      0.259286
1      0.259286
2      0.259286
3      0.259286
4      0.259286
         ...   
424    1.259112
425    1.259112
426    1.259112
427    1.259112
428    1.259112
Name: value, Length: 429, dtype: float64

In [5]:
# For each unique inspection_step, keep only the row with the earliest date:
# sort by step and date, then retain the first row seen for every step.
temp = (
    df
    .sort_values(by=['inspection_step', 'date'])
    .drop_duplicates(subset='inspection_step', keep='first')
)
temp

Unnamed: 0,date,inspection_step,value,upper_spec,target,lower_spec,normalized1
0,2022-01-01,A,21.2,22.0,21.3,20.6,-0.366795
143,2022-01-01,B,31.6,32.1,31.6,31.1,-0.159778
286,2022-01-01,C,29.7,32.5,28.9,25.3,0.720899


In [4]:
# data normalization 1: standardization using each group's mean and standard deviation
def _standardize(group_values):
    """Return the values centred on their mean and scaled by their standard deviation."""
    return (group_values - group_values.mean()) / group_values.std()


# transform() keeps the original number of rows while applying the function
# separately to each inspection_step group.
df['normalized1'] = df.groupby('inspection_step')['value'].transform(_standardize)
df['normalized1']

0     -0.366795
1      1.561575
2      0.404553
3      0.790227
4      0.790227
         ...   
424    1.356267
425   -1.502891
426    0.800320
427    0.482636
428   -1.344049
Name: normalized1, Length: 429, dtype: float64

In [11]:
# Count how many times each distinct combination of values across all columns
# occurs in df (one entry per unique row).
# NOTE(review): every combination visible in the displayed output has a count
# of 1; if per-step row counts were intended, df['inspection_step'].value_counts()
# may be what was meant — confirm.
df.value_counts()

inspection_step  date        value  upper_spec  target  lower_spec  normalized1  normalized2
A                2022-01-01  21.2   22.0        21.3    20.6        -0.366795     0.0           1
C                2022-01-10  30.7   32.5        28.9    25.3         1.515109     1.0           1
                 2022-01-08  30.0   32.5        28.9    25.3         0.959162     0.3           1
                 2022-01-07  29.8   32.5        28.9    25.3         0.800320     0.1           1
                 2022-01-06  30.3   32.5        28.9    25.3         1.197425     0.6           1
                                                                                               ..
A                2022-05-21  21.6   22.0        21.3    20.6         1.175901     0.4           1
                 2022-05-20  21.6   22.0        21.3    20.6         1.175901     0.4           1
                 2022-05-19  21.7   22.0        21.3    20.6         1.561575     0.5           1
                 2022-05-

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of `value`,
# computed separately for each inspection_step.
df['value'].groupby(df['inspection_step']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
inspection_step,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,143.0,21.295105,0.259286,20.5,21.1,21.3,21.5,21.9
B,143.0,31.628671,0.179445,31.1,31.5,31.6,31.8,32.0
C,143.0,28.792308,1.259112,25.1,27.9,28.7,29.8,32.5


In [6]:
# Reduce the sub-dataset stored in `temp` to a Series of `value` indexed by
# inspection_step (only the index and the value column are kept).
# reset_index() runs first so the cell is safe to re-run: on the first run it
# merely moves the DataFrame's row index into a throwaway column, and on later
# runs (when `temp` is already the Series) it turns the inspection_step index
# back into a column, so set_index() succeeds either way with the same result.
temp = temp.reset_index().set_index('inspection_step')['value']
temp

inspection_step
A    21.2
B    31.6
C    29.7
Name: value, dtype: float64

In [9]:
# data normalization 2
# Baseline offset: subtract from every measurement the `value` that its
# inspection_step had on that step's earliest date. `temp` is the Series of
# those baseline values, indexed by inspection_step. This is a difference from
# a reference measurement, not a min-max scaling.
# Mapping each row's inspection_step to its baseline aligns the two without a
# set_index/reset_index round trip, so df's index and column order stay
# untouched and the cell can be re-run freely.
df['normalized2'] = df['value'] - df['inspection_step'].map(temp)
df

Unnamed: 0,inspection_step,date,value,upper_spec,target,lower_spec,normalized1,normalized2
0,A,2022-01-01,21.2,22.0,21.3,20.6,-0.366795,0.0
1,A,2022-01-02,21.7,22.0,21.3,20.6,1.561575,0.5
2,A,2022-01-03,21.4,22.0,21.3,20.6,0.404553,0.2
3,A,2022-01-04,21.5,22.0,21.3,20.6,0.790227,0.3
4,A,2022-01-05,21.5,22.0,21.3,20.6,0.790227,0.3
...,...,...,...,...,...,...,...,...
424,C,2022-05-19,30.5,32.5,28.9,25.3,1.356267,0.8
425,C,2022-05-20,26.9,32.5,28.9,25.3,-1.502891,-2.8
426,C,2022-05-21,29.8,32.5,28.9,25.3,0.800320,0.1
427,C,2022-05-22,29.4,32.5,28.9,25.3,0.482636,-0.3
