In [1]:
import pandas as pd

In [2]:
# Load the anomaly CSV of each crop; the first CSV column becomes the index.
tomato, strawberry, paprica = [
    pd.read_csv(csv_name, index_col=0)
    for csv_name in (
        'tomato_anomaly.csv',
        'strawberry_anomaly.csv',
        'paprica_anomaly.csv',
    )
]

In [3]:
# Display the tomato frame as loaded from CSV.
tomato

Unnamed: 0,xco2,xinsunadd,xintemp1,xsthum,yield_output
mysb4_1,1.568226,1.983854,0.331212,1.796899,649.31
mysb4_2,1.815418,2.229121,0.686742,1.891823,11725.0
mysb4_3,0.919963,2.38445,0.353057,0.80456,8069.44
mysb2_2,0.231546,1.975161,0.717848,3.032744,12148.15
mysb6_2,2.32479,1.861574,0.574999,1.989953,5439.81


In [4]:
# Display the strawberry frame as loaded from CSV.
strawberry

Unnamed: 0,xco2,xinsunadd,xintemp1,xsthum,yield_output
mysb6_6,0.296633,3.128465,1.057159,1.392112,1759.67
mysb6_1,0.0,0.260533,1.217159,1.610267,1090.28
mysb4_4,0.25891,1.731777,0.343589,0.78581,1671.79
mysb6_5,0.033354,2.677468,0.810482,2.041166,1267.36


In [5]:
# Display the paprica frame as loaded from CSV.
paprica

Unnamed: 0,xco2,xinsunadd,xintemp1,xsthum,yield_output
mysb2_1,0.285799,1.947975,1.512216,1.056203,5705.79


# 데이터 전처리
- 컬럼 명세
    - yield_output : 각 농가별 생산량 정보 입력 (10a당 생산량)
    - deviation : 작물별 생산량의 중앙값으로부터의 편차
    
- scaling
    - RobustScaler를 이용하여 평균이 아닌 중앙값으로 센터링하고, 사분위 범위(IQR)로 스케일링함.
<br>
- 데이터 양이 적고, 생산량 격차가 크므로, 평균 대신 중앙값을 대표값으로 활용

In [6]:
def get_dev(df):
    '''Return a copy of ``df`` with a ``deviation`` column added.

    ``deviation`` is each row's ``yield_output`` minus the median of the
    ``yield_output`` column.

    Args:
        df: DataFrame containing a numeric ``yield_output`` column.

    Returns:
        A new DataFrame with the extra ``deviation`` column appended. The
        frame passed in is left unmodified, so calling this function has no
        hidden side effect on data that is shared with other cells.

    Raises:
        KeyError: if ``df`` has no ``yield_output`` column.
    '''
    median_yield = df['yield_output'].median()
    # assign() builds a new frame instead of writing into the argument.
    return df.assign(deviation=df['yield_output'] - median_yield)

# Add the median-deviation column to every crop frame, in the same order as before.
tomato, strawberry, paprica = map(get_dev, (tomato, strawberry, paprica))

In [7]:
# Display tomato after get_dev; it now carries the 'deviation' column.
tomato

Unnamed: 0,xco2,xinsunadd,xintemp1,xsthum,yield_output,deviation
mysb4_1,1.568226,1.983854,0.331212,1.796899,649.31,-7420.13
mysb4_2,1.815418,2.229121,0.686742,1.891823,11725.0,3655.56
mysb4_3,0.919963,2.38445,0.353057,0.80456,8069.44,0.0
mysb2_2,0.231546,1.975161,0.717848,3.032744,12148.15,4078.71
mysb6_2,2.32479,1.861574,0.574999,1.989953,5439.81,-2629.63


In [8]:
from sklearn.preprocessing import RobustScaler
def _fit_scaled_column(crop_df):
    '''Fit a fresh RobustScaler on ``deviation``, store the result in ``scaled``, and return the scaler.'''
    fitted = RobustScaler()
    # Double brackets keep the input two-dimensional, as the scaler expects.
    crop_df['scaled'] = fitted.fit_transform(crop_df[['deviation']])
    return fitted


# One independently fitted scaler per crop; each keeps its own name because
# the fitted objects are used again further down the notebook.
tomato_scaler = _fit_scaled_column(tomato)
strawberry_scaler = _fit_scaled_column(strawberry)
paprica_scaler = _fit_scaled_column(paprica)

In [23]:
import joblib
# Write each fitted scaler to a .joblib file (relative path, so it lands in the
# current working directory) — presumably for reuse outside this notebook.
joblib.dump(tomato_scaler, 'tomato_scaler.joblib')
joblib.dump(strawberry_scaler, 'strawberry_scaler.joblib')
# Last expression of the cell: its return value is what the notebook displays.
joblib.dump(paprica_scaler, 'paprica_scaler.joblib')

['paprica_scaler.joblib']

## 인풋 데이터 형태
- 인덱스 : 각 농가
- 컬럼 : 이상치 탐지한 환경 데이터
- 값 : 탐지된 이상치들의 총합

In [9]:
# Display tomato with the 'deviation' and 'scaled' columns added by preprocessing.
tomato

Unnamed: 0,xco2,xinsunadd,xintemp1,xsthum,yield_output,deviation,scaled
mysb4_1,1.568226,1.983854,0.331212,1.796899,649.31,-7420.13,-1.180574
mysb4_2,1.815418,2.229121,0.686742,1.891823,11725.0,3655.56,0.581615
mysb4_3,0.919963,2.38445,0.353057,0.80456,8069.44,0.0,0.0
mysb2_2,0.231546,1.975161,0.717848,3.032744,12148.15,4078.71,0.64894
mysb6_2,2.32479,1.861574,0.574999,1.989953,5439.81,-2629.63,-0.418385


In [10]:
# Display strawberry with the 'deviation' and 'scaled' columns added by preprocessing.
strawberry

Unnamed: 0,xco2,xinsunadd,xintemp1,xsthum,yield_output,deviation,scaled
mysb6_6,0.296633,3.128465,1.057159,1.392112,1759.67,290.095,0.616345
mysb6_1,0.0,0.260533,1.217159,1.610267,1090.28,-379.295,-0.805862
mysb4_4,0.25891,1.731777,0.343589,0.78581,1671.79,202.215,0.429632
mysb6_5,0.033354,2.677468,0.810482,2.041166,1267.36,-202.215,-0.429632


In [11]:
# Display paprica with the 'deviation' and 'scaled' columns added by preprocessing.
paprica

Unnamed: 0,xco2,xinsunadd,xintemp1,xsthum,yield_output,deviation,scaled
mysb2_1,0.285799,1.947975,1.512216,1.056203,5705.79,0.0,0.0


# 모델학습
- Feature : 'xco2' 이상치 총합, 'xinsunadd' 이상치 총합, 'xintemp1' 이상치 총합, 'xsthum' 이상치 총합
- Target : median으로부터의 편차를 스케일링한 값

- Ridge 선형회귀 모델을 사용하여 가중치에 규제를 줌으로써, 적은 데이터 셋에도 출력값이 튀지 않도록 설정.
    - 추후 데이터 양이 매우 많아지면 다른 모델 적용을 검토

In [12]:
# Model inputs: the four environment columns of the crop frames.
FEATURE = [
    'xco2',
    'xinsunadd',
    'xintemp1',
    'xsthum',
]
# Model target: the column produced by scaling the median deviation.
TARGET = 'scaled'

In [13]:
from sklearn.linear_model import Ridge

def train_ridge(df, alpha=2):
    '''Fit a Ridge regression on the FEATURE / TARGET columns of ``df``.

    Args:
        df: DataFrame containing every column listed in the module-level
            ``FEATURE`` list plus the ``TARGET`` column.
        alpha: Regularization strength passed to ``Ridge``. Defaults to 2,
            the value that used to be hard-coded, so existing calls behave
            exactly as before.

    Returns:
        The fitted ``Ridge`` estimator.
    '''
    features = df[FEATURE]
    target = df[TARGET]

    model = Ridge(alpha=alpha)
    model.fit(features, target)
    return model

In [14]:
# NOTE(review): farm 'mysb4_1' is excluded from the tomato training rows —
# presumably because its yield is far below the other farms; confirm the reason.
model_tomato = train_ridge(tomato.drop(index='mysb4_1'))
model_strawberry = train_ridge(strawberry)
# NOTE(review): the paprica frame shown earlier has a single row — verify that
# a model fitted on it is meaningful.
model_paprica = train_ridge(paprica)