# 0. 기본 라이브러리 import

In [1]:
import pandas as pd
import numpy as np
import random
import os
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Malgun Gothic'
sns.set(font="Malgun Gothic")
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# 데이터 분할 및 평가 지표

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split

# LightGBM 라이브러리

In [3]:
from lightgbm import LGBMRegressor

# 파라미터 튜닝 함수

In [4]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
import optuna

# 빈도테이블 생성

In [5]:
def count(x):
    data1=pd.DataFrame(x.value_counts()).reset_index()
    data1.columns=['category','빈도수']
    return data1

# Barplot 생성

In [6]:
def bargraph(x,y,x_label,y_label,figsize1,figsize2,data):
    plt.figure(figsize=(figsize1, figsize2))
    if data[x].dtypes=='int64':
        data[x]=data[x].astype('str')
    data=data.sort_values(by=y,ascending=False).reset_index()
    
    for index, row in data.iterrows():
        g=sns.barplot(x,y,data=data)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        g.text(index,row[y],row[y],color='black',ha='center')

# 성능 지표 추출

In [7]:
# 성능지표 추출
def Evaluation_metric(actual,pred):
    print(f'MAE: {round(mean_absolute_error(actual,pred),2)}')
    print(f'MSE: {round(mean_squared_error(actual,pred),2)}')
    print(f'RMSE: {round(sqrt(mean_squared_error(actual,pred)),2)}')
    #print(f'RMSPE: {round(sqrt(mean_squared_error(actual,pred))/np.mean(actual)*100,2)}%')
    print(f'RMSPE: {round(np.sqrt(np.mean(((actual - pred) / actual) ** 2)) * 100, 2)}%')

# 1. 시드 고정

In [8]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

# 2. 데이터 불러오기

In [9]:
# Train, Valid, Test Split용 데이터
data=pd.read_csv(os.listdir()[10],encoding='EUC-KR')
# 지표 확인용 데이터
data_real=pd.read_csv(os.listdir()[10],encoding='EUC-KR')
# 배정신청인원 0명 제거
data=data.query('합계!=0')
#필요 컬럼 추출
selected_columns_1=['구분','합계', '작물 종류','농지면적(실제경작)','전년대비농경체증감률','고령농경체비율','전년도이탈인원','전년도활용여부']
selected_columns_2=['비고', '지자체명_시도', '지자체명_시군구', '구분', '농업경영체','합계', '작물 종류','농지면적(실제경작)','전년대비농경체증감률','고령농경체비율','전년도이탈인원','전년도활용여부'] #원본 데이터 컬럼
data=data[selected_columns_1]
data_real=data_real[selected_columns_2]
# 농지면적(실제경작) 컬럼명 변경
data.rename(columns={'농지면적(실제경작)':'농지면적','작물 종류':'작물종류'},inplace=True)

# 3. 이상치 제거: 농지 면적, 합계

In [10]:
data=data.query('0.04<농지면적<2.5')
data=data.query('합계<11')

# 4. 데이터 분할 Train, Valid, Test

In [11]:
# 작물 종류 기준 층화 추출 Train Test 비율 8:2
X=data.drop(columns=['합계']) #독립변수 Set
Y=data['합계'] #Target 변수 set

# Train, Test 분할
X_train, X_test, Y_train, Y_test=train_test_split(X,Y,test_size=0.2,stratify=X['작물종류'],random_state=42)

# 2차 Train, Validation 분할
X_train, X_valid, Y_train, Y_valid=train_test_split(X_train,Y_train,test_size=0.2,stratify=X_train['작물종류'],random_state=42)

# 5. 연속형 변수 정규화 & 범주형 변수 라벨 인코딩 진행

In [12]:
# 연속형 변수 정규화
min_max_scaler=MinMaxScaler()
for i in X_train.columns:
    if (X_train[i].dtypes!='object'):
        X_train[i]=min_max_scaler.fit_transform(X_train[[i]])
        X_valid[i]=min_max_scaler.transform(X_valid[[i]])
        X_test[i]=min_max_scaler.transform(X_test[[i]])
    else:
        pass

In [13]:
# 범주형 변수 라벨 인코딩
label_encoder=LabelEncoder()
for i in X_train.columns:
    if X_train[i].dtypes=='object':
        X_train[i]=label_encoder.fit_transform(X_train[i])
        X_valid[i]=label_encoder.transform(X_valid[i])
        X_test[i]=label_encoder.transform(X_test[i])
    else:
        pass

# 6. LightGBM parameter Tuning

- 주요 파라미터 설명

- num_leaves: 트리가 가질 수 있는 최대 잎의 수

- max_depth: 최대 트리 깊이

- learning_rate: 학습률

- n_estimator: 생성할 부스팅 트리 개수

- subsample: 훈련데이터 샘플링 비율

-------------------------------------------------------------------------------------------------------------


- 초기 파라미터
- model_reg=LGBMRegressor(n_jobs=-1,n_estimators=150,learning_rate=0.05,random_state=42,objective='regression')

-------------------------------------------------------------------------------------------------------------

# 7. 초기 기본 모델

In [14]:
model_reg=LGBMRegressor(n_jobs=-1,n_estimators=200,learning_rate=0.05,random_state=42,objective='regression')
model_reg.fit(X_train,Y_train)
pred=model_reg.predict(X_test)
# 인원 수 예측이기에 예측된 결과에 반올림 적용
pred=pd.DataFrame(pred,columns=['예측값'])
pred['예측값']=round(pred['예측값'],0)
pred
print('-'*100)
Evaluation_metric(Y_test,pred=pred['예측값'])
print('-'*100)
# Test data Set
test_index=X_test.index.tolist()
data_test=data_real.loc[test_index]
data_test=data_test.reset_index()
data_test.drop(columns=['index'],inplace=True)
data_test['predict']=pred
result=data_test.groupby(['지자체명_시도','지자체명_시군구'])[['합계','predict']].sum().reset_index()
result_1=result.query('합계>10')
Evaluation_metric(result_1['합계'],pred=result_1['predict'])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000245 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 447
[LightGBM] [Info] Number of data points in the train set: 7784, number of used features: 7
[LightGBM] [Info] Start training from score 3.371274
----------------------------------------------------------------------------------------------------
MAE: 1.25
MSE: 2.97
RMSE: 1.72
RMSPE: 125.02%
----------------------------------------------------------------------------------------------------
MAE: 8.2
MSE: 140.0
RMSE: 11.83
RMSPE: 21.14%


# 8. Bayesian Optimization 튜닝

In [15]:
# 목적 함수 정의
results = {
    'subsample': [],
    'n_estimators': [],
    'learning_rate': [],
    'target': []  # 최적화 결과인 target 값 저장
}

def xgb_cv(subsample, n_estimators, learning_rate):
    params = {
        'subsample': subsample,
        'n_estimators': int(n_estimators),
        'learning_rate': learning_rate,
        'n_jobs':-1,
        'objective':'regression',
        'verbosity':0
    }
    
    # XGBoost Regressor 모델 초기화
    model_reg = LGBMRegressor(**params)
    
    #model Train 학습
    model_reg.fit(X_train,Y_train)
    #model validation
    scores = -cross_val_score(model_reg, X_valid, Y_valid, cv=5, scoring='neg_mean_squared_error').mean()
    results['subsample'].append(subsample)
    results['n_estimators'].append(n_estimators)
    results['learning_rate'].append(learning_rate)
    results['target'].append(scores)
    return scores

# Bayesian Optimization 수행
xgbBO = BayesianOptimization(
    xgb_cv,
    {'subsample': (0.80, 1.0),
     'n_estimators': (200, 300),
     'learning_rate': (0.03,0.06)}
)

# 최적화
xgbBO.maximize(init_points=10, n_iter=15)

idx_of_min=results['target'].index(min(results['target']))
min_pam={}
for key,value in results.items():
    if key=='target':
        pass
    else:
        if (key=='max_depth') or (key=='n_estimators'):
            min_pam[key]=int(round(value[idx_of_min],0))
        else:
            min_pam[key]=value[idx_of_min]
            
print(f'최종 파라미터는 {min_pam}입니다')

model_reg = LGBMRegressor(**min_pam)
model_reg.fit(X_train,Y_train)

#예측값 
pred=model_reg.predict(X_test)
# 인원 수 예측이기에 예측된 결과에 반올림 적용
pred=pd.DataFrame(pred,columns=['예측값'])
#반올림
pred['예측값']=round(pred['예측값'],0)
#평가지표
print('농업경영체별 평가지표 결과')
Evaluation_metric(Y_test,pred=pred['예측값'])
print('-'*100)

test_index=X_test.index.tolist()
data_test=data_real.loc[test_index]
data_test=data_test.reset_index()
data_test.drop(columns=['index'],inplace=True)
#예측값
data_test['predict']=pred
result=data_test.groupby(['지자체명_시도','지자체명_시군구'])[['합계','predict']].sum().reset_index()
result_1=result.query('합계>10')
print('배정신청인원 10명 이상 시군구 단위 평가지표')
Evaluation_metric(result_1['합계'],pred=result_1['predict'])

|   iter    |  target   | learni... | n_esti... | subsample |
-------------------------------------------------------------
| [0m1        [0m | [0m3.594    [0m | [0m0.05741  [0m | [0m227.0    [0m | [0m0.847    [0m |
| [0m2        [0m | [0m3.527    [0m | [0m0.03855  [0m | [0m244.7    [0m | [0m0.8274   [0m |
| [95m3        [0m | [95m3.637    [0m | [95m0.05541  [0m | [95m288.1    [0m | [95m0.9865   [0m |
| [0m4        [0m | [0m3.521    [0m | [0m0.04048  [0m | [0m235.3    [0m | [0m0.871    [0m |
| [0m5        [0m | [0m3.539    [0m | [0m0.04219  [0m | [0m228.9    [0m | [0m0.8933   [0m |
| [0m6        [0m | [0m3.493    [0m | [0m0.03225  [0m | [0m230.6    [0m | [0m0.9037   [0m |
| [0m7        [0m | [0m3.615    [0m | [0m0.05548  [0m | [0m252.2    [0m | [0m0.8799   [0m |
| [0m8        [0m | [0m3.493    [0m | [0m0.03163  [0m | [0m237.0    [0m | [0m0.9612   [0m |
| [0m9        [0m | [0m3.606    [0m | [0m0.0546   

# 9. OPTUNA 라이브러리 활용 파라미터 튜닝

파라미터 튜닝 코드

- optuna.trial.Trial.suggest_categorical() : 리스트 범위 내에서 값을 선택한다.
- optuna.trial.Trial.suggest_int() : 범위 내에서 정수형 값을 선택한다.
- optuna.trial.Trial.suggest_float() : 범위 내에서 소수형 값을 선택한다.
- optuna.trial.Trial.suggest_uniform() : 범위 내에서 균일분포 값을 선택한다.
- optuna.trial.Trial.suggest_discrete_uniform() : 범위 내에서 이산 균일분포 값을 선택한다.
- optuna.trial.Trial.suggest_loguniform() : 범위 내에서 로그 함수 값을 선택한다.

In [17]:
def objective(trial):
    params={
        'n_jobs':-1,
        'objective':'regression',
        'verbosity':0,
        'subsample': trial.suggest_categorical('subsample',[0.8,0.85,0.90,0.95,1]),
        'n_estimators': trial.suggest_int('n_estimators',200,400),
        'learning_rate': trial.suggest_float('learning_rate',0.03,0.05)
    }
    # 학습 모델 생성
    model_reg= LGBMRegressor(**params)
    
    #트레인 셋 학습
    model_reg.fit(X_train,Y_train)
    
    #검증 데이터 셋으로 모델 검증
    score = mean_absolute_error(model_reg.predict(X_valid), Y_valid)
    return score

study=optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=150)

# 최적화된 파라미터 값 확인
best_params = study.best_params
print(best_params)

model_reg =  LGBMRegressor(**best_params)
model_reg.fit(X_train,Y_train)

#예측값 
pred=model_reg.predict(X_test)
# 인원 수 예측이기에 예측된 결과에 반올림 적용
pred=pd.DataFrame(pred,columns=['예측값'])
#반올림
pred['예측값']=round(pred['예측값'],0)
#평가지표
print('농업경영체별 평가지표 결과')
Evaluation_metric(Y_test,pred=pred['예측값'])
print('-'*100)

test_index=X_test.index.tolist()
data_test=data_real.loc[test_index]
data_test=data_test.reset_index()
data_test.drop(columns=['index'],inplace=True)
#예측값
data_test['predict']=pred
result=data_test.groupby(['지자체명_시도','지자체명_시군구'])[['합계','predict']].sum().reset_index()
result_1=result.query('합계>10')
print('배정신청인원 10명 이상 시군구 단위 평가지표')
Evaluation_metric(result_1['합계'],pred=result_1['predict'])

[I 2023-11-28 11:11:16,190] A new study created in memory with name: no-name-14ea8d2a-daf8-485c-ae7e-876f70784d21
[I 2023-11-28 11:11:16,406] Trial 0 finished with value: 1.322859975546507 and parameters: {'subsample': 0.8, 'n_estimators': 277, 'learning_rate': 0.03495557000618027}. Best is trial 0 with value: 1.322859975546507.
[I 2023-11-28 11:11:16,576] Trial 1 finished with value: 1.3228084238145807 and parameters: {'subsample': 0.95, 'n_estimators': 354, 'learning_rate': 0.031732134014734854}. Best is trial 1 with value: 1.3228084238145807.
[I 2023-11-28 11:11:16,740] Trial 2 finished with value: 1.3220283961506396 and parameters: {'subsample': 0.95, 'n_estimators': 356, 'learning_rate': 0.044453065545376806}. Best is trial 2 with value: 1.3220283961506396.
[I 2023-11-28 11:11:16,909] Trial 3 finished with value: 1.3205266473999875 and parameters: {'subsample': 1, 'n_estimators': 307, 'learning_rate': 0.03559648166113955}. Best is trial 3 with value: 1.3205266473999875.
[I 2023-11

{'subsample': 0.8, 'n_estimators': 388, 'learning_rate': 0.038863229772208976}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 447
[LightGBM] [Info] Number of data points in the train set: 7784, number of used features: 7
[LightGBM] [Info] Start training from score 3.371274
농업경영체별 평가지표 결과
MAE: 1.24
MSE: 2.98
RMSE: 1.72
RMSPE: 127.16%
----------------------------------------------------------------------------------------------------
배정신청인원 10명 이상 시군구 단위 평가지표
MAE: 7.91
MSE: 129.02
RMSE: 11.36
RMSPE: 20.72%
