In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
train = pd.read_csv("dataset/income_prediction/train.csv")
test = pd.read_csv("dataset/income_prediction/test.csv")
sample_submission = pd.read_csv("dataset/income_prediction/sample_submission.csv")

In [3]:
train_ohe = pd.get_dummies(train.iloc[:,1:]).astype(int)

In [7]:
correlation = train_ohe.corr()['Income']
# 절댓값이 0.2보다 큰 상관관계를 갖는 컬럼 추출
relevant_columns = correlation[abs(correlation) > 0.2].index.tolist()

print("절댓값이 0.2보다 큰 상관관계를 갖는 컬럼:")
print(relevant_columns)

절댓값이 0.2보다 큰 상관관계를 갖는 컬럼:
['Working_Week (Yearly)', 'Income', 'Education_Status_Children', 'Employment_Status_Not Working', 'Industry_Status_Not in universe or children', 'Occupation_Status_Unknown', 'Martial_Status_Single', 'Household_Status_Child <18 never marr not in subfamily', 'Household_Summary_Child under 18 never married', 'Tax_Status_Married Filling Jointly both under 65 (MFJ)', 'Tax_Status_Nonfiler']


# 모델링

## Tabnet (baseline1)

In [129]:
categorical_vars = train.select_dtypes(include=['object']).columns.tolist()

print(categorical_vars)
print(len(categorical_vars))

['ID', 'Gender', 'Education_Status', 'Employment_Status', 'Industry_Status', 'Occupation_Status', 'Race', 'Hispanic_Origin', 'Martial_Status', 'Household_Status', 'Household_Summary', 'Citizenship', 'Birth_Country', 'Birth_Country (Father)', 'Birth_Country (Mother)', 'Tax_Status', 'Income_Status']
17


In [73]:
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

#import pandas as pd
#import numpy as np
np.random.seed(721)

### 전처리

In [None]:
target = 'Income' #타겟 지정

from sklearn.preprocessing import StandardScaler

# 데이터 타입이 정수인 열만 선택 (타겟을 제외)
int_columns = train.select_dtypes(include=['int']).drop(columns=[target]).columns

# 수치형 열에 대해 표준화 수행
for column in int_columns:
    # 수치형 열에 대한 데이터 추출
    column_data = train[column].values.reshape(-1, 1)  # 행 벡터로 변환하여 스케일러에 입력
    # 표준화 수행
    scaler = StandardScaler()
    standardized_data = scaler.fit_transform(column_data)
    # 결과를 원래의 데이터프레임에 대입
    train[column] = standardized_data.flatten()  # 스케일링된 데이터를 다시 열에 대입

# 결과 확인
print("정규화된 데이터:")
print(train)

target(수치형) 표준화를 위해 평균과 표준편차 미리 저장

In [None]:
t_std = train[target].std()

In [None]:
train[target] = (train[target] - t_mean) / t_std

인코딩

In [139]:
train2 = train.copy()

In [143]:
train_ohe = pd.get_dummies(train2[categorical_vars].iloc[:,1:])
train_ohe= train_ohe.astype(int)

In [144]:
categorical_dims =  {}

for col in categorical_vars:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
#    train[col] = train[col].fillna("VV_likely")
    train2[col] = l_enc.fit_transform(train2[col].values)
    categorical_dims[col] = len(l_enc.classes_)

ID 20000
Gender 2
Education_Status 17
Employment_Status 8
Industry_Status 24
Occupation_Status 15
Race 5
Hispanic_Origin 10
Martial_Status 7
Household_Status 31
Household_Summary 8
Citizenship 5
Birth_Country 43
Birth_Country (Father) 43
Birth_Country (Mother) 43
Tax_Status 6
Income_Status 3


In [145]:
for col in categorical_vars:
    # 기준으로 Income의 평균을 계산하고, 인덱스를 바탕으로 순서를 부여하여 숫자로 인코딩
    encoding_map_col = {category: i for i, category in enumerate(train.groupby(col).mean('Income').sort_values('Income').index)}
    # 인코딩된 값을 새로운 열에 추가
    train[col] = train[col].map(encoding_map_col)
    test[col] = test[col].map(encoding_map_col)

In [112]:
train = pd.concat([train,train_ohe],axis=1)

In [146]:
train

Unnamed: 0,ID,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,...,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status,Income
0,9224,63,1,4,4,4,7,5,4,8,...,3,29,29,32,0,0,0,0,1,425
1,5960,37,1,16,4,52,12,5,4,8,...,3,29,29,32,3,0,0,0,0,0
2,14937,58,0,13,4,52,14,8,1,8,...,3,29,29,32,5,3411,0,0,0,860
3,14861,44,1,13,4,52,5,14,4,8,...,3,29,29,32,3,0,0,0,0,850
4,11397,37,0,13,4,52,5,4,4,8,...,3,29,29,32,4,0,0,0,1,570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,17565,33,1,13,3,52,17,7,4,8,...,3,29,29,32,3,0,0,0,0,1300
19996,14821,20,0,12,4,12,8,8,4,7,...,3,29,15,16,0,0,0,0,0,850
19997,15849,22,1,12,3,52,21,14,4,8,...,3,29,29,32,3,0,0,0,1,999
19998,2385,76,0,13,0,0,1,1,4,8,...,3,29,17,38,3,0,0,0,0,0


In [147]:
unused_feat = ['Set','ID']
target = 'Income'
#타겟과 불필요한 피처를 제외한 학습 피처들 지정 
features = [ col for col in train.columns if col not in unused_feat+[target]] 
#카테고리 피처들의 인덱스를 담아준다
cat_idxs = [ i for i, f in enumerate(features) if f in categorical_vars]
#카테고리 피처들의 차원을 담아준다 (카테고리 클래스 수)
cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_vars]

#하이퍼 파라미터 임베딩 차원
cat_emb_dim = [1, 8, 4, 12, 7, 2, 5, 3, 15, 4, 2, 20, 20, 20, 3, 1]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train[features], train[target], test_size=0.1, random_state=721)

In [148]:
X_train = train[features]
y_train = train[target]

In [149]:
X_train = X_train.values
#y는 차원이 없음
y_train = y_train.values.reshape(-1, 1)

#X_valid = X_valid.values
#y는 차원이 없음
#y_valid = y_valid.values.reshape(-1, 1)

카테고리 변수들 레이블 인코딩 # 참고로 tabnet은 cat_dim, cat_index, 임베딩 차원 이렇게 새 개의 하이퍼파라미터를 가진다.

In [None]:
#if "Set" not in train.columns:
#    train["Set"] = np.random.choice(["train", "valid"], p =[.9, .1], size=(train.shape[0],), random_seed = 721)

#train_indices = train[train.Set=="train"].index
#valid_indices = train[train.Set=="valid"].index

In [None]:
#X_train = train[features].values[train_indices]
#y는 차원이 없음
#y_train = train[target].values[train_indices].reshape(-1, 1)

#X_valid = train[features].values[valid_indices]
#y는 차원이 없음
#y_valid = train[target].values[valid_indices].reshape(-1, 1)

In [150]:
X_train

array([[63,  1,  4, ...,  0,  0,  1],
       [37,  1, 16, ...,  0,  0,  0],
       [58,  0, 13, ...,  0,  0,  0],
       ...,
       [22,  1, 12, ...,  0,  0,  1],
       [76,  0, 13, ...,  0,  0,  0],
       [41,  0, 13, ...,  0,  0,  0]])

In [151]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tnet = TabNetRegressor(cat_idxs=cat_idxs, cat_dims=cat_dims,cat_emb_dim=cat_emb_dim, optimizer_fn=torch.optim.Adam, device_name=device)

max_epochs = 50 #if not os.getenv("CI", False) else 2

tnet.fit(
    X_train=X_train, y_train=y_train,
    #eval_set=(X_train, y_train),
    #eval_name=['train', 'valid'],
    eval_metric=['mse'],
    max_epochs=max_epochs,
    patience=20,
    batch_size=256, virtual_batch_size=32,
    num_workers=0,
    drop_last=False
    #augmentations=aug, #aug
) 

epoch 0  | loss: 731820.10058|  0:00:03s
epoch 1  | loss: 467499.05595|  0:00:06s
epoch 2  | loss: 378012.52415|  0:00:09s
epoch 3  | loss: 368997.3283|  0:00:13s
epoch 4  | loss: 366785.14292|  0:00:16s
epoch 5  | loss: 362109.30085|  0:00:19s
epoch 6  | loss: 357432.28879|  0:00:23s
epoch 7  | loss: 357212.09595|  0:00:26s
epoch 8  | loss: 354441.9205|  0:00:29s
epoch 9  | loss: 354168.82125|  0:00:32s
epoch 10 | loss: 352851.4953|  0:00:36s
epoch 11 | loss: 351078.41495|  0:00:39s
epoch 12 | loss: 350265.70438|  0:00:42s
epoch 13 | loss: 346773.2032|  0:00:46s
epoch 14 | loss: 346741.0312|  0:00:49s
epoch 15 | loss: 346591.33985|  0:00:52s
epoch 16 | loss: 344040.54925|  0:00:56s
epoch 17 | loss: 341569.39738|  0:00:59s
epoch 18 | loss: 341539.32637|  0:01:02s
epoch 19 | loss: 338939.9364|  0:01:06s
epoch 20 | loss: 335862.3104|  0:01:09s
epoch 21 | loss: 334673.54955|  0:01:12s
epoch 22 | loss: 334114.01515|  0:01:15s
epoch 23 | loss: 332302.59488|  0:01:19s
epoch 24 | loss: 330729

저 모델의 특성을 전혀 고려하지 않고 명목형 변수들까지 모두 스케일링 했더니 오류가 났다.
이에 수치형 변수들만 스케일링 해주었더니 오류가 나지 않는다.

메모리 문제로 cuda 오류가 뜰 수도 있다. 재부팅을 해주면 된다

In [None]:
p = tnet.predict(X_valid.values)
rmse = mean_squared_error(p, y_valid, squared=False)
print("RMSE:", rmse)

### 테스트

In [None]:
test.info()

test set은 항상 train set과 세세히 비교해야 한다.

In [None]:
train.columns

In [None]:
test.columns

테스트 데이터도 학습 데이터와 마찬가지로 수치형 피처들만 표준화를 시켜준다. 

In [None]:
# 데이터 타입이 정수인 열만 선택
int_columns = test.select_dtypes(include=['int']).columns

# 선택된 열에 대해 정규화 수행
for column in int_columns:
    # 특정 열에 대한 데이터 추출
    column_data = test[column].values.reshape(-1, 1)  # 행 벡터로 변환하여 스케일러에 입력
    
    # 표준화 수행
    scaler = StandardScaler()
    standardized_data = scaler.fit_transform(column_data)
    
    # 결과를 원래의 데이터프레임에 대입
    test[column] = standardized_data.flatten()  # 스케일링된 데이터를 다시 열에 대입

# 결과 확인
print("정규화된 데이터:")
print(test)

In [None]:
test.isna().sum()#결측치 확인

In [None]:
test['Household_Status'].value_counts() 

In [None]:
for col in categorical_vars:
    print(col, test[col].nunique())
    l_enc = LabelEncoder()
    test[col] = test[col].fillna("Householder") #최빈값
    test[col] = l_enc.fit_transform(test[col].values)
#    categorical_dims[col] = len(l_enc.classes_)

In [None]:
X_test = test[features].values
#y_test = test[target].values.reshape(-1, 1)

In [None]:
pred1 = tnet.predict(X_test)

In [None]:
pred1.shape

In [None]:
pred1[0:]

In [None]:
sample_submission['Income'] = pred1[0:]
submission1 = sample_submission.copy()

In [None]:
submission1

In [None]:
(-0.876546 * t_std) + t_mean

In [None]:
submission1['Income'] = submission1['Income'] * t_std + t_mean

In [None]:
submission1

In [None]:
#저장
submission1.to_csv('/home/sgh/yes/envs/DACON/submission_tabnet.csv',index=False)

In [157]:
fi =  pd.DataFrame(columns=features, data=[tnet.feature_importances_])#차원이 없는 array는 리스트로 만들어서 넣어주기
fi

Unnamed: 0,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,Martial_Status,...,Household_Summary,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status
0,0.0,0.041707,0.071556,0.014855,0.134215,0.046428,0.0,0.0,0.11838,0.11959,...,0.0,0.0,0.0,0.0,7e-06,0.391956,0.0,0.0,8.229627e-09,0.0


In [153]:
top_30_values = fi.iloc[0].nlargest(30)
print(top_30_values)

Tax_Status                3.919563e-01
Working_Week (Yearly)     1.342149e-01
Martial_Status            1.195898e-01
Hispanic_Origin           1.183803e-01
Education_Status          7.155593e-02
Household_Status          6.130665e-02
Industry_Status           4.642804e-02
Gender                    4.170701e-02
Employment_Status         1.485454e-02
Birth_Country (Mother)    6.544678e-06
Dividends                 8.229627e-09
Birth_Country (Father)    0.000000e+00
Losses                    0.000000e+00
Gains                     0.000000e+00
Age                       0.000000e+00
Birth_Country             0.000000e+00
Citizenship               0.000000e+00
Household_Summary         0.000000e+00
Race                      0.000000e+00
Occupation_Status         0.000000e+00
Income_Status             0.000000e+00
Name: 0, dtype: float64


Tax_Status                0.3919563

Working_Week (Yearly)     0.1342149

Martial_Status            0.1195898

Hispanic_Origin           0.1183803

Education_Status          0.0715559

Household_Status          0.0613066

Industry_Status           0.0464280

Gender                    0.0417070

Employment_Status         0.0148545

Birth_Country (Mother)    0.0000065

Dividends                 0.0000000082


In [10]:
int_columns = train.select_dtypes(include=['int']).columns
train[int_columns].corr()

Unnamed: 0,Age,Working_Week (Yearly),Gains,Losses,Dividends,Income
Age,1.0,0.206218,0.044156,0.053384,0.107565,0.124795
Working_Week (Yearly),0.206218,1.0,0.053362,0.056658,0.002209,0.419628
Gains,0.044156,0.053362,1.0,-0.013313,0.081115,0.009555
Losses,0.053384,0.056658,-0.013313,1.0,0.015351,0.036502
Dividends,0.107565,0.002209,0.081115,0.015351,1.0,0.002449
Income,0.124795,0.419628,0.009555,0.036502,0.002449,1.0


TABNET은 기대이하의 성능을 기록함. 데이터 부족 등의 문제로 추정됨

단, 딥 러닝 모델은 트리기반 머신러닝 모델들과 달리 하이퍼 파라미터에 의존적인 결과를 내진 않는다는 점에서 이 변수 중요도가 의미가 있다고 보았다.

## CATBOOST (baseline2)

트리기반 모델을 위한 데이터 재구성

In [260]:
train = pd.read_csv("dataset/income_prediction/train.csv")
test = pd.read_csv("dataset/income_prediction/test.csv")
sample_submission = pd.read_csv("dataset/income_prediction/sample_submission.csv")

중요도가 낮은 변수 제외

In [219]:
train = train[["ID","Tax_Status", "Working_Week (Yearly)", "Martial_Status", "Hispanic_Origin", "Education_Status",
              "Household_Status", "Industry_Status", "Gender" ,"Employment_Status", "Income"]]
test = test[["ID","Tax_Status", "Working_Week (Yearly)", "Martial_Status", "Hispanic_Origin", "Education_Status",
              "Household_Status", "Industry_Status", "Gender" ,"Employment_Status"]]

In [261]:
categorical_vars = train.drop(columns='ID').select_dtypes(include=['object']).columns.tolist()

결측치

In [262]:
test['Household_Status'].fillna("Other Relative 18+ never married Responsible Person of subfamily", inplace=True)

상위 세 특성 전체 원핫인코딩 + 클래스가 적은 특성

In [252]:
train.groupby('Gender').mean('Income')

Unnamed: 0_level_0,Age,Working_Week (Yearly),Gains,Losses,Dividends,Income
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,36.300038,33.327731,179.616979,23.631494,112.389324,491.735103
M,34.898825,36.718409,606.805311,58.414568,135.609572,623.620382


In [263]:
ohe_features = ['Tax_Status','Martial_Status','Hispanic_Origin','Gender']

train_ohe = pd.get_dummies(train[categorical_vars]).astype(int)
train = pd.concat([train,train_ohe],axis=1)

test_ohe = pd.get_dummies(test[categorical_vars]).astype(int)
test = pd.concat([test,test_ohe],axis=1)

In [264]:
# train 데이터와 test 데이터의 피처 이름을 가져옵니다.
train_features = set(train.drop(columns="Income").columns)
test_features = set(test.columns)

# train 데이터에만 있는 피처를 찾습니다.
extra_features_in_train = train_features - test_features

# train 데이터에서 해당 피처를 제거합니다.
train = train.drop(columns=extra_features_in_train)

In [170]:
train_taxstatus = pd.get_dummies(train['Tax_Status']).astype(int)
train = pd.concat([train,train_taxstatus],axis=1)

train_martialstatus = pd.get_dummies(train['Martial_Status']).astype(int)
train = pd.concat([train,train_martialstatus],axis=1)

train_hispanicorigin = pd.get_dummies(train['Hispanic_Origin']).astype(int)
train = pd.concat([train,train_hispanicorigin],axis=1)

train_gender = pd.get_dummies(train['Gender']).astype(int)
train = pd.concat([train,train_gender],axis=1)

In [171]:
test_taxstatus = pd.get_dummies(test['Tax_Status']).astype(int)
test = pd.concat([test,test_taxstatus],axis=1)

test_martialstatus = pd.get_dummies(test['Martial_Status']).astype(int)
test = pd.concat([test,test_martialstatus],axis=1)

test_hispanicorigin = pd.get_dummies(test['Hispanic_Origin']).astype(int)
test = pd.concat([test,test_hispanicorigin],axis=1)

test_gender = pd.get_dummies(test['Gender']).astype(int)
test = pd.concat([test,test_gender],axis=1)

Education_Status (Childern 변수 넣어주기)

In [222]:
# Education_Status가 'Children'인 경우에 해당하는 행의 값을 1로 설정
train['Children'] = train['Education_Status'].apply(lambda x: "child_yes" if x == 'Children' else "child_no")

train2 = pd.get_dummies(train['Children'])
train2 = train2.astype(int)
# 기존 데이터프레임과 원핫인코딩된 데이터프레임을 합칠 수도 있습니다.
train = pd.concat([train, train2], axis=1)

train.drop(columns=["Children"], inplace =True)

test['Children'] = test['Education_Status'].apply(lambda x: "child_yes" if x == 'Children' else "child_no")

test2 = pd.get_dummies(test['Children'])
test2 = test2.astype(int)
# 기존 데이터프레임과 원핫인코딩된 데이터프레임을 합칠 수도 있습니다.
test = pd.concat([test, test2], axis=1)

test.drop(columns=["Children"], inplace =True)

Household_Status(Householder 변수 넣어주기)

In [223]:
# Education_Status가 'Children'인 경우에 해당하는 행의 값을 1로 설정
train['householder'] = train['Household_Status'].apply(lambda x: "householder_yes" if x == 'Householder' else "householder_no")

train2 = pd.get_dummies(train['householder'])
train2 = train2.astype(int)
# 기존 데이터프레임과 원핫인코딩된 데이터프레임을 합칠 수도 있습니다.
train = pd.concat([train, train2], axis=1)

train.drop(columns=["householder"], inplace =True)

# Education_Status가 'Children'인 경우에 해당하는 행의 값을 1로 설정
test['householder'] = test['Household_Status'].apply(lambda x: "householder_yes" if x == 'Householder' else "householder_no")

test2 = pd.get_dummies(test['householder'])
test2 = test2.astype(int)
# 기존 데이터프레임과 원핫인코딩된 데이터프레임을 합칠 수도 있습니다.
test = pd.concat([test, test2], axis=1)

test.drop(columns=["householder"], inplace =True)

Industry_Status(Not in universe or children 변수 넣어주기)

In [224]:
# Employment_Status 'Not_working'인 경우에 해당하는 행의 값을 1로 설정
train['Not_Universe'] = train['Industry_Status'].apply(lambda x: "Not_University_yes" if x == 'Not in universe or children' else "Not_University_no")

train2 = pd.get_dummies(train['Not_Universe'])
train2 = train2.astype(int)
# 기존 데이터프레임과 원핫인코딩된 데이터프레임을 합칠 수도 있습니다.
train = pd.concat([train, train2], axis=1)

train.drop(columns=["Not_Universe"], inplace =True)

# Employment_Status 'Not_working'인 경우에 해당하는 행의 값을 1로 설정
test['Not_Universe'] = test['Industry_Status'].apply(lambda x: "Not_University_yes" if x == 'Not in universe or children' else "Not_University_no")

test2 = pd.get_dummies(test['Not_Universe'])
test2 = test2.astype(int)
# 기존 데이터프레임과 원핫인코딩된 데이터프레임을 합칠 수도 있습니다.
test = pd.concat([test, test2], axis=1)

test.drop(columns=["Not_Universe"], inplace =True)

Employment_Status(Not_Working 변수 넣어주기)

In [225]:
# Employment_Status 'Not_working'인 경우에 해당하는 행의 값을 1로 설정
train['Not_working'] = train['Employment_Status'].apply(lambda x: "Not_working_yes" if x == 'Not Working' else "Not_working_no")

train2 = pd.get_dummies(train['Not_working'])
train2 = train2.astype(int)
# 기존 데이터프레임과 원핫인코딩된 데이터프레임을 합칠 수도 있습니다.
train = pd.concat([train, train2], axis=1)

train.drop(columns=["Not_working"], inplace =True)

# Employment_Status 'Not_working'인 경우에 해당하는 행의 값을 1로 설정
test['Not_working'] = test['Employment_Status'].apply(lambda x: "Not_working_yes" if x == 'Not Working' else "Not_working_no")

test2 = pd.get_dummies(test['Not_working'])
test2 = test2.astype(int)
# 기존 데이터프레임과 원핫인코딩된 데이터프레임을 합칠 수도 있습니다.
test = pd.concat([test, test2], axis=1)

test.drop(columns=["Not_working"], inplace =True)

In [226]:
train.drop(columns=["Gender_F", "householder_no", "child_no", "Not_University_no", "Not_working_no"], inplace =True)
test.drop(columns=["Gender_F", "householder_no", "child_no", "Not_University_no", "Not_working_no"], inplace =True)

타겟 인코딩

In [267]:

target = 'Income'
unused_feat = ['ID']

#타겟과 불필요한 피처를 제외한 학습 피처들 지정 
features = [ col for col in train.columns if col not in unused_feat+[target]] 
#카테고리 피처들의 인덱스를 담아준다
cat_idxs = [ i for i, f in enumerate(features) if f in categorical_vars]

for col in categorical_vars:
    # 기준으로 Income의 평균을 계산하고, 인덱스를 바탕으로 순서를 부여하여 숫자로 인코딩
    encoding_map_col = {category: i for i, category in enumerate(train.groupby(col).mean('Income').sort_values('Income').index)}
    # 인코딩된 값을 새로운 열에 추가
    train[col] = train[col].map(encoding_map_col)
    test[col] = test[col].map(encoding_map_col)

In [280]:
print(features)
print(cat_idxs)

['Age', 'Gender', 'Education_Status', 'Employment_Status', 'Working_Week (Yearly)', 'Industry_Status', 'Occupation_Status', 'Race', 'Hispanic_Origin', 'Martial_Status', 'Household_Status', 'Household_Summary', 'Citizenship', 'Birth_Country', 'Birth_Country (Father)', 'Birth_Country (Mother)', 'Tax_Status', 'Gains', 'Losses', 'Dividends', 'Income_Status', 'Gender_F', 'Gender_M', 'Education_Status_Associates degree (Academic)', 'Education_Status_Associates degree (Vocational)', 'Education_Status_Bachelors degree', 'Education_Status_Children', 'Education_Status_College', 'Education_Status_Doctorate degree', 'Education_Status_Elementary (1-4)', 'Education_Status_Elementary (5-6)', 'Education_Status_High Freshman', 'Education_Status_High Junior', 'Education_Status_High Senior', 'Education_Status_High Sophomore', 'Education_Status_High graduate', 'Education_Status_Kindergarten', 'Education_Status_Masters degree', 'Education_Status_Middle (7-8)', 'Education_Status_Professional degree', 'Emplo

In [230]:
train

Unnamed: 0,ID,Tax_Status,Working_Week (Yearly),Martial_Status,Hispanic_Origin,Education_Status,Household_Status,Industry_Status,Gender,Employment_Status,...,Hispanic_Origin_ Mexican (Mexicano),Hispanic_Origin_ Mexican-American,Hispanic_Origin_ NA,Hispanic_Origin_ Other Spanish,Hispanic_Origin_ Puerto Rican,Gender_M,child_yes,householder_yes,Not_University_yes,Not_working_yes
0,TRAIN_00000,0,4,4,8,4,27,7,1,4,...,0,0,0,0,0,1,0,1,0,0
1,TRAIN_00001,3,52,3,8,16,22,12,1,4,...,0,0,0,0,0,1,0,0,0,0
2,TRAIN_00002,5,52,4,8,13,27,14,0,4,...,0,0,0,0,0,0,0,1,0,0
3,TRAIN_00003,3,52,5,8,13,22,5,1,4,...,0,0,0,0,0,1,0,0,0,0
4,TRAIN_00004,4,52,5,8,13,27,5,0,4,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,TRAIN_19995,3,52,1,8,13,22,17,1,3,...,0,0,0,0,0,1,0,0,0,0
19996,TRAIN_19996,0,12,1,7,12,16,8,0,4,...,0,1,0,0,0,0,0,0,0,0
19997,TRAIN_19997,3,52,1,8,12,8,21,1,3,...,0,0,0,0,0,1,0,0,0,0
19998,TRAIN_19998,3,0,0,8,13,22,1,0,0,...,0,0,0,0,0,0,0,0,1,1


In [179]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 63 columns):
 #   Column                                                               Non-Null Count  Dtype
---  ------                                                               --------------  -----
 0   Tax_Status                                                           20000 non-null  int64
 1   Working_Week (Yearly)                                                20000 non-null  int64
 2   Martial_Status                                                       20000 non-null  int64
 3   Hispanic_Origin                                                      20000 non-null  int64
 4   Education_Status                                                     20000 non-null  int64
 5   Household_Status                                                     20000 non-null  int64
 6   Industry_Status                                                      20000 non-null  int64
 7   Gender                

In [269]:
len(features)

285

In [270]:
X_train1 = train[features]
y_train1 = train[target]

In [271]:
X_test1 = test[features]

In [279]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Columns: 287 entries, ID to Income_Status_Unknown
dtypes: int64(286), object(1)
memory usage: 43.8+ MB


In [281]:
cat_idxs

[1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20]

In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
#import numpy as np
from sklearn.model_selection import cross_val_score


# CatBoostRegressor 모델 생성
cb_model= CatBoostRegressor(iterations=1000, learning_rate=0.01, depth=12, cat_features=cat_idxs)

# 모델 훈련 및 교차 검증
scores = cross_val_score(cb_model, X_train1, y_train1, cv=5, scoring='neg_mean_squared_error')

# 교차 검증 점수 출력
print("Cross-Validation Scores (MSE):", -scores)

# 평균 교차 검증 점수 계산
mean_score = -scores.mean()
print("Mean Cross-Validation Score (MSE):", mean_score)

# RMSE로 변환
rmse_scores = np.sqrt(-scores)
print("Cross-Validation Scores (RMSE):", rmse_scores)

# 평균 RMSE 계산
mean_rmse = rmse_scores.mean()
print("Mean Cross-Validation RMSE:", mean_rmse)

In [92]:
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
#import numpy as np
from sklearn.model_selection import cross_val_score

# Objective 함수 정의
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 1, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 15.0),
        'border_count': trial.suggest_int('border_count', 32, 400),
        'random_strength': trial.suggest_loguniform('random_strength', 0.01, 15.0),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 120.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'od_type': 'Iter',
        'od_wait': 20,
        'verbose': False
    }

    # 모델 훈련
    cb_model = CatBoostRegressor(**params,cat_features=cat_idxs)
    scores = cross_val_score(cb_model, X_train1, y_train1, cv=5, scoring='neg_mean_squared_error')
    
    # 각 fold의 평균 제곱근 오차(RMSE) 계산
    rmse_scores = np.sqrt(-scores)
    avg_rmse = rmse_scores.mean()
    return avg_rmse

# Optuna 스터디 설정 및 실행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# 결과 출력
print("Best trial:")
best_trial = study.best_trial
print("  Value: ", best_trial.value)
print("  Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-04-05 16:38:58,808] A new study created in memory with name: no-name-00369600-4251-49b6-8a08-83b93f90881e
[I 2024-04-05 16:39:03,643] Trial 0 finished with value: 608.0850035883557 and parameters: {'iterations': 880, 'learning_rate': 0.020972064514793862, 'depth': 1, 'l2_leaf_reg': 0.034616597133619885, 'border_count': 147, 'random_strength': 3.0439887625052218, 'bagging_temperature': 0.406249129731437, 'min_data_in_leaf': 50}. Best is trial 0 with value: 608.0850035883557.
[I 2024-04-05 16:39:23,455] Trial 1 finished with value: 602.1617864353232 and parameters: {'iterations': 401, 'learning_rate': 0.012378064840970681, 'depth': 10, 'l2_leaf_reg': 0.5558545555924262, 'border_count': 307, 'random_strength': 0.6015356373099148, 'bagging_temperature': 11.629770981957538, 'min_data_in_leaf': 98}. Best is trial 1 with value: 602.1617864353232.
[I 2024-04-05 16:39:25,299] Trial 2 finished with value: 604.2278218855312 and parameters: {'iterations': 110, 'learning_rate': 0.0434426594

Best trial:
  Value:  601.6519538748784
  Params: 
    iterations: 715
    learning_rate: 0.018878225933021422
    depth: 9
    l2_leaf_reg: 0.0040812839053198675
    border_count: 371
    random_strength: 1.8138146013086647
    bagging_temperature: 0.05334680309520758
    min_data_in_leaf: 9


In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
#import numpy as np
from sklearn.model_selection import cross_val_score
# CatBoostRegressor 모델훈련
cb_model = CatBoostRegressor(**best_trial.params, cat_features=cat_idxs, loss_function='RMSE')
cb_model.fit(X_train1, y_train1)

In [None]:
# 변수 중요도를 가져오기
feature_importance = cb_model.feature_importances_

# 각 변수의 중요도 출력
for i, importance in enumerate(feature_importance):
    print(f"Feature {i}: Importance = {importance}")

cat_feature들을 지정해주니까 오류가 풀렸다. cat_boost애들도 카테고리 피처를 전달해줘야한다.

cross_val_score는 내부적으로 학습기를 학습, 예측, 평가까지 자동으로 해주는거다. !!모델 평가용이지 엄연히 학습은 아니라는 것을 기억해라

pred함수를 쓰려면 fit이 필요하다

In [None]:
#이건 fit을 해줘야 한다.

#검증 세트에 대한 예측
#y_pred1 = cb_model.predict(X_valid1)

#검증 세트에 대한 RMSE 계산
#test_rmse = np.sqrt(mean_squared_error(y_valid1, y_pred1))
#print("Valid Set RMSE:", test_rmse)

In [None]:
p = cb_model.predict(X_valid)
rmse = mean_squared_error(p, y_valid, squared=False)
print("RMSE:", rmse)

## XGBOOST (baseline3)

In [285]:
new_columns = [col.replace('(', '').replace(')', '').replace('<', '') for col in X_train1.columns]

In [287]:
X_train1.columns = new_columns
X_test1.columns = new_columns

In [None]:
import xgboost as xgb
# 하이퍼파라미터 설정

# XGBoost 데이터 객체 생성
dtrain = xgb.DMatrix(X_train1, label=y_train1, feature_names=new_columns)

params = {
    'objective': 'reg:squarederror',  # 회귀 문제 설정
    'max_depth': 6,# 트리의 최대 깊이
    'n_estimators':1000,
    'learning_rate': 0.1,  # 학습률
    'gamma': 0,  # 가지치기에 사용되는 최소 손실 감소량
    'colsample_bytree': 0.8,  # 각 트리를 구성할 때 피처의 비율
    'eval_metric': 'rmse'  # 모델 평가 지표
}

# 모델 학습
num_round = 1000  # 트리의 개수
xgb_model = xgb.train(params, dtrain, num_round)

In [102]:
from xgboost import XGBRegressor
import optuna
# Objective function for Optuna
def objective(trial):
    dtrain = xgb.DMatrix(X_train1, label=y_train1, feature_names=new_columns)
    
    # XGBoost 하이퍼파라미터 탐색 공간 설정
    param = {
        'objective': 'reg:squarederror',
        'booster': 'gbtree',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, min(300, len(X_train1))),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'eta': trial.suggest_float('eta', 0.1, 0.3),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
    }
    
    # Train XGBoost model
    xgb_model = XGBRegressor(**param)#, early_stopping_rounds=50)
    scores = cross_val_score(xgb_model, X_train1, y_train1, cv=5, scoring='neg_mean_squared_error')
    
    # 각 fold의 평균 제곱근 오차(RMSE) 계산
    rmse_scores = np.sqrt(-scores)
    avg_rmse = rmse_scores.mean()
    return avg_rmse
    

# Create study object and optimize hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Print best parameters and best score
print("Best parameters:", study.best_params)
print("Best RMSE:", study.best_value)

[I 2024-04-05 17:11:14,617] A new study created in memory with name: no-name-5cc3fd81-770d-4264-9460-780530231749
[I 2024-04-05 17:11:15,683] Trial 0 finished with value: 600.7842550133098 and parameters: {'lambda': 0.4744355760547249, 'alpha': 6.852729405889357e-05, 'subsample': 0.6262911254091028, 'colsample_bytree': 0.8225797781419424, 'min_child_weight': 221, 'max_depth': 8, 'learning_rate': 0.09213557792301795, 'n_estimators': 100, 'gamma': 0.37814639537143935, 'eta': 0.20808122527816664, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 600.7842550133098.
[I 2024-04-05 17:11:19,143] Trial 1 finished with value: 599.8131402822304 and parameters: {'lambda': 3.2249719097029904e-06, 'alpha': 3.642898528406798e-06, 'subsample': 0.5400603894579123, 'colsample_bytree': 0.6841676043405005, 'min_child_weight': 70, 'max_depth': 13, 'learning_rate': 0.0114856463479101, 'n_estimators': 400, 'gamma': 0.6909872714603084, 'eta': 0.1338747066307574, 'grow_policy': 'depthwise'}. Best is tr

Best parameters: {'lambda': 0.2915478685381946, 'alpha': 5.617391259206513e-07, 'subsample': 0.8845179246850354, 'colsample_bytree': 0.917942397684142, 'min_child_weight': 222, 'max_depth': 7, 'learning_rate': 0.023034069401131108, 'n_estimators': 200, 'gamma': 0.2685525681231128, 'eta': 0.1959368642299215, 'grow_policy': 'lossguide'}
Best RMSE: 599.31341553408


## LGBM (baseline4)

In [282]:
from lightgbm import LGBMRegressor
import optuna
# Objective function for Optuna
def objective(trial):
    
    # XGBoost 하이퍼파라미터 탐색 공간 설정
    lgbm_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100,1500),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 5,30),  # num_leaves를 증가시킴
        'max_depth': trial.suggest_int('max_depth', 3, 30),  
        'min_child_samples': trial.suggest_int("min_child_samples", 10,50),  # min_child_samples를 감소시킴
        'subsample': 0.8,
        'colsample_bytree': 0.8
    }
    
    # Train XGBoost model
    lgbm_model = LGBMRegressor(**lgbm_params)#, early_stopping_rounds=50)
    scores = cross_val_score(lgbm_model, X_train1, y_train1, cv=5, scoring='neg_mean_squared_error')
    
    # 각 fold의 평균 제곱근 오차(RMSE) 계산
    rmse_scores = np.sqrt(-scores)
    avg_rmse = rmse_scores.mean()
    return avg_rmse
    

# Create study object and optimize hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Print best parameters and best score
print("Best parameters:", study.best_params)
print("Best RMSE:", study.best_value)

[I 2024-04-05 21:35:05,452] A new study created in memory with name: no-name-d27dbe57-3776-4fd5-a0c9-6c3684ebca9d


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001878 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1185
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 229
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001865 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1190
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 234
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001692 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:08,233] Trial 0 finished with value: 596.0974978529483 and parameters: {'n_estimators': 858, 'learning_rate': 0.03602258055865951, 'num_leaves': 16, 'max_depth': 30, 'min_child_samples': 18}. Best is trial 0 with value: 596.0974978529483.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001778 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1217
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 245
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001776 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1218
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 248
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:13,570] Trial 1 finished with value: 594.4239400989571 and parameters: {'n_estimators': 1428, 'learning_rate': 0.012464464895373733, 'num_leaves': 22, 'max_depth': 7, 'min_child_samples': 12}. Best is trial 1 with value: 594.4239400989571.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001243 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1117
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 195
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1110
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 194
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001226 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:16,403] Trial 2 finished with value: 592.2949575293947 and parameters: {'n_estimators': 958, 'learning_rate': 0.029181031501373498, 'num_leaves': 13, 'max_depth': 14, 'min_child_samples': 33}. Best is trial 2 with value: 592.2949575293947.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 201
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001556 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1124
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 201
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001245 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:18,693] Trial 3 finished with value: 590.130596315926 and parameters: {'n_estimators': 695, 'learning_rate': 0.012849047132577266, 'num_leaves': 13, 'max_depth': 25, 'min_child_samples': 30}. Best is trial 3 with value: 590.130596315926.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001520 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1143
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 208
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001551 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1134
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 206
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001540 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:22,885] Trial 4 finished with value: 611.0678761808899 and parameters: {'n_estimators': 1069, 'learning_rate': 0.07910615026476749, 'num_leaves': 23, 'max_depth': 24, 'min_child_samples': 27}. Best is trial 3 with value: 590.130596315926.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1237
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 255
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002075 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1232
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 255
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:25,250] Trial 5 finished with value: 593.9417029947786 and parameters: {'n_estimators': 1240, 'learning_rate': 0.025328700682678756, 'num_leaves': 7, 'max_depth': 26, 'min_child_samples': 11}. Best is trial 3 with value: 590.130596315926.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1135
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 204
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001531 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1128
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 203
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001514 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:28,158] Trial 6 finished with value: 590.75210866014 and parameters: {'n_estimators': 1491, 'learning_rate': 0.010960976396210942, 'num_leaves': 7, 'max_depth': 22, 'min_child_samples': 29}. Best is trial 3 with value: 590.130596315926.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001295 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 184
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001204 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1088
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 183
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001210 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:29,810] Trial 7 finished with value: 597.0558240404395 and parameters: {'n_estimators': 389, 'learning_rate': 0.09500979812161602, 'num_leaves': 20, 'max_depth': 27, 'min_child_samples': 41}. Best is trial 3 with value: 590.130596315926.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001653 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1175
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 224
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1176
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 227
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001526 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:35,140] Trial 8 finished with value: 595.823902998596 and parameters: {'n_estimators': 1153, 'learning_rate': 0.018960135056195763, 'num_leaves': 28, 'max_depth': 11, 'min_child_samples': 20}. Best is trial 3 with value: 590.130596315926.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001370 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1147
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 210
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001385 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1146
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 212
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001594 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:35,991] Trial 9 finished with value: 591.1770738221139 and parameters: {'n_estimators': 183, 'learning_rate': 0.01865484828687386, 'num_leaves': 10, 'max_depth': 10, 'min_child_samples': 25}. Best is trial 3 with value: 590.130596315926.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001523 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 176
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001478 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 178
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001401 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:38,032] Trial 10 finished with value: 592.6477488968742 and parameters: {'n_estimators': 584, 'learning_rate': 0.04984312231084754, 'num_leaves': 15, 'max_depth': 19, 'min_child_samples': 48}. Best is trial 3 with value: 590.130596315926.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001269 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1109
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 191
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1108
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 193
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001547 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] To

[I 2024-04-05 21:35:39,419] Trial 11 finished with value: 591.6626687142767 and parameters: {'n_estimators': 652, 'learning_rate': 0.011384405488839613, 'num_leaves': 5, 'max_depth': 21, 'min_child_samples': 35}. Best is trial 3 with value: 590.130596315926.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 190
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1102
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 190
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:43,098] Trial 12 finished with value: 590.0182127634623 and parameters: {'n_estimators': 1485, 'learning_rate': 0.010240190803108723, 'num_leaves': 10, 'max_depth': 18, 'min_child_samples': 36}. Best is trial 12 with value: 590.0182127634623.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001235 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 186
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001217 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 184
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001622 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:45,161] Trial 13 finished with value: 590.0664117573798 and parameters: {'n_estimators': 688, 'learning_rate': 0.015971424450673767, 'num_leaves': 11, 'max_depth': 18, 'min_child_samples': 40}. Best is trial 12 with value: 590.0182127634623.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 186
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001577 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1090
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 184
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:46,434] Trial 14 finished with value: 589.9466489791259 and parameters: {'n_estimators': 360, 'learning_rate': 0.01811923360050044, 'num_leaves': 10, 'max_depth': 17, 'min_child_samples': 40}. Best is trial 14 with value: 589.9466489791259.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001369 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1075
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 174
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1072
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 175
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:47,423] Trial 15 finished with value: 590.2431497126254 and parameters: {'n_estimators': 299, 'learning_rate': 0.02144369324469485, 'num_leaves': 8, 'max_depth': 15, 'min_child_samples': 50}. Best is trial 14 with value: 589.9466489791259.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 177
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001215 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 184
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:48,555] Trial 16 finished with value: 591.5730581424912 and parameters: {'n_estimators': 462, 'learning_rate': 0.015027523761432807, 'num_leaves': 19, 'max_depth': 3, 'min_child_samples': 42}. Best is trial 14 with value: 589.9466489791259.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1097
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 183
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001269 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 190
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001709 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:49,242] Trial 17 finished with value: 589.9659134190235 and parameters: {'n_estimators': 121, 'learning_rate': 0.04183270658248735, 'num_leaves': 11, 'max_depth': 13, 'min_child_samples': 36}. Best is trial 14 with value: 589.9466489791259.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1117
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 193
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001252 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1093
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 183
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:50,168] Trial 18 finished with value: 589.7000472443558 and parameters: {'n_estimators': 195, 'learning_rate': 0.04433220393462973, 'num_leaves': 13, 'max_depth': 12, 'min_child_samples': 43}. Best is trial 18 with value: 589.7000472443558.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001402 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 179
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001403 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1082
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 180
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001392 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:51,526] Trial 19 finished with value: 591.8121215603043 and parameters: {'n_estimators': 288, 'learning_rate': 0.05866001462064412, 'num_leaves': 30, 'max_depth': 7, 'min_child_samples': 46}. Best is trial 18 with value: 589.7000472443558.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001300 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1093
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 183
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001479 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1086
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 182
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001287 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:53,454] Trial 20 finished with value: 593.6961286580134 and parameters: {'n_estimators': 519, 'learning_rate': 0.06379443539737552, 'num_leaves': 17, 'max_depth': 9, 'min_child_samples': 43}. Best is trial 18 with value: 589.7000472443558.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001268 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 190
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 189
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:54,206] Trial 21 finished with value: 589.6280122226228 and parameters: {'n_estimators': 119, 'learning_rate': 0.03814375722306056, 'num_leaves': 14, 'max_depth': 13, 'min_child_samples': 37}. Best is trial 21 with value: 589.6280122226228.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001229 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 186
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001454 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1092
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 185
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001463 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:55,294] Trial 22 finished with value: 589.7875543522875 and parameters: {'n_estimators': 242, 'learning_rate': 0.03699696787428085, 'num_leaves': 14, 'max_depth': 16, 'min_child_samples': 39}. Best is trial 21 with value: 589.6280122226228.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001272 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 184
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001417 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1091
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 182
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:56,168] Trial 23 finished with value: 589.1969590787821 and parameters: {'n_estimators': 164, 'learning_rate': 0.03712068420202609, 'num_leaves': 14, 'max_depth': 12, 'min_child_samples': 45}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001424 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 182
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1091
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 182
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001419 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:56,965] Trial 24 finished with value: 589.341102638211 and parameters: {'n_estimators': 117, 'learning_rate': 0.045560229409910294, 'num_leaves': 18, 'max_depth': 12, 'min_child_samples': 45}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 179
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001421 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1082
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 180
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:57,605] Trial 25 finished with value: 592.3870138120286 and parameters: {'n_estimators': 108, 'learning_rate': 0.029950393361825037, 'num_leaves': 18, 'max_depth': 4, 'min_child_samples': 46}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001281 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1091
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 180
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001456 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 179
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:35:59,341] Trial 26 finished with value: 592.7622308696821 and parameters: {'n_estimators': 400, 'learning_rate': 0.05460883756994296, 'num_leaves': 25, 'max_depth': 8, 'min_child_samples': 46}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001365 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1075
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 174
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001316 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1072
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 175
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:00,547] Trial 27 finished with value: 589.5794745266408 and parameters: {'n_estimators': 288, 'learning_rate': 0.025036038873488195, 'num_leaves': 21, 'max_depth': 5, 'min_child_samples': 50}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 177
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 176
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:02,203] Trial 28 finished with value: 590.1378166740527 and parameters: {'n_estimators': 489, 'learning_rate': 0.02510951931822754, 'num_leaves': 21, 'max_depth': 5, 'min_child_samples': 49}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1091
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 182
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1082
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 180
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:04,770] Trial 29 finished with value: 592.5266494503992 and parameters: {'n_estimators': 815, 'learning_rate': 0.03327866766515565, 'num_leaves': 25, 'max_depth': 6, 'min_child_samples': 45}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001440 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 176
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001308 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 178
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:06,056] Trial 30 finished with value: 592.5769078461994 and parameters: {'n_estimators': 305, 'learning_rate': 0.07352111321592908, 'num_leaves': 16, 'max_depth': 11, 'min_child_samples': 49}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 177
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1091
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 182
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001440 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:06,814] Trial 31 finished with value: 589.2870055211114 and parameters: {'n_estimators': 102, 'learning_rate': 0.03817015602866781, 'num_leaves': 17, 'max_depth': 14, 'min_child_samples': 44}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1097
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 183
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1091
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 182
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001424 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:07,879] Trial 32 finished with value: 589.8601068500748 and parameters: {'n_estimators': 206, 'learning_rate': 0.04548552599985405, 'num_leaves': 18, 'max_depth': 9, 'min_child_samples': 45}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1075
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 174
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1072
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 175
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001131 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:08,790] Trial 33 finished with value: 589.8345956762735 and parameters: {'n_estimators': 112, 'learning_rate': 0.02719553701366066, 'num_leaves': 23, 'max_depth': 15, 'min_child_samples': 50}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001307 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 179
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 178
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:09,890] Trial 34 finished with value: 589.4881921452622 and parameters: {'n_estimators': 216, 'learning_rate': 0.03341957967961951, 'num_leaves': 16, 'max_depth': 14, 'min_child_samples': 47}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001273 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1089
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 179
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001741 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1121
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 197
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001501 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:10,967] Trial 35 finished with value: 589.5608737256878 and parameters: {'n_estimators': 202, 'learning_rate': 0.03324378567861742, 'num_leaves': 17, 'max_depth': 13, 'min_child_samples': 32}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001524 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 199
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1105
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 189
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:13,900] Trial 36 finished with value: 594.4092349101887 and parameters: {'n_estimators': 880, 'learning_rate': 0.040152924671577904, 'num_leaves': 16, 'max_depth': 30, 'min_child_samples': 38}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001362 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1093
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 183
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001439 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1086
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 182
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:18,724] Trial 37 finished with value: 601.7208932012018 and parameters: {'n_estimators': 1337, 'learning_rate': 0.04993187029138462, 'num_leaves': 19, 'max_depth': 16, 'min_child_samples': 43}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001746 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1205
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 239
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001741 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1200
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 239
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001720 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:20,150] Trial 38 finished with value: 591.3830793592213 and parameters: {'n_estimators': 380, 'learning_rate': 0.0315305840377335, 'num_leaves': 13, 'max_depth': 20, 'min_child_samples': 15}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 179
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 178
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001396 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:21,203] Trial 39 finished with value: 591.0817752337225 and parameters: {'n_estimators': 235, 'learning_rate': 0.06720863653538019, 'num_leaves': 15, 'max_depth': 14, 'min_child_samples': 47}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1089
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 179
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001628 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1155
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 214
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001469 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:23,137] Trial 40 finished with value: 592.6344901324337 and parameters: {'n_estimators': 444, 'learning_rate': 0.03547222684386641, 'num_leaves': 19, 'max_depth': 11, 'min_child_samples': 23}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001494 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1121
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 197
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001516 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1118
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 198
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001265 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:24,178] Trial 41 finished with value: 589.8095878847784 and parameters: {'n_estimators': 185, 'learning_rate': 0.03291388271887134, 'num_leaves': 17, 'max_depth': 13, 'min_child_samples': 32}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001383 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 199
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1117
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 195
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001467 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:25,123] Trial 42 finished with value: 590.2499093723628 and parameters: {'n_estimators': 172, 'learning_rate': 0.048224357148228925, 'num_leaves': 15, 'max_depth': 14, 'min_child_samples': 33}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001491 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1129
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 199
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001275 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1091
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 182
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001429 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:26,481] Trial 43 finished with value: 589.4855424415359 and parameters: {'n_estimators': 263, 'learning_rate': 0.028031446079270692, 'num_leaves': 17, 'max_depth': 12, 'min_child_samples': 44}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001450 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1091
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 182
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1086
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 182
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:29,139] Trial 44 finished with value: 590.7841661134262 and parameters: {'n_estimators': 587, 'learning_rate': 0.021159746139269246, 'num_leaves': 20, 'max_depth': 10, 'min_child_samples': 44}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001200 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 184
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1088
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 183
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:30,458] Trial 45 finished with value: 589.8478398379139 and parameters: {'n_estimators': 338, 'learning_rate': 0.02821243819031085, 'num_leaves': 12, 'max_depth': 12, 'min_child_samples': 41}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001405 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1085
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 179
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 178
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001169 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:35,188] Trial 46 finished with value: 593.427868205636 and parameters: {'n_estimators': 1022, 'learning_rate': 0.022745656335370804, 'num_leaves': 23, 'max_depth': 15, 'min_child_samples': 47}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 184
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1088
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 183
[LightGBM] [Info] Start training from score 550.044437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:36,367] Trial 47 finished with value: 590.3450832338731 and parameters: {'n_estimators': 267, 'learning_rate': 0.041196387765038894, 'num_leaves': 14, 'max_depth': 17, 'min_child_samples': 41}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1097
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 183
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001376 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 176
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:37,259] Trial 48 finished with value: 589.8974337182918 and parameters: {'n_estimators': 150, 'learning_rate': 0.053767561112756386, 'num_leaves': 16, 'max_depth': 23, 'min_child_samples': 48}. Best is trial 23 with value: 589.1969590787821.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1087
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 178
[LightGBM] [Info] Start training from score 556.142563
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001229 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1091
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 182
[LightGBM] [Info] Start training from score 554.683937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

[I 2024-04-05 21:36:38,901] Trial 49 finished with value: 590.4180251201333 and parameters: {'n_estimators': 328, 'learning_rate': 0.035346474445711894, 'num_leaves': 20, 'max_depth': 10, 'min_child_samples': 44}. Best is trial 23 with value: 589.1969590787821.


Best parameters: {'n_estimators': 164, 'learning_rate': 0.03712068420202609, 'num_leaves': 14, 'max_depth': 12, 'min_child_samples': 45}
Best RMSE: 589.1969590787821


## RandomForest (baseline5)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 랜덤 포레스트 모델 생성 및 훈련
rf_model = RandomForestRegressor(n_estimators=600, random_state=721, max_depth=10)
rf_model.fit(X_train1, y_train1)

# 테스트 세트에 대한 예측
y_pred2 = rf_model.predict(X_valid1)

# 모델 성능 평가 (MSE 사용)
mse = mean_squared_error(y_valid1, y_pred2)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", np.sqrt(mse))

In [115]:
from sklearn.ensemble import RandomForestRegressor
# Objective function for Optuna
def objective(trial):

    # Define parameters to search
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 40),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 30),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'random_state': 3489
    }
    
    # Train RandomForest model
    rf_model = RandomForestRegressor(**params)
    rf_model.fit(X_train1, y_train1)
    
    scores = cross_val_score(rf_model, X_train1, y_train1, cv=5, scoring='neg_mean_squared_error')
    
    # 각 fold의 평균 제곱근 오차(RMSE) 계산
    rmse_scores = np.sqrt(-scores)
    avg_rmse = rmse_scores.mean()
    return avg_rmse

# Create study object and optimize hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Print best parameters and best score
print("Best parameters:", study.best_params)
print("Best RMSE:", study.best_value)

[I 2024-04-05 17:41:59,074] A new study created in memory with name: no-name-0522ccff-2099-4f33-8777-5969953b0e11
[I 2024-04-05 17:42:20,745] Trial 0 finished with value: 603.7440904873903 and parameters: {'n_estimators': 1187, 'max_depth': 8, 'min_samples_split': 24, 'min_samples_leaf': 21, 'max_features': 'sqrt'}. Best is trial 0 with value: 603.7440904873903.
[I 2024-04-05 17:42:44,501] Trial 1 finished with value: 600.5666106792056 and parameters: {'n_estimators': 1020, 'max_depth': 29, 'min_samples_split': 39, 'min_samples_leaf': 14, 'max_features': 'sqrt'}. Best is trial 1 with value: 600.5666106792056.
[I 2024-04-05 17:43:19,757] Trial 2 finished with value: 601.8445465838697 and parameters: {'n_estimators': 1757, 'max_depth': 13, 'min_samples_split': 19, 'min_samples_leaf': 13, 'max_features': 'log2'}. Best is trial 1 with value: 600.5666106792056.
[I 2024-04-05 17:43:31,289] Trial 3 finished with value: 617.129725586132 and parameters: {'n_estimators': 1042, 'max_depth': 3, 'm

Best parameters: {'n_estimators': 1511, 'max_depth': 17, 'min_samples_split': 37, 'min_samples_leaf': 5, 'max_features': 'sqrt'}
Best RMSE: 600.1023736466215


## KNN (baseline6)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# Objective function for Optuna
def objective(trial):

    # Define parameters to search
    knn_params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 3, 30),
        'weights': 'distance',  # uniform에서 distance로 변경
        'algorithm': 'auto'
    }
    
    # Train RandomForest model
    rf_model = KNeighborsRegressor(**knn_params)
    rf_model.fit(X_train1, y_train1)
    
    scores = cross_val_score(rf_model, X_train1, y_train1, cv=5, scoring='neg_mean_squared_error')
    
    # 각 fold의 평균 제곱근 오차(RMSE) 계산
    rmse_scores = np.sqrt(-scores)
    avg_rmse = rmse_scores.mean()
    return avg_rmse

# Create study object and optimize hyperparameters
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Print best parameters and best score
print("Best parameters:", study.best_params)
print("Best RMSE:", study.best_value)

# Stacking_Regressor

In [124]:
cat_idxs

[0, 2, 3, 4, 5, 6, 7, 8]

In [126]:
catb_params = {'iterations': 715, 'learning_rate': 0.018878225933021422,
               'depth': 9, 'l2_leaf_reg': 0.0040812839053198675, 
               'border_count': 371, 'random_strength': 1.8138146013086647,
               'bagging_temperature': 0.05334680309520758, 'min_data_in_leaf': 9, 'cat_features' : cat_idxs}


lgbm_params = {'n_estimators': 301, 'learning_rate': 0.018423257131070497,
               'num_leaves': 16, 'max_depth': 16, 'min_child_samples': 39}

xgb_params = {'lambda': 0.2915478685381946, 'alpha': 5.617391259206513e-07, 'subsample': 0.8845179246850354,
              'colsample_bytree': 0.917942397684142, 'min_child_weight': 222, 'max_depth': 7, 'learning_rate': 0.023034069401131108,
              'n_estimators': 200, 'gamma': 0.2685525681231128, 'eta': 0.1959368642299215, 'grow_policy': 'lossguide'}

knn_params = {
    'n_neighbors': 30,
    'weights': 'distance',  # uniform에서 distance로 변경
    'algorithm': 'auto'
}

svm_params = {
    'C': 1.0,
    'kernel': 'rbf',
    'gamma': 'auto'  # scale에서 auto로 변경
}

rf_params = {'n_estimators': 1511, 'max_depth': 17, 'min_samples_split': 37, 'min_samples_leaf': 5, 'max_features': 'sqrt'}#study.best_params#{'n_estimators': 1200, 'max_depth': 12,  # max_depth를 증가시킴 'min_samples_split': 5, 'min_samples_leaf': 1,'bootstrap': True}

mlp_params = {
    'hidden_layer_sizes' : (50,25),
    'activation': 'relu',
    'max_iter': 100
}


In [121]:
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge

def objective(trial): 

    # Define hyperparameters to optimize for StackingRegressor
    meta_learner_alpha = trial.suggest_float('meta_learner_alpha', 0.1, 1.0)

    estimators = [
        ('catb', CatBoostRegressor(**catb_params)),
        ('lgbm', LGBMRegressor(**lgbm_params)),
        ('xgb', XGBRegressor(**xgb_params)),
        ('knn', KNeighborsRegressor(**knn_params)),
        ('svm', SVR(**svm_params)),
        ('rf', RandomForestRegressor(**rf_params)),  
        ('rl', LinearRegression()),
        ('mlp', MLPRegressor(**mlp_params))]  

    
    stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=meta_learner_alpha))

    scores = cross_val_score(stacking_regressor, X_train1, y_train1, cv=5, scoring='neg_mean_squared_error')
    
    # 각 fold의 평균 제곱근 오차(RMSE) 계산
    rmse_scores = np.sqrt(-scores)
    avg_rmse = rmse_scores.mean()
    return avg_rmse



# Perform hyperparameter optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# 결과 출력
print("Best trial:")
best_trial = study.best_trial
print("  Value: ", best_trial.value)
print("  Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-04-05 19:06:04,508] A new study created in memory with name: no-name-cf94e208-702a-41fb-8a03-16cf4b23c367


0:	learn: 688.0287955	total: 10.4ms	remaining: 7.45s
1:	learn: 685.2323482	total: 19.4ms	remaining: 6.9s
2:	learn: 682.8490117	total: 23.2ms	remaining: 5.51s
3:	learn: 680.3039720	total: 29.8ms	remaining: 5.3s
4:	learn: 677.9517415	total: 32.7ms	remaining: 4.64s
5:	learn: 675.6824936	total: 36.4ms	remaining: 4.3s
6:	learn: 673.1997666	total: 45.3ms	remaining: 4.58s
7:	learn: 671.0454033	total: 51ms	remaining: 4.51s
8:	learn: 668.5761725	total: 61.1ms	remaining: 4.79s
9:	learn: 666.4517127	total: 71.9ms	remaining: 5.07s
10:	learn: 663.9774640	total: 82.9ms	remaining: 5.3s
11:	learn: 661.6977549	total: 91ms	remaining: 5.33s
12:	learn: 659.2975612	total: 101ms	remaining: 5.45s
13:	learn: 657.0668292	total: 111ms	remaining: 5.57s
14:	learn: 655.1720986	total: 116ms	remaining: 5.41s
15:	learn: 653.0479264	total: 125ms	remaining: 5.44s
16:	learn: 651.0979425	total: 131ms	remaining: 5.38s
17:	learn: 649.2248233	total: 135ms	remaining: 5.21s
18:	learn: 647.3922370	total: 142ms	remaining: 5.22s

[W 2024-04-05 19:08:28,907] Trial 0 failed with parameters: {'meta_learner_alpha': 0.46286732600893254} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/sgh/yes/envs/DACON/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_2595/2316491175.py", line 31, in objective
    scores = cross_val_score(stacking_regressor, X_train1, y_train1, cv=5, scoring='neg_mean_squared_error')
  File "/home/sgh/yes/envs/DACON/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/sgh/yes/envs/DACON/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 719, in cross_val_score
    cv_results = cross_validate(
  File "/home/sgh/yes/envs/DACON/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/

KeyboardInterrupt: 

In [283]:
catb_params = {
    'iterations':600,
    'learning_rate':0.01,
    'depth':12,
    'cat_features':cat_idxs
}


lgbm_params = {'n_estimators': 164, 
               'learning_rate': 0.03712068420202609, 
               'num_leaves': 14, 'max_depth': 12, 'min_child_samples': 45}

xgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.05,  # learning_rate를 감소시킴
    'max_depth': 13,  # max_depth를 증가시킴
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

knn_params = {
    'n_neighbors': 30,
    'weights': 'distance',  # uniform에서 distance로 변경
    'algorithm': 'auto'
}

svm_params = {
    'C': 1.0,
    'kernel': 'rbf',
    'gamma': 'auto'  # scale에서 auto로 변경
}

rf_params = {
    'n_estimators': 1000,
    'max_depth': 12,  # max_depth를 증가시킴
    'min_samples_split': 5,
    'min_samples_leaf': 1,
    'bootstrap': True
}

mlp_params = {
    'hidden_layer_sizes' : (50,25),
    'activation': 'relu',
    'max_iter': 100
}

In [288]:
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

#np.random.seed(721)

# 개별 모델 정의
estimators = [
    ('catb', CatBoostRegressor(**catb_params)),
    ('lgbm', LGBMRegressor(**lgbm_params)),
    ('xgb', XGBRegressor(**xgb_params)),
    ('knn', KNeighborsRegressor(**knn_params)),
    ('svm', SVR(**svm_params)),
    ('rf', RandomForestRegressor(**rf_params, random_state = 721)),  
    ('rl', LinearRegression()),
    ('mlp', MLPRegressor(**mlp_params))]  

    
# 스태킹 모델 정의
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=None)

# 모델 훈련
stacking_regressor.fit(X_train1, y_train1)

# 테스트 데이터에 대한 예측
#y_pred = stacking_regressor.predict(X_test1)

0:	learn: 699.7100018	total: 78.7ms	remaining: 47.1s
1:	learn: 698.0160109	total: 154ms	remaining: 46.1s
2:	learn: 696.2220779	total: 228ms	remaining: 45.3s
3:	learn: 694.3958050	total: 301ms	remaining: 44.9s
4:	learn: 692.6578895	total: 382ms	remaining: 45.4s
5:	learn: 690.8816695	total: 450ms	remaining: 44.5s
6:	learn: 689.2070684	total: 530ms	remaining: 44.9s
7:	learn: 687.6386213	total: 543ms	remaining: 40.2s
8:	learn: 685.9636040	total: 615ms	remaining: 40.4s
9:	learn: 684.2988223	total: 691ms	remaining: 40.8s
10:	learn: 682.6082251	total: 772ms	remaining: 41.3s
11:	learn: 681.0878519	total: 787ms	remaining: 38.6s
12:	learn: 679.4978822	total: 858ms	remaining: 38.7s
13:	learn: 677.8680559	total: 937ms	remaining: 39.2s
14:	learn: 676.3574275	total: 1.02s	remaining: 39.9s
15:	learn: 674.9072368	total: 1.1s	remaining: 40s
16:	learn: 673.6825310	total: 1.1s	remaining: 37.9s
17:	learn: 672.1815500	total: 1.18s	remaining: 38s
18:	learn: 670.7744444	total: 1.26s	remaining: 38.4s
19:	lear

In [289]:
# 테스트 데이터에 대한 예측
y_pred = stacking_regressor.predict(X_test1)

sample_submission['Income'] = y_pred
submission = sample_submission.copy()

submission.to_csv('/home/sgh/yes/envs/DACON/stacking_04_05(3).csv',index=False)

In [292]:
import os
import joblib

# 모델 파일 경로
file_path = "/home/sgh/yes/envs/DACON/models/stacking_regressor_4_05(3)_sota.pkl"

# 모델 저장
joblib.dump(stacking_regressor, file_path)

['/home/sgh/yes/envs/DACON/models/stacking_regressor_4_05(3)_sota.pkl']

In [291]:
test.columns

Index(['ID', 'Age', 'Gender', 'Education_Status', 'Employment_Status',
       'Working_Week (Yearly)', 'Industry_Status', 'Occupation_Status', 'Race',
       'Hispanic_Origin',
       ...
       'Birth_Country (Mother)_Yugoslavia',
       'Tax_Status_Head of Household (HOH)',
       'Tax_Status_Married Filling Jointly both over 65 (MFJ)',
       'Tax_Status_Married Filling Jointly both under 65 (MFJ)',
       'Tax_Status_Married Filling Jointly one over 65 & one under 65 (MFJ)',
       'Tax_Status_Nonfiler', 'Tax_Status_Single', 'Income_Status_Over Median',
       'Income_Status_Under Median', 'Income_Status_Unknown'],
      dtype='object', length=286)

In [290]:
child_yes_1_ids = test.loc[test['Childern'] == 1, 'ID'].values
len(child_yes_1_ids)
Not_working_yes_ids = test.loc[test['Not_working_yes'] == 1, 'ID'].values
len(Not_working_yes_ids)
Not_in_universe_or_children = test.loc[test['Not_University_yes'] == 1, 'ID'].values 
len(Not_in_universe_or_children)


KeyError: 'child_yes'

In [None]:
sota = pd.read_csv("/home/sgh/yes/envs/DACON/stacking_04_05(3).csv")

# child_yes_1에 있는 ID에 해당하는 행을 sota 데이터프레임에서 찾아서 Income 값을 0으로 변경
sota.loc[sota['ID'].isin(child_yes_1_ids), 'Income'] = 0
sota.loc[sota['ID'].isin(Not_working_yes_ids), 'Income'] = 0
sota.loc[sota['ID']=='TEST_8902', 'Income'] = 0
sota.loc[sota['ID'].isin(Not_in_universe_or_children), 'Income'] = 0
sota.to_csv('/home/sgh/yes/envs/DACON/stacking_ridge_4_05(3).csv',index=False)

In [249]:
train.iloc[:,1:].corr()[target]

Tax_Status                                                             0.357877
Working_Week (Yearly)                                                  0.419628
Martial_Status                                                         0.223206
Hispanic_Origin                                                        0.076279
Education_Status                                                       0.318630
Household_Status                                                       0.303402
Industry_Status                                                        0.458388
Gender                                                                 0.093893
Employment_Status                                                      0.259920
Income                                                                 1.000000
Tax_Status_Head of Household (HOH)                                     0.044177
Tax_Status_Married Filling Jointly both over 65 (MFJ)                 -0.077625
Tax_Status_Married Filling Jointly both 

In [None]:
import optuna
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error

# Load dataset1

def objective(trial):
    # Define hyperparameters to optimize for RandomForestRegressor
    catb_params = {
        'iterations':trial.suggest_int('iterations', 100, 1000),
        'learning_rate':trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 1, 10),
        'cat_features':cat_index,
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 15.0),
        'border_count': trial.suggest_int('border_count', 32, 400),
        'random_strength': trial.suggest_loguniform('random_strength', 0.01, 15.0),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 120.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'od_type': 'Iter',
        'od_wait': 20,
        'verbose': False
    }


    lgbm_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100,1500),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 5,30),  # num_leaves를 증가시킴
        'max_depth': trial.suggest_int('max_depth', 3, 30),  
        'min_child_samples': trial.suggest_int("min_child_samples", 10,50),  # min_child_samples를 감소시킴
        'subsample': 0.8,
        'colsample_bytree': 0.8
    }

    xgb_params = {
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, min(300, len(X_train))),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'eta': trial.suggest_float('eta', 0.1, 0.3),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
    }

    knn_params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 3, 30),
        'weights': 'distance',  # uniform에서 distance로 변경
        'algorithm': 'auto'
    }

    svm_params = {
        'C': 1.0,
        'kernel': 'rbf',
        'gamma': 'auto'  # scale에서 auto로 변경
    }

    

    mlp_params = {
        'hidden_layer_sizes' : (25,100),
        'activation': 'relu',
        'max_iter': trial.suggest_int('max_iter', 50, 300)
    }
    
    # Define hyperparameters to optimize for Ridge
    ridge_alpha = trial.suggest_loguniform('ridge_alpha', 0.001, 10)

    # Define hyperparameters to optimize for StackingRegressor
    meta_learner_alpha = trial.suggest_float('meta_learner_alpha', 0.1, 1.0)

    estimators = [
        ('catb', CatBoostRegressor(**catb_params)),
        ('lgbm', LGBMRegressor(**lgbm_params)),
        ('xgb', XGBRegressor(**xgb_params)),
        ('knn', KNeighborsRegressor(**knn_params)),
        ('svm', SVR(**svm_params)),
        ('rf', RandomForestRegressor(**rf_params)),  
        ('rl', LinearRegression()),
        ('mlp', MLPRegressor(**mlp_params))]  

    
    stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=meta_learner_alpha))

    scores = cross_val_score(stacking_regressor, X_train1, y_train1, cv=5, scoring='neg_mean_squared_error')
    
    # 각 fold의 평균 제곱근 오차(RMSE) 계산
    rmse_scores = np.sqrt(-scores)
    avg_rmse = rmse_scores.mean()
    return avg_rmse



# Perform hyperparameter optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# 결과 출력
print("Best trial:")
best_trial = study.best_trial
print("  Value: ", best_trial.value)
print("  Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))