In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.classification import *

In [2]:
train = pd.read_csv("/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/train.csv")
test = pd.read_csv("/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/test.csv")
sub = pd.read_csv("/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/sample_submission.csv")

In [None]:
train.info()

In [None]:
test.info()

In [None]:
sub.info()

In [None]:
train.head()

In [None]:
train.describe()

In [None]:
# 'dental caries', 'Urine protein', 'hearing(left)', 'hearing(right)' 컬럼의 데이터 타입을 'object'로 변경합니다.
cols_to_change = ['dental caries', 'Urine protein', 'hearing(left)', 'hearing(right)']
for col in cols_to_change:
    train[col] = train[col].astype('object')
    test[col] = test[col].astype('object')


In [None]:
# 컬럼별로 분포 확인
for column in train.columns:
    if column != 'smoking':
        plt.figure(figsize=(10,5))
        sns.kdeplot(data=train, x=column, hue="smoking", fill=True)
        plt.title(column)
        plt.show()


In [None]:
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

train = pd.get_dummies(train, columns=cols_to_change)
test = pd.get_dummies(test, columns=cols_to_change)

In [None]:
train.info()

In [None]:


# setup 함수는 PyCaret의 환경을 설정하는 함수입니다.
clf = setup(data = train, target = 'smoking', normalize = True)


# 모델 비교를 통해 최적의 모델을 찾습니다.
compare_models()


In [None]:
# 가장 성능이 좋은 모델을 선택하여 학습시킵니다.
best_model = create_model('lightgbm')

In [None]:
plot_model(best_model, plot = 'learning')

In [None]:
# 테스트 데이터에 대한 예측을 수행합니다.
predictions = predict_model(best_model, data = test)

# 예측 결과를 확인합니다.
print(predictions.head())

In [None]:
# 예측 결과를 'sub' 데이터프레임의 'smoking' 컬럼에 저장합니다.
sub['smoking'] = predictions['prediction_label']

In [None]:
# 'sub' 데이터프레임을 csv 파일로 저장합니다.
sub.to_csv('submission(lightgbm_pycaret).csv',index= False)

In [None]:
train = pd.read_csv("/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/train.csv")
test = pd.read_csv("/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/test.csv")
sub = pd.read_csv("/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/sample_submission.csv")

In [None]:
# 중복된 행의 개수 계산
duplicates = train.duplicated().sum()

# 중복된 행 삭제
train = train.drop_duplicates()

# 삭제된 중복값의 수 출력
print(f"삭제된 중복값의 수: {duplicates}")

In [None]:
# 'dental caries', 'Urine protein', 'hearing(left)', 'hearing(right)' 컬럼의 데이터 타입을 'object'로 변경합니다.
cols_to_change = ['dental caries', 'Urine protein', 'hearing(left)', 'hearing(right)']
for col in cols_to_change:
    train[col] = train[col].astype('object')
    test[col] = test[col].astype('object')


In [None]:

# 숫자형 변수만 선택
numeric_cols = train.select_dtypes(include=[np.number]).columns.tolist()

outliers_indexes = set()

for col in numeric_cols:
    Q1 = train[col].quantile(0.25)
    Q3 = train[col].quantile(0.75)
    IQR = Q3 - Q1

    # IQR 범위를 벗어나는 값을 이상치로 판단
    outliers_condition = (train[col] < (Q1 - 1.5 * IQR)) | (train[col] > (Q3 + 1.5 * IQR))

    # 이상치의 인덱스를 찾아서 outliers_indexes에 추가
    outliers_indexes.update(train[outliers_condition].index)

# 이상치가 있는 행을 제거
train = train.drop(outliers_indexes)


In [None]:
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

train = pd.get_dummies(train, columns=cols_to_change)
test = pd.get_dummies(test, columns=cols_to_change)

In [None]:
from pycaret.classification import *

# setup 함수는 PyCaret의 환경을 설정하는 함수입니다.
clf = setup(data = train, target = 'smoking', normalize = True)


# 모델 비교를 통해 최적의 모델을 찾습니다.
compare_models()


In [None]:
# 가장 성능이 좋은 모델을 선택하여 학습시킵니다.
best_model = create_model('lightgbm')

In [None]:
plot_model(best_model, plot = 'learning')

In [None]:
# 테스트 데이터에 대한 예측을 수행합니다.
predictions = predict_model(best_model, data = test)

# 예측 결과를 확인합니다.
print(predictions.head())

In [None]:
# 예측 결과를 'sub' 데이터프레임의 'smoking' 컬럼에 저장합니다.
sub['smoking'] = predictions['prediction_label']

In [None]:
# 'sub' 데이터프레임을 csv 파일로 저장합니다.
sub.to_csv('submission(lightgbm+outlier).csv',index= False)

In [3]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# X, y 데이터 설정
X = train.drop('smoking', axis=1) # 'target' 열을 제외한 나머지를 독립 변수 X로 설정
y = train['smoking'] # 'target' 열을 종속 변수 y로 설정

def objective(trial):
    # 하이퍼파라미터의 범위를 지정
    param = {
        'objective': 'binary', # 이진 분류 문제
        'verbosity': -1,
        'boosting_type': 'gbdt', # Gradient Boosting Decision Tree
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    # 데이터를 학습용과 검증용으로 분리
    train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.25)

    train_set = lgb.Dataset(train_x, train_y)
    valid_set = lgb.Dataset(valid_x, valid_y)

    gbm = lgb.train(param, train_set, valid_sets=[valid_set], num_boost_round=1000)

    preds = gbm.predict(valid_x)
    auc = roc_auc_score(valid_y, preds)
    return auc

study = optuna.create_study(direction='maximize') # 'maximize'로 설정하여 AUC를 최대화
study.optimize(objective, n_trials=100) # n_trials는 시도할 파라미터 셋의 수를 의미

print('Best trial:')
trial = study.best_trial
print('  Value: ', trial.value)
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


[I 2023-11-24 23:09:54,368] A new study created in memory with name: no-name-d7ed8449-9367-422a-bf3c-3e65b600e60e
[I 2023-11-24 23:10:22,475] Trial 0 finished with value: 0.8642150782068101 and parameters: {'lambda_l1': 4.399965568230971, 'lambda_l2': 7.501115485778749, 'num_leaves': 167, 'feature_fraction': 0.44537235593277796, 'bagging_fraction': 0.5836897446364023, 'bagging_freq': 7, 'min_child_samples': 57}. Best is trial 0 with value: 0.8642150782068101.
[I 2023-11-24 23:10:27,088] Trial 1 finished with value: 0.868235159258979 and parameters: {'lambda_l1': 0.03602234353727245, 'lambda_l2': 0.035428185204244206, 'num_leaves': 17, 'feature_fraction': 0.8729967811289276, 'bagging_fraction': 0.8984053928571574, 'bagging_freq': 4, 'min_child_samples': 96}. Best is trial 1 with value: 0.868235159258979.
[I 2023-11-24 23:10:41,701] Trial 2 finished with value: 0.8583541381254387 and parameters: {'lambda_l1': 0.0004947659264268662, 'lambda_l2': 0.003444076912375832, 'num_leaves': 90, 'fe

Best trial:
  Value:  0.8714565449631217
  Params: 
    lambda_l1: 0.0013754401015119405
    lambda_l2: 0.03725787668558615
    num_leaves: 8
    feature_fraction: 0.5605134405925951
    bagging_fraction: 0.6673721502209349
    bagging_freq: 5
    min_child_samples: 83


In [4]:
import lightgbm as lgb

# 최적화된 하이퍼파라미터 설정
best_params = {
    'objective': 'binary',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'lambda_l1': 0.7295951154367691,
    'lambda_l2': 1.0785212215696376e-06,
    'num_leaves': 12,
    'feature_fraction': 0.8333693677052417,
    'bagging_fraction': 0.7287152936960324,
    'bagging_freq': 1,
    'min_child_samples': 65
}

# LightGBM 데이터셋으로 변환
train_set = lgb.Dataset(train.drop('smoking', axis=1), train['smoking'])

# 모델 학습
gbm = lgb.train(best_params, train_set, num_boost_round=1000)

# test 데이터에 대한 예측 수행
preds = gbm.predict(test)


In [5]:
# 피처 중요도 확인
feature_imp = gbm.feature_importance()

# 피처명과 함께 확인
feature_imp_df = pd.DataFrame({'Feature': gbm.feature_name(), 'Importance': feature_imp})

# 중요도 순으로 정렬하여 출력
feature_imp_df = feature_imp_df.sort_values('Importance', ascending=False)
print(feature_imp_df)


                Feature  Importance
13         triglyceride        1029
21                  Gtp         903
16           hemoglobin         774
15                  LDL         718
12          Cholesterol         692
4             waist(cm)         677
11  fasting_blood_sugar         660
20                  ALT         641
0                    id         595
9              systolic         565
19                  AST         555
14                  HDL         549
10           relaxation         528
1                   age         489
2            height(cm)         364
18     serum_creatinine         312
3            weight(kg)         289
5        eyesight(left)         274
6       eyesight(right)         259
22        dental_caries          58
17        Urine_protein          31
8        hearing(right)          24
7         hearing(left)          14


In [6]:
# 예측 결과를 'sub' 데이터프레임의 'smoking' 컬럼에 저장합니다.
sub['smoking'] = preds

# 'sub' 데이터프레임을 csv 파일로 저장합니다.
sub.to_csv('submission(lightgbm+outlier+optuna).csv',index= False)


In [9]:
train = pd.read_csv("/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/train.csv")
test = pd.read_csv("/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/test.csv")
sub = pd.read_csv("/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/sample_submission.csv")

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159256 entries, 0 to 159255
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   159256 non-null  int64  
 1   age                  159256 non-null  int64  
 2   height(cm)           159256 non-null  int64  
 3   weight(kg)           159256 non-null  int64  
 4   waist(cm)            159256 non-null  float64
 5   eyesight(left)       159256 non-null  float64
 6   eyesight(right)      159256 non-null  float64
 7   hearing(left)        159256 non-null  int64  
 8   hearing(right)       159256 non-null  int64  
 9   systolic             159256 non-null  int64  
 10  relaxation           159256 non-null  int64  
 11  fasting blood sugar  159256 non-null  int64  
 12  Cholesterol          159256 non-null  int64  
 13  triglyceride         159256 non-null  int64  
 14  HDL                  159256 non-null  int64  
 15  LDL              

In [11]:
train = train[train['eyesight(left)'] != 9.9]

train = train[train['eyesight(left)'] != 9.9]

Q1_triglyceride = train['triglyceride'].quantile(0.25)
Q3_triglyceride = train['triglyceride'].quantile(0.75)
IQR_triglyceride = Q3_triglyceride - Q1_triglyceride

Q1_creatinine = train['serum creatinine'].quantile(0.25)
Q3_creatinine = train['serum creatinine'].quantile(0.75)
IQR_creatinine = Q3_creatinine - Q1_creatinine

train = train[(train['triglyceride'] >= Q1_triglyceride - 1.5 * IQR_triglyceride) & (train['triglyceride'] <= Q3_triglyceride + 1.5 * IQR_triglyceride)]
train = train[(train['serum creatinine'] >= Q1_creatinine - 1.5 * IQR_creatinine) & (train['serum creatinine'] <= Q3_creatinine + 1.5 * IQR_creatinine)]

train['Gtp'].clip(lower = 0, upper = 300)
train['HDL'].clip(lower = 0, upper = 150)
train['LDL'].clip(lower = 0, upper = 200)
train['ALT'].clip(lower = 0, upper = 150)
train['AST'].clip(lower = 0, upper = 100)

1         27
2         27
3         20
4         19
6         17
          ..
159251    25
159252    21
159253    15
159254    22
159255    21
Name: AST, Length: 150186, dtype: int64

In [12]:
train2 = pd.DataFrame()
test2 = pd.DataFrame()

In [13]:
# train 데이터셋
train2['age'] = train['age']
train2['BMI'] = train['weight(kg)'] / ((train['height(cm)'] / 100) ** 2)
train2['HW_ratio'] = train['height(cm)'] / train['waist(cm)']
train2['hearing(left)'] = train['hearing(left)']
train2['hearing(right)'] = train['hearing(right)']
train2['eyesight'] = train[['eyesight(left)', 'eyesight(right)']].max(axis=1)
train2['systolic'] = pd.cut(train['systolic'], bins=[-np.inf, 90, 140, np.inf], labels=[0, 1, 2])
train2['relaxation'] = pd.cut(train['relaxation'], bins=[-np.inf, 60, 90, np.inf], labels=[0, 1, 2])
train2['fasting blood sugar'] = pd.cut(train['fasting blood sugar'], bins=[-np.inf, 100, 126, np.inf], labels=[0, 1, 2])
train2['Cholesterol'] = pd.cut(train['Cholesterol'], bins=[-np.inf, 150, 250, np.inf], labels=[0, 1, 2])
train2['triglyceride'] = pd.cut(train['triglyceride'], bins=[-np.inf, 150, 200, np.inf], labels=[0, 1, 2])
train2['HDL'] = pd.cut(train['HDL'], bins=[-np.inf, 40, np.inf], labels=[1, 0])
train2['LDL'] = pd.cut(train['LDL'], bins=[-np.inf, 130, 160, np.inf], labels=[0, 1, 2])
train2['hemoglobin'] = pd.cut(train['hemoglobin'], bins=[-np.inf,12.1, 15.1, np.inf], labels=[1,0, 1],ordered=False)
train2['serum creatinine'] = pd.cut(train['serum creatinine'], bins=[-np.inf,0.6, 1.2, np.inf], labels=[1,0, 1],ordered=False)
train2['AST'] = pd.cut(train['AST'], bins=[-np.inf, 40, np.inf], labels=[0, 1])
train2['ALT'] = pd.cut(train['ALT'], bins=[-np.inf, 40, np.inf], labels=[0, 1])
train2['Gtp'] = pd.cut(train['Gtp'], bins=[-np.inf,10, 40, np.inf], labels=[1,0, 1],ordered=False)
train2['Urine protein'] = train['Urine protein']
train2['dental caries'] = train['dental caries']
train2['smoking'] = train['smoking']


In [14]:
train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150186 entries, 1 to 159255
Data columns (total 21 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   age                  150186 non-null  int64   
 1   BMI                  150186 non-null  float64 
 2   HW_ratio             150186 non-null  float64 
 3   hearing(left)        150186 non-null  int64   
 4   hearing(right)       150186 non-null  int64   
 5   eyesight             150186 non-null  float64 
 6   systolic             150186 non-null  category
 7   relaxation           150186 non-null  category
 8   fasting blood sugar  150186 non-null  category
 9   Cholesterol          150186 non-null  category
 10  triglyceride         150186 non-null  category
 11  HDL                  150186 non-null  category
 12  LDL                  150186 non-null  category
 13  hemoglobin           150186 non-null  category
 14  serum creatinine     150186 non-null  category
 15  

In [15]:
# test 데이터셋
test2['age'] = test['age']
test2['HW_ratio'] = test['height(cm)'] / test['waist(cm)']
test2['BMI'] = test['weight(kg)'] / ((test['height(cm)'] / 100) ** 2)
test2['hearing(left)'] = test['hearing(left)']
test2['hearing(right)'] = test['hearing(right)']
test2['eyesight'] = test[['eyesight(left)', 'eyesight(right)']].max(axis=1)
test2['systolic'] = pd.cut(test['systolic'], bins=[-np.inf, 90, 140, np.inf], labels=[0, 1, 2])
test2['relaxation'] = pd.cut(test['relaxation'], bins=[-np.inf, 60, 90, np.inf], labels=[0, 1, 2])
test2['fasting blood sugar'] = pd.cut(test['fasting blood sugar'], bins=[-np.inf, 100, 126, np.inf], labels=[0, 1, 2])
test2['Cholesterol'] = pd.cut(test['Cholesterol'], bins=[-np.inf, 150, 250, np.inf], labels=[0, 1, 2])
test2['triglyceride'] = pd.cut(test['triglyceride'], bins=[-np.inf, 150, 200, np.inf], labels=[0, 1, 2])
test2['HDL'] = pd.cut(test['HDL'], bins=[-np.inf, 40, np.inf], labels=[1, 0])
test2['LDL'] = pd.cut(test['LDL'], bins=[-np.inf, 130, 160, np.inf], labels=[0, 1, 2])
test2['hemoglobin'] = pd.cut(test['hemoglobin'], bins=[-np.inf,12.1, 15.1, np.inf], labels=[1,0, 1],ordered=False)
test2['serum creatinine'] = pd.cut(test['serum creatinine'], bins=[-np.inf,0.6, 1.2, np.inf], labels=[1,0, 1],ordered=False)
test2['AST'] = pd.cut(test['AST'], bins=[-np.inf, 40, np.inf], labels=[0, 1])
test2['ALT'] = pd.cut(test['ALT'], bins=[-np.inf, 40, np.inf], labels=[0, 1])
test2['Gtp'] = pd.cut(test['Gtp'], bins=[-np.inf,10, 40, np.inf], labels=[1,0, 1],ordered=False)
test2['Urine protein'] = test['Urine protein']
test2['dental caries'] = test['dental caries']


In [16]:
test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106171 entries, 0 to 106170
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   age                  106171 non-null  int64   
 1   HW_ratio             106171 non-null  float64 
 2   BMI                  106171 non-null  float64 
 3   hearing(left)        106171 non-null  int64   
 4   hearing(right)       106171 non-null  int64   
 5   eyesight             106171 non-null  float64 
 6   systolic             106171 non-null  category
 7   relaxation           106171 non-null  category
 8   fasting blood sugar  106171 non-null  category
 9   Cholesterol          106171 non-null  category
 10  triglyceride         106171 non-null  category
 11  HDL                  106171 non-null  category
 12  LDL                  106171 non-null  category
 13  hemoglobin           106171 non-null  category
 14  serum creatinine     106171 non-null  category
 15  

In [17]:
# 데이터프레임 이름이 train이라고 가정하겠습니다.
category_columns = ['systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
                    'HDL', 'LDL', 'hemoglobin', 'serum creatinine', 'AST', 'ALT', 'Gtp']

# 원핫인코딩 실행
train_encoded = pd.get_dummies(train2, columns=category_columns)
test_encoded = pd.get_dummies(test2, columns=category_columns)

# 원핫인코딩 된 데이터프레임을 train3에 저장
train3 = train_encoded
test3 = test_encoded

In [18]:
from sklearn.preprocessing import StandardScaler

# StandardScaler 객체 생성
scaler = StandardScaler()

# 수치형 컬럼만 선택
num_cols = train3.select_dtypes(include=['int64', 'float64']).columns
num_cols = num_cols.drop('smoking')

# 수치형 컬럼에 대해 스케일링 수행
train3[num_cols] = scaler.fit_transform(train3[num_cols])

num_cols = test3.select_dtypes(include=['int64', 'float64']).columns
test3[num_cols] = scaler.fit_transform(test3[num_cols])

In [19]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# X, y 데이터 설정
X = train3.drop('smoking', axis=1) # 'target' 열을 제외한 나머지를 독립 변수 X로 설정
y = train3['smoking'] # 'target' 열을 종속 변수 y로 설정

def objective(trial):
    # 하이퍼파라미터의 범위를 지정
    param = {
        'objective': 'binary', # 이진 분류 문제
        'verbosity': -1,
        'boosting_type': 'gbdt', # Gradient Boosting Decision Tree
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    # 데이터를 학습용과 검증용으로 분리
    train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.25)

    train_set = lgb.Dataset(train_x, train_y)
    valid_set = lgb.Dataset(valid_x, valid_y)

    gbm = lgb.train(param, train_set, valid_sets=[valid_set], num_boost_round=1000)

    preds = gbm.predict(valid_x)
    auc = roc_auc_score(valid_y, preds)
    return auc

study = optuna.create_study(direction='maximize') # 'maximize'로 설정하여 AUC를 최대화
study.optimize(objective, n_trials=10) # n_trials는 시도할 파라미터 셋의 수를 의미

print('Best trial:')
trial = study.best_trial
print('  Value: ', trial.value)
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


[I 2023-11-24 23:25:53,663] A new study created in memory with name: no-name-c3caa274-5d43-40f2-99b9-d08573540245
[I 2023-11-24 23:26:03,299] Trial 0 finished with value: 0.8158446356376523 and parameters: {'lambda_l1': 1.9851004355692733e-06, 'lambda_l2': 1.1081664280662274e-05, 'num_leaves': 64, 'feature_fraction': 0.8957805580775147, 'bagging_fraction': 0.720098528927517, 'bagging_freq': 7, 'min_child_samples': 7}. Best is trial 0 with value: 0.8158446356376523.
[I 2023-11-24 23:26:29,451] Trial 1 finished with value: 0.8091614429844034 and parameters: {'lambda_l1': 0.0013061659859514357, 'lambda_l2': 4.859294398702604e-06, 'num_leaves': 191, 'feature_fraction': 0.5168339262613921, 'bagging_fraction': 0.920542477830865, 'bagging_freq': 6, 'min_child_samples': 77}. Best is trial 0 with value: 0.8158446356376523.
[I 2023-11-24 23:26:48,563] Trial 2 finished with value: 0.8064349109167973 and parameters: {'lambda_l1': 3.778250558096528e-07, 'lambda_l2': 0.8670823679491086, 'num_leaves'

Best trial:
  Value:  0.8291312149120142
  Params: 
    lambda_l1: 0.008016410579590854
    lambda_l2: 5.296959059807859e-05
    num_leaves: 24
    feature_fraction: 0.8246787355677498
    bagging_fraction: 0.9089146427168509
    bagging_freq: 3
    min_child_samples: 69


In [20]:
import lightgbm as lgb

# 최적화된 하이퍼파라미터 설정
best_params = {
    'objective': 'binary',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'lambda_l1': 4.65813172282846e-08,
    'lambda_l2': 2.208853417915532,
    'num_leaves': 28,
    'feature_fraction': 0.7231047028015054,
    'bagging_fraction': 0.5618507220644526,
    'bagging_freq': 1,
    'min_child_samples': 82
}

# LightGBM 데이터셋으로 변환
train_set = lgb.Dataset(train3.drop('smoking', axis=1), train['smoking'])

# 모델 학습
gbm = lgb.train(best_params, train_set, num_boost_round=1000)

# test 데이터에 대한 예측 수행
preds = gbm.predict(test3)


In [21]:
# 예측 결과를 'sub' 데이터프레임의 'smoking' 컬럼에 저장합니다.
sub['smoking'] = preds

# 'sub' 데이터프레임을 csv 파일로 저장합니다.
sub.to_csv('submission(lightgbm+outlier+category+optuna).csv',index= False)


In [22]:
# 피처 중요도 확인
feature_imp = gbm.feature_importance()

# 피처명과 함께 확인
feature_imp_df = pd.DataFrame({'Feature': gbm.feature_name(), 'Importance': feature_imp})

# 중요도 순으로 정렬하여 출력
feature_imp_df = feature_imp_df.sort_values('Importance', ascending=False)
print(feature_imp_df)


                  Feature  Importance
2                HW_ratio        6489
1                     BMI        6065
0                     age        3125
5                eyesight        2404
28           hemoglobin_0         681
36                  Gtp_0         604
7           dental_caries         520
14  fasting_blood_sugar_0         505
20         triglyceride_0         432
15  fasting_blood_sugar_1         422
34                  ALT_0         418
25                  LDL_0         396
23                  HDL_1         371
6           Urine_protein         363
21         triglyceride_1         355
26                  LDL_1         302
22         triglyceride_2         283
32                  AST_0         239
12           relaxation_1         233
30     serum_creatinine_0         229
18          Cholesterol_1         217
29           hemoglobin_1         198
37                  Gtp_1         193
9              systolic_1         192
17          Cholesterol_0         190
16  fasting_

여기서부터 이상치 처리(우리방식) + 스케일링(standard) + optuna

In [23]:
train = pd.read_csv("/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/train.csv")
test = pd.read_csv("/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/test.csv")
sub = pd.read_csv("/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/sample_submission.csv")

In [24]:
train = train[train['eyesight(left)'] != 9.9]

train = train[train['eyesight(left)'] != 9.9]

Q1_triglyceride = train['triglyceride'].quantile(0.25)
Q3_triglyceride = train['triglyceride'].quantile(0.75)
IQR_triglyceride = Q3_triglyceride - Q1_triglyceride

Q1_creatinine = train['serum creatinine'].quantile(0.25)
Q3_creatinine = train['serum creatinine'].quantile(0.75)
IQR_creatinine = Q3_creatinine - Q1_creatinine

train = train[(train['triglyceride'] >= Q1_triglyceride - 1.5 * IQR_triglyceride) & (train['triglyceride'] <= Q3_triglyceride + 1.5 * IQR_triglyceride)]
train = train[(train['serum creatinine'] >= Q1_creatinine - 1.5 * IQR_creatinine) & (train['serum creatinine'] <= Q3_creatinine + 1.5 * IQR_creatinine)]

train['Gtp'].clip(lower = 0, upper = 300)
train['HDL'].clip(lower = 0, upper = 150)
train['LDL'].clip(lower = 0, upper = 200)
train['ALT'].clip(lower = 0, upper = 150)
train['AST'].clip(lower = 0, upper = 100)

1         27
2         27
3         20
4         19
6         17
          ..
159251    25
159252    21
159253    15
159254    22
159255    21
Name: AST, Length: 150186, dtype: int64

In [25]:
# 'dental caries', 'Urine protein', 'hearing(left)', 'hearing(right)' 컬럼의 데이터 타입을 'object'로 변경합니다.
cols_to_change = ['dental caries', 'Urine protein', 'hearing(left)', 'hearing(right)']
for col in cols_to_change:
    train[col] = train[col].astype('object')
    test[col] = test[col].astype('object')

train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

train = pd.get_dummies(train, columns=cols_to_change)
test = pd.get_dummies(test, columns=cols_to_change)

In [26]:
from sklearn.preprocessing import StandardScaler

# StandardScaler 객체 생성
scaler = StandardScaler()

# 수치형 컬럼만 선택
num_cols = train.select_dtypes(include=['int64', 'float64']).columns
num_cols = num_cols.drop('smoking')

# 수치형 컬럼에 대해 스케일링 수행
train[num_cols] = scaler.fit_transform(train[num_cols])

num_cols = test.select_dtypes(include=['int64', 'float64']).columns
test[num_cols] = scaler.fit_transform(test[num_cols])

In [27]:
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# X, y 데이터 설정
X = train.drop('smoking', axis=1) # 'target' 열을 제외한 나머지를 독립 변수 X로 설정
y = train['smoking'] # 'target' 열을 종속 변수 y로 설정

def objective(trial):
    # 하이퍼파라미터의 범위를 지정
    param = {
        'objective': 'binary', # 이진 분류 문제
        'verbosity': -1,
        'boosting_type': 'gbdt', # Gradient Boosting Decision Tree
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    # 데이터를 학습용과 검증용으로 분리
    train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.25)

    train_set = lgb.Dataset(train_x, train_y)
    valid_set = lgb.Dataset(valid_x, valid_y)

    gbm = lgb.train(param, train_set, valid_sets=[valid_set], num_boost_round=1000)

    preds = gbm.predict(valid_x)
    auc = roc_auc_score(valid_y, preds)
    return auc

study = optuna.create_study(direction='maximize') # 'maximize'로 설정하여 AUC를 최대화
study.optimize(objective, n_trials=10) # n_trials는 시도할 파라미터 셋의 수를 의미

print('Best trial:')
trial = study.best_trial
print('  Value: ', trial.value)
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


[I 2023-11-24 23:29:35,425] A new study created in memory with name: no-name-3081444a-528b-4156-a254-72142136dc59
[I 2023-11-24 23:29:50,055] Trial 0 finished with value: 0.8601088251892268 and parameters: {'lambda_l1': 2.2184803728554846e-06, 'lambda_l2': 6.167385653798415e-06, 'num_leaves': 92, 'feature_fraction': 0.595894570570156, 'bagging_fraction': 0.4585090142634179, 'bagging_freq': 5, 'min_child_samples': 36}. Best is trial 0 with value: 0.8601088251892268.
[I 2023-11-24 23:30:19,316] Trial 1 finished with value: 0.8653722552181431 and parameters: {'lambda_l1': 4.476329234049679e-05, 'lambda_l2': 0.002126352513812906, 'num_leaves': 241, 'feature_fraction': 0.5747848711312149, 'bagging_fraction': 0.9513726311026102, 'bagging_freq': 1, 'min_child_samples': 37}. Best is trial 1 with value: 0.8653722552181431.
[I 2023-11-24 23:30:49,178] Trial 2 finished with value: 0.8522308337934814 and parameters: {'lambda_l1': 6.428391551123907e-05, 'lambda_l2': 0.005684698565660634, 'num_leave

Best trial:
  Value:  0.8700687783452309
  Params: 
    lambda_l1: 6.845144659484626e-08
    lambda_l2: 0.17029231699774178
    num_leaves: 39
    feature_fraction: 0.5624090879105655
    bagging_fraction: 0.9425786815363083
    bagging_freq: 3
    min_child_samples: 59


In [28]:
import lightgbm as lgb

# 최적화된 하이퍼파라미터 설정
best_params = {
    'objective': 'binary',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'lambda_l1': 5.875823452416582,
    'lambda_l2': 2.3718803731807686e-05,
    'num_leaves': 14,
    'feature_fraction': 0.4963672454966828,
    'bagging_fraction': 0.9529099493216389,
    'bagging_freq': 3,
    'min_child_samples': 81
}

# LightGBM 데이터셋으로 변환
train_set = lgb.Dataset(train.drop('smoking', axis=1), train['smoking'])

# 모델 학습
gbm = lgb.train(best_params, train_set, num_boost_round=1000)

# test 데이터에 대한 예측 수행
preds = gbm.predict(test)


In [29]:
# 예측 결과를 'sub' 데이터프레임의 'smoking' 컬럼에 저장합니다.
sub['smoking'] = preds

# 'sub' 데이터프레임을 csv 파일로 저장합니다.
sub.to_csv('submission(lightgbm+outlier+category+optuna).csv',index= False)


In [30]:
# 피처 중요도 확인
feature_imp = gbm.feature_importance()

# 피처명과 함께 확인
feature_imp_df = pd.DataFrame({'Feature': gbm.feature_name(), 'Importance': feature_imp})

# 중요도 순으로 정렬하여 출력
feature_imp_df = feature_imp_df.sort_values('Importance', ascending=False)
print(feature_imp_df)

                Feature  Importance
10         triglyceride        1332
17                  Gtp        1051
9           Cholesterol         904
13           hemoglobin         882
12                  LDL         881
8   fasting_blood_sugar         830
3             waist(cm)         819
16                  ALT         806
6              systolic         729
15                  AST         680
11                  HDL         658
0                   age         653
7            relaxation         652
1            height(cm)         458
2            weight(kg)         402
14     serum_creatinine         399
4        eyesight(left)         373
5       eyesight(right)         334
18      dental_caries_0          56
19      dental_caries_1          36
29     hearing(right)_2          13
20      Urine_protein_1          11
26      hearing(left)_1          11
28     hearing(right)_1          11
21      Urine_protein_2          11
27      hearing(left)_2           5
22      Urine_protein_3     