## **패키지 설치**

In [3]:
!pip install xgboost==2.0.3
!pip install numpy==1.26.3
!pip install catboost==1.2
!pip install lightgbm==4.1.0
!pip install pandas==2.1.4
!pip install category_encoders==2.6.3



## **import**

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import os
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import optuna
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
# XGBoost와 CatBoost 라이브러리를 임포트합니다.
import xgboost as xgb
import catboost as cb
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
os.environ['KMP_DUPLICATE_LIB_OK']='True'

## **데이터 호출**

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("submission.csv")
test.drop('id' , axis = 1 , inplace = True)

## **데이터 전처리**

In [4]:
# 사용하지 않을 column 제외
train.drop('customer_country.1',axis = 1 , inplace = True)
train.drop('product_modelname',axis = 1 , inplace = True)

test.drop('customer_country.1',axis = 1 , inplace = True)
test.drop('product_modelname',axis = 1 , inplace = True)

# object 타입의 열들은 모든 소문자로 변환
for column in train.columns:
    if train[column].dtype == 'object': 
        train[column] = train[column].str.lower()
for column in test.columns:
    if test[column].dtype == 'object':  
        test[column] = test[column].str.lower()

In [5]:
# 결측치 처리
# 열마다 결측치를 none, 0, other로 치환.
none_col = ['customer_country','customer_type','customer_job','product_category','product_subcategory','business_area','business_subarea','expected_timeline']
zero_col = ['com_reg_ver_win_rate','historical_existing_cnt','id_strategic_ver','it_strategic_ver','idit_strategic_ver','ver_win_rate_x','ver_win_ratio_per_bu']
other_col = ['inquiry_type']

for col in none_col:
    train[col] = train[col].fillna('none')
    test[col] = test[col].fillna('none')
    
for col in zero_col:
    train[col] = train[col].fillna(0)
    test[col] = test[col].fillna(0)

for col in other_col:
    train[col] = train[col].fillna('other')
    test[col] = test[col].fillna('other')

In [6]:
# customer_country 열의 데이터 형식을 결측치에도 적용
train.loc[train['customer_country'] == 'none', 'customer_country'] = '//none'
test.loc[test['customer_country'] == 'none', 'customer_country'] = '//none'

In [7]:
obj_col = ['customer_idx', 'customer_type','customer_job', 'inquiry_type','product_category','product_subcategory','product_modelname', 'customer_position','response_corporate','expected_timeline','business_subarea']

In [8]:
# 데이터 타입이 object인 컬럼에 대해서 빈도수를 계산, 빈도수가 10이하인 데이터를 묶음 (특정 값도 같이 묶음)

# customer_idx
frequency = train['customer_idx'].value_counts()
idx_to_replace = frequency[frequency <= 10].index
train['customer_idx'] = train['customer_idx'].replace(idx_to_replace, -1)

# customer_type
train.loc[train['customer_type'] == 'end customer', 'customer_type'] = 'end-customer'
train.loc[train['customer_type'] == 'specifier / influencer', 'customer_type'] = 'specifier/ influencer'
frequency = train['customer_type'].value_counts()
idx_to_replace = frequency[frequency <= 10].index
train['customer_type'] = train['customer_type'].replace(idx_to_replace, 'other')

# customer_job
frequency = train['customer_job'].value_counts()
idx_to_replace = frequency[frequency <= 10].index
train['customer_job'] = train['customer_job'].replace(idx_to_replace, 'other')

# inquiry_type
train.loc[train['inquiry_type'] == 'etc.', 'inquiry_type'] = 'other'
train.loc[train['inquiry_type'] == 'other_', 'inquiry_type'] = 'other'
train.loc[train['inquiry_type'] == 'others', 'inquiry_type'] = 'other'
frequency = train['inquiry_type'].value_counts()
idx_to_replace = frequency[frequency <= 10].index
train['inquiry_type'] = train['inquiry_type'].replace(idx_to_replace, 'other')

# product_category
train.loc[train['product_category'] == 'etc.', 'product_category'] = 'other'
frequency = train['product_category'].value_counts()
idx_to_replace = frequency[frequency <= 10].index
train['product_category'] = train['product_category'].replace(idx_to_replace, 'other')

# product_subcategory
frequency = train['product_subcategory'].value_counts()
idx_to_replace = frequency[frequency <= 10].index
train['product_subcategory'] = train['product_subcategory'].replace(idx_to_replace, 'other')

# customer_position
train.loc[train['customer_position'] == 'others', 'customer_position'] = 'other'
frequency = train['customer_position'].value_counts()
idx_to_replace = frequency[frequency <= 10].index
train['customer_position'] = train['customer_position'].replace(idx_to_replace, 'other')

# response_corporate
frequency = train['response_corporate'].value_counts()
idx_to_replace = frequency[frequency <= 10].index
train['response_corporate'] = train['response_corporate'].replace(idx_to_replace, 'other')

# expected_timeline
train.loc[train['expected_timeline'] == 'less_than_3_month.', 'expected_timeline'] = 'less than 3 month'
train.loc[train['expected_timeline'] == '3_month_~_6_month', 'expected_timeline'] = '3 month ~ 6 month'
train.loc[train['expected_timeline'] == '6_month_~_9_month', 'expected_timeline'] = '6 month ~ 9 month'
frequency = train['expected_timeline'].value_counts()
idx_to_replace = frequency[frequency <= 10].index
train['expected_timeline'] = train['expected_timeline'].replace(idx_to_replace, 'other')

# business_subarea
train.loc[train['business_subarea'] == 'others', 'business_subarea'] = 'other'
frequency = train['business_subarea'].value_counts()
idx_to_replace = frequency[frequency <= 10].index
train['business_subarea'] = train['expected_timeline'].replace(idx_to_replace, 'other')

In [9]:
train_unique_values = {
    'customer_idx': set(train[train['customer_idx'] != 'other' ]['customer_idx']),
    'customer_type': set(train[train['customer_type'] != 'other']['customer_type']),
    'customer_job': set(train[train['customer_job'] != 'other']['customer_job']),
    'inquiry_type': set(train[train['inquiry_type'] != 'other']['inquiry_type']),
    'product_category': set(train[train['product_category'] != 'other']['product_category']),
    'product_subcategory': set(train[train['product_subcategory'] != 'other']['product_subcategory']),
    'customer_position': set(train[train['customer_position'] != 'other']['customer_position']),
    'response_corporate': set(train[train['response_corporate'] != 'other']['response_corporate']),
    'expected_timeline': set(train[train['expected_timeline'] != 'other']['expected_timeline']),
    'business_subarea': set(train[train['business_subarea'] != 'other']['business_subarea'])
}

# train에서 other로 처리한 값을 test에도 적용
test_replacements = {
    'customer_type': {
        'end customer': 'end-customer',
        'specifier / influencer': 'specifier/ influencer'
    },
    'inquiry_type': {
        'etc.': 'other',
        'other_': 'other',
        'others': 'other'
    },
    'product_category': {
        'etc.': 'other'
    },
    'customer_position': {
        'others': 'other'
    },
    'expected_timeline': {
        'less_than_3_month.': 'less than 3 month',
        '3_month_~_6_month': '3 month ~ 6 month',
        '6_month_~_9_month': '6 month ~ 9 month'
    },
    'business_subarea': {
        'others': 'other'
    }
}

# test 데이터셋의 컬럼들에 대해 'other'로 치환할 필요가 있는 값을 치환합니다.
for col, replacements in test_replacements.items():
    test[col].replace(replacements, inplace=True)

# 이제, train 데이터셋에서 'other'로 레이블된 값을 기반으로
# test 데이터셋에서 해당하지 않는 값들을 'unknown'로 치환합니다.
for col, unique_values in train_unique_values.items():
    test[col] = test[col].apply(lambda x: x if x in unique_values else 'unknown')

In [10]:
# customer_country을 /로 구분 후 공백제거한 country열 생성
train['country'] = train['customer_country'].str.split('/').str[-1]
test['country'] = test['customer_country'].str.split('/').str[-1]
train['country'] = train['country'].str.strip()
test['country'] = test['country'].str.strip()

In [11]:
# 나라명 표준화 적용
country_mapping = {
    'u.a.e': 'united arab emirates',
    'uae dubai': 'united arab emirates',
    ' india': 'india',
    'türkiye': 'turkey',
    'us': 'united states',
    'usa': 'united states',
    ' united states' : 'united states',
    'u.k': 'united kingdom',
    'uk': 'united kingdom',
    'brasil': 'brazil',
    'ha noi': 'vietnam',
    'south korea': 'korea',
    'republic of korea': 'korea',
    's. korea': 'korea',
    'czech': 'czech republic',
    'czechia': 'czech republic',
    'macau': 'china',
    'england': 'united kingdom',
    'scotland': 'united kingdom',
    'prc': 'china',
    'mainland china': 'china',
    'france': 'france',
    'french': 'france',
    'paris': 'france',  # 특정 도시를 나라명으로 매핑
    'germany': 'germany',
    'deutschland': 'germany',  # 독일의 독일어 명칭
    'berlin': 'germany',  # 특정 도시를 나라명으로 매핑
    'abu dhabi': 'united arab emirates',  # 특정 도시를 나라명으로 매핑
    'dubai': 'united arab emirates',  # 특정 도시를 나라명으로 매핑
    '' : 'none'
}

train['country'] = train['country'].replace(country_mapping)
test['country'] = test['country'].replace(country_mapping)

In [12]:
# country열은 빈도수 3이하인 데이터를 other로 묶음
frequency = train['country'].value_counts()
idx_to_replace = frequency[frequency <= 3].index
train['country'] = train['country'].replace(idx_to_replace, 'other')

frequency = test['country'].value_counts()
idx_to_replace = frequency[frequency <= 3].index
test['country'] = test['country'].replace(idx_to_replace, 'other')

### **파생변수 생성**

In [13]:
# 'lead_owner' 카테고리별로 'is_converted'의 평균과 갯수를 계산합니다.
conversion_rate_and_count_by_lead_owner = train.groupby('lead_owner')['is_converted'].agg(['mean', 'count']).sort_values(by='mean', ascending=False)

conversion_dict = conversion_rate_and_count_by_lead_owner['mean'].to_dict()

# 이제 'train' 데이터 프레임에 새로운 열 'mean_value'를 추가하고, 'conversion_dict'를 사용하여 값을 매핑합니다.
train['lead_owner_mean_value'] = train['lead_owner'].map(conversion_dict)
test['lead_owner_mean_value'] = test['lead_owner'].map(conversion_dict)

In [14]:
# 조건에 따라 새로운 컬럼을 만들기 위한 기준을 설정합니다.
conditions = [
    ('new_column_90', (conversion_rate_and_count_by_lead_owner['mean'] >= 0.99) & (conversion_rate_and_count_by_lead_owner['count'] >= 5)),
    ('new_column_80', (conversion_rate_and_count_by_lead_owner['mean'] < 0.99) & (conversion_rate_and_count_by_lead_owner['mean'] >= 0.8) & (conversion_rate_and_count_by_lead_owner['count'] >= 5)),
    ('new_column_50', (conversion_rate_and_count_by_lead_owner['mean'] < 0.8) & (conversion_rate_and_count_by_lead_owner['mean'] >= 0.5) & (conversion_rate_and_count_by_lead_owner['count'] >= 5)),
    ('new_column_20', (conversion_rate_and_count_by_lead_owner['mean'] < 0.5) & (conversion_rate_and_count_by_lead_owner['mean'] >= 0.2) & (conversion_rate_and_count_by_lead_owner['count'] >= 5)),
    ('new_column_01', (conversion_rate_and_count_by_lead_owner['mean'] <= 0.01) & (conversion_rate_and_count_by_lead_owner['count'] >= 30))
]

# 각 조건에 따라 새로운 컬럼을 만들고 해당하는 'lead_owner'에 대해 값을 1로 설정합니다.
for col_name, condition in conditions:
    # 조건을 만족하는 'lead_owner'를 찾습니다.
    qualified_lead_owners = conversion_rate_and_count_by_lead_owner[condition].index

    # 새로운 컬럼을 생성하고 기본값을 0으로 설정합니다.
    train[col_name] = 0
    test[col_name] = 0 
    
    # 조건을 만족하는 'lead_owner'의 행에 대해서만 새로운 컬럼의 값을 1로 설정합니다.
    train.loc[train['lead_owner'].isin(qualified_lead_owners), col_name] = 1
    test.loc[train['lead_owner'].isin(qualified_lead_owners), col_name] = 1


In [15]:
# 'response_corporate' 컬럼을 생성하고 기본값을 0으로 설정합니다.
train['response_corporate_high'] = 0
test['response_corporate_high'] = 0

train['response_corporate_low'] = 0
test['response_corporate_low'] = 0

# 'lead_owner' 값이 'lgett', 'lgeaf', 'lgehk' 중 하나인 행에 대해서 'response_corporate' 값을 1로 설정합니다.
train.loc[train['response_corporate'].isin(['lgett', 'lgeaf', 'lgehk']), 'response_corporate_high'] = 1
test.loc[test['response_corporate'].isin(['lgett', 'lgeaf', 'lgehk']), 'response_corporate_high'] = 1

# 'lead_owner' 값이 'lgett', 'lgeaf', 'lgehk' 중 하나인 행에 대해서 'response_corporate' 값을 1로 설정합니다.
train.loc[train['response_corporate'].isin(['lgeis', 'lgejp', 'lgekr','lgeuk']), 'response_corporate_low'] = 1
test.loc[test['response_corporate'].isin(['lgeis', 'lgejp', 'lgekr','lgeuk']), 'response_corporate_low'] = 1

In [16]:
# 'response_corporate' 컬럼을 생성하고 기본값을 0으로 설정합니다.
train['country_2'] = 0
test['country_2'] = 0

train.loc[train['country'].isin(['brazil','colombia','india','philippines','united kingdom']), 'country_2'] = 1
test.loc[test['country'].isin(['brazil','colombia','india','philippines','united kingdom']), 'country_2'] = 1

In [17]:
# 'response_corporate' 컬럼을 생성하고 기본값을 0으로 설정합니다.
train['customer_idx_row'] = 0
test['customer_idx_row'] = 0

train.loc[train['customer_idx'].isin([
    47466, 37680, 21321, 4936, 32240, 25309, 19804, 40491,
    42067, 37657, 31864, 33773, 37399, 40344, 7195, 18030,
    33334, 7810, 33350, 16590, 742, 9624
]), 'customer_idx_row'] = 1
test.loc[test['customer_idx'].isin([
    47466, 37680, 21321, 4936, 32240, 25309, 19804, 40491,
    42067, 37657, 31864, 33773, 37399, 40344, 7195, 18030,
    33334, 7810, 33350, 16590, 742, 9624
]), 'customer_idx_row'] = 1

In [18]:
train['ver_win_rate_x_category'] = train['ver_win_rate_x'].astype('object')
test['ver_win_rate_x_category'] = test['ver_win_rate_x'].astype('object')



train['historical_existing_cnt_category'] = train['historical_existing_cnt']
test['historical_existing_cnt_category'] = test['historical_existing_cnt']

frequency = train['historical_existing_cnt_category'].value_counts()
idx_to_replace = frequency[frequency <= 5].index
train['historical_existing_cnt_category'] = train['historical_existing_cnt_category'].replace(idx_to_replace, 'other')
train['historical_existing_cnt_category'] = train['historical_existing_cnt_category'].astype('object')


train_unique_values = {
    'historical_existing_cnt_category': set(train[train['historical_existing_cnt_category'] != 'other']['historical_existing_cnt_category'])
}


for col, unique_values in train_unique_values.items():
    test[col] = test[col].apply(lambda x: x if x in unique_values else 'other')


In [19]:
test['lead_owner_mean_value'] = test['lead_owner_mean_value'].fillna(0.0817889) # 평균값 대체

In [20]:
train.to_csv('./train_final.csv', index=False)
test.to_csv('./test_final.csv', index=False)

In [21]:
train = pd.read_csv("./train_final.csv")
train.drop(['customer_country'], axis= 1 , inplace = True)
train['lead_owner'] = train['lead_owner'].astype('object')

test = pd.read_csv("./test_final.csv")
test.drop(['customer_country'], axis= 1 , inplace = True)
test['lead_owner'] = test['lead_owner'].astype('object')

### **타겟 인코딩**

In [22]:
# 타겟인코딩할 컬럼들을 category로 타입변환
columns_to_encode = ['country','lead_owner', 'business_unit', 'customer_idx', 'customer_type', 'enterprise',
                     'customer_job', 'inquiry_type', 'product_category',
                     'product_subcategory', 'customer_position', 'response_corporate',
                     'expected_timeline', 'business_area', 'business_subarea','ver_win_rate_x_category','historical_existing_cnt_category']

for column in columns_to_encode:
    train[column] = train[column].astype('category')
    test[column] = test[column].astype('category')

In [23]:
smoothing_value = 1 

# 타겟 인코더를 각 컬럼에 적용하기 위한 함수
def target_encode(train_df, test_df, columns, target, smoothing):
    encoders = {column: TargetEncoder(smoothing=smoothing) for column in columns}
    train_encoded = train_df.copy()
    test_encoded = test_df.copy()

    for column in columns:
        # 학습 데이터셋에 대해 fit_transform을 사용하여 타겟 인코딩 적용
        train_encoded[column] = encoders[column].fit_transform(train_df[column], target)
        # 테스트 데이터셋에 대해 transform을 사용하여 타겟 인코딩 적용
        test_encoded[column] = encoders[column].transform(test_df[column])

    return train_encoded, test_encoded, encoders

# 인코딩할 컬럼 목록
columns_to_encode = ['business_unit', 'customer_idx', 'customer_type', 'enterprise',
                     'customer_job', 'inquiry_type', 'product_category',
                     'product_subcategory', 'customer_position', 'response_corporate',
                     'expected_timeline', 'business_area', 'business_subarea','ver_win_rate_x_category']
# 타겟 변수
target_column = 'is_converted'

# train, test 데이터셋에 타겟 인코딩 적용
train_encoded, test_encoded, encoders = target_encode(train, test, columns_to_encode, train[target_column], smoothing=smoothing_value)

# 결과 확인
print("Train encoded shape:", train_encoded.shape)
print("Test encoded shape:", test_encoded.shape)

train = train_encoded
test = test_encoded

Train encoded shape: (59299, 39)
Test encoded shape: (5271, 39)


In [24]:
smoothing_value = 3 

# 타겟 인코더를 각 컬럼에 적용하기 위한 함수
def target_encode(train_df, test_df, columns, target, smoothing):
    encoders = {column: TargetEncoder(smoothing=smoothing) for column in columns}
    train_encoded = train_df.copy()
    test_encoded = test_df.copy()

    for column in columns:
        # 학습 데이터셋에 대해 fit_transform을 사용하여 타겟 인코딩 적용
        train_encoded[column] = encoders[column].fit_transform(train_df[column], target)
        # 테스트 데이터셋에 대해 transform을 사용하여 타겟 인코딩 적용
        test_encoded[column] = encoders[column].transform(test_df[column])

    return train_encoded, test_encoded, encoders

# 인코딩할 컬럼 목록
columns_to_encode = ['country','lead_owner','historical_existing_cnt_category']

# 타겟 변수
target_column = 'is_converted'

# train, test 데이터셋에 타겟 인코딩 적용
train_encoded, test_encoded, encoders = target_encode(train, test, columns_to_encode, train[target_column], smoothing=smoothing_value)

# 결과 확인
print("Train encoded shape:", train_encoded.shape)
print("Test encoded shape:", test_encoded.shape)

train = train_encoded
test = test_encoded

Train encoded shape: (59299, 39)
Test encoded shape: (5271, 39)


In [25]:
# scailing 
#int형 또는 float형의 컬럼만 선택
numeric_columns = ['historical_existing_cnt','lead_desc_length']

scaler = StandardScaler()

# train 데이터에 대해서 fit과 transform 수행
train_scaled = train.copy()
train_scaled[numeric_columns] = scaler.fit_transform(train[numeric_columns])

# test 데이터에 대해서는 transform만 수행
test_scaled = test.copy()
test_scaled[numeric_columns] = scaler.transform(test[numeric_columns])

train = train_scaled
test = test_scaled

In [26]:
# 파생변수 생성
# 조건에 따라 새로운 컬럼에 1을 할당하는 코드
train['high_customer_type'] = (train['customer_type'] >= 0.98).astype(int)
train['high_customer_idx'] = (train['customer_idx'] >= 0.98).astype(int)
train['high_lead_owner'] = (train['lead_owner'] >= 0.98).astype(int)

train['low_customer_type'] = (train['customer_type'] <= 0.0001).astype(int)
train['low_customer_idx'] = (train['customer_idx'] <= 0.0001).astype(int)
train['low_lead_owner'] = (train['lead_owner'] <= 0.0001).astype(int)

test['high_customer_type'] = (test['customer_type'] >= 0.98).astype(int)
test['high_customer_idx'] = (test['customer_idx'] >= 0.98).astype(int)
test['high_lead_owner'] = (test['lead_owner'] >= 0.98).astype(int)

test['low_customer_type'] = (test['customer_type'] <= 0.0001).astype(int)
test['low_customer_idx'] = (test['customer_idx'] <= 0.0001).astype(int)
test['low_lead_owner'] = (test['lead_owner'] <= 0.0001).astype(int)

In [27]:
# 타겟 변수 인코딩
train.loc[train['is_converted'] == True, 'is_converted'] = 1
train.loc[train['is_converted'] == False, 'is_converted'] = 0

train['is_converted'] = train['is_converted'].astype(float)

## **데이터 분리**

In [28]:
# X와 Y로 나누기
X = train[train.columns.drop('is_converted')]
Y = train['is_converted']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.01, random_state=42, shuffle=True)
# 예측에 필요한 데이터 분리
x_test = test.drop(["is_converted"], axis=1)

## **데이터 불균형 처리**

In [29]:
smoteto = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'), random_state=42)
X_train, y_train = smoteto.fit_resample(X_train, y_train)

## **모델 학습**

In [41]:
def objective(trial):

    classifier_name = 'XGBoost'
    if classifier_name == 'DecisionTree':
        param = {
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'max_features': trial.suggest_int('max_features', 1, 30)
        }
        model = DecisionTreeClassifier(**param)
    elif classifier_name == 'LGBM':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 300, 500),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
            'num_leaves': trial.suggest_int('num_leaves', 31, 128),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
        }
        model = LGBMClassifier(**param)
    elif classifier_name == 'XGBoost':
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
            'random_state' : trial.suggest_int('random_state',42,42),
        }
        model = xgb.XGBClassifier(**param)
    elif classifier_name == 'CatBoost':
        param = {
            'iterations': trial.suggest_int('iterations', 100, 500),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
            'depth': trial.suggest_int('depth', 4, 10),
            'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
            'border_count': trial.suggest_int('border_count', 1, 255),
            'loss_function': 'Logloss',
        }
        model = cb.CatBoostClassifier(**param, verbose=False)


    model.fit(X_train, y_train)
    
    test_pred = model.predict(x_test)
    print(sum(test_pred))
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred, average='binary')
    return score

  from .autonotebook import tqdm as notebook_tqdm


In [68]:
# Optuna Study 생성 및 최적화
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=100)

# 최적화 결과 출력
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-02-25 09:44:19,375] A new study created in memory with name: no-name-76b944f8-3df6-4946-ac52-2961d87d1ee9
[I 2024-02-25 09:44:22,361] Trial 0 finished with value: 0.8659793814432989 and parameters: {'n_estimators': 250, 'learning_rate': 0.2536999076681772, 'max_depth': 8, 'min_child_weight': 6, 'subsample': 0.5780093202212182, 'colsample_bytree': 0.5779972601681014, 'random_state': 42}. Best is trial 0 with value: 0.8659793814432989.


1889


[I 2024-02-25 09:44:23,488] Trial 1 finished with value: 0.8431372549019608 and parameters: {'n_estimators': 123, 'learning_rate': 0.19030368381735815, 'max_depth': 7, 'min_child_weight': 8, 'subsample': 0.5102922471479012, 'colsample_bytree': 0.9849549260809971, 'random_state': 42}. Best is trial 0 with value: 0.8659793814432989.


1851


[I 2024-02-25 09:44:25,931] Trial 2 finished with value: 0.7731092436974789 and parameters: {'n_estimators': 433, 'learning_rate': 0.020589728197687916, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 0.6521211214797689, 'colsample_bytree': 0.762378215816119, 'random_state': 42}. Best is trial 0 with value: 0.8659793814432989.


2302


[I 2024-02-25 09:44:28,032] Trial 3 finished with value: 0.8113207547169813 and parameters: {'n_estimators': 273, 'learning_rate': 0.02692655251486473, 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.6460723242676091, 'colsample_bytree': 0.6831809216468459, 'random_state': 42}. Best is trial 0 with value: 0.8659793814432989.


1788


[I 2024-02-25 09:44:29,561] Trial 4 finished with value: 0.8316831683168316 and parameters: {'n_estimators': 282, 'learning_rate': 0.14447746112718687, 'max_depth': 4, 'min_child_weight': 6, 'subsample': 0.7962072844310213, 'colsample_bytree': 0.5232252063599989, 'random_state': 42}. Best is trial 0 with value: 0.8659793814432989.


1731


[I 2024-02-25 09:44:31,304] Trial 5 finished with value: 0.7076923076923077 and parameters: {'n_estimators': 343, 'learning_rate': 0.0178601378893971, 'max_depth': 3, 'min_child_weight': 10, 'subsample': 0.9828160165372797, 'colsample_bytree': 0.9041986740582306, 'random_state': 42}. Best is trial 0 with value: 0.8659793814432989.


2598


[I 2024-02-25 09:44:33,302] Trial 6 finished with value: 0.8108108108108109 and parameters: {'n_estimators': 222, 'learning_rate': 0.013940346079873234, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.5610191174223894, 'colsample_bytree': 0.7475884550556351, 'random_state': 42}. Best is trial 0 with value: 0.8659793814432989.


2119


[I 2024-02-25 09:44:34,130] Trial 7 finished with value: 0.8514851485148515 and parameters: {'n_estimators': 113, 'learning_rate': 0.22038218939289875, 'max_depth': 5, 'min_child_weight': 7, 'subsample': 0.6558555380447055, 'colsample_bytree': 0.7600340105889054, 'random_state': 42}. Best is trial 0 with value: 0.8659793814432989.


1786


[I 2024-02-25 09:44:37,289] Trial 8 finished with value: 0.8269230769230769 and parameters: {'n_estimators': 319, 'learning_rate': 0.01875220945578641, 'max_depth': 10, 'min_child_weight': 8, 'subsample': 0.9697494707820946, 'colsample_bytree': 0.9474136752138245, 'random_state': 42}. Best is trial 0 with value: 0.8659793814432989.


1791


[I 2024-02-25 09:44:38,962] Trial 9 finished with value: 0.8400000000000001 and parameters: {'n_estimators': 339, 'learning_rate': 0.22999586428143728, 'max_depth': 3, 'min_child_weight': 2, 'subsample': 0.522613644455269, 'colsample_bytree': 0.6626651653816322, 'random_state': 42}. Best is trial 0 with value: 0.8659793814432989.


1974


[W 2024-02-25 09:44:39,332] Trial 10 failed with parameters: {'n_estimators': 478, 'learning_rate': 0.06690992453172911, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.8200442512337781, 'colsample_bytree': 0.5193625999805914, 'random_state': 42} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/elicer/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_196/1529646471.py", line 51, in objective
    model.fit(X_train, y_train)
  File "/home/elicer/.local/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/home/elicer/.local/lib/python3.10/site-packages/xgboost/sklearn.py", line 1519, in fit
    self._Booster = train(
  File "/home/elicer/.local/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/home/elicer/.local/lib/python3.10/site-pac

KeyboardInterrupt: 

## **모델 추론**

In [30]:
model_xgb = xgb.XGBClassifier(random_state=42,
                              max_depth=4,
                              n_estimators=300,
                              learning_rate= 0.040589728197687916,
                              min_child_weight = 2,
                              subsample = 0.6521211214797689,
                              colsample_bytree = 0.762378215816119)
                              
              
model_xgb.fit(X_train, y_train)


test_pred = model_xgb.predict(x_test)
print(sum(test_pred))  # True로 예측된 개수

y_pred_xgb = model_xgb.predict(X_test)
# 정확도 및 F1 점수 계산
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"XGB Accuracy: {accuracy_xgb}")
print(f1_score(y_test, y_pred_xgb, average='binary'))

2196
XGB Accuracy: 0.9612141652613828
0.7927927927927927


In [31]:
model_xgb_2 = xgb.XGBClassifier(random_state=42,
                              max_depth=8,
                              n_estimators=222,
                              learning_rate= 0.013940346079873234,
                              min_child_weight = 5,
                              subsample = 0.5610191174223894,
                              colsample_bytree = 0.7475884550556351)
                              
              
model_xgb_2.fit(X_train, y_train)


test_pred_2 = model_xgb_2.predict(x_test)
print(sum(test_pred_2))  # True로 예측된 개수

y_pred_xgb_2 = model_xgb_2.predict(X_test)
# 정확도 및 F1 점수 계산
accuracy_xgb_2 = accuracy_score(y_test, y_pred_xgb_2)

print(f"XGB Accuracy: {accuracy_xgb_2}")
print(f1_score(y_test, y_pred_xgb_2, average='binary'))

2135
XGB Accuracy: 0.9645868465430016
0.8108108108108109


In [32]:
proba_xgb = model_xgb_2.predict_proba(x_test)[:, 1]
xgb_result1 = np.where(proba_xgb >= 0.64, 1, 0)

print(test_pred_2.sum())
print(xgb_result1.sum())

2135
1636


In [33]:
test_pred[(xgb_result1 == 1) & (test_pred == False)] = True

In [34]:
print((test_pred).sum())

2204


In [60]:
# 1을 True로, 0을 False로 변환
test_pred = test_pred.astype(bool)

In [61]:
# 제출 데이터 생성
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred
df_sub.to_csv("submission_test.csv", index=False)