# 라이브러리

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
import re
from functools import partial
from scipy.stats import mode
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', 100)
plt.style.use('ggplot')

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, FunctionTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, cohen_kappa_score, log_loss, f1_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import CalibrationDisplay
from sklearn.inspection import PartialDependenceDisplay, permutation_importance
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import optuna
import xgboost as xgb

# Read Dataset

In [3]:
# Load the train and test CSV files and report the shape of each frame.
train_path = '/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/train.csv'
test_path = '/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
print(train_df.shape, test_df.shape)

(159256, 24) (106171, 23)


# Preprocessing

In [None]:
def create_extra_features(df):
    """Reorder paired ear/eye readings and cap lab values, modifying ``df`` in place.

    * Hearing: the lower of the two readings is stored in ``hearing(left)``
      and the higher in ``hearing(right)``; both are shifted down by one.
    * Eyesight: readings above 9 are recoded to 0, then the lower reading is
      stored in ``eyesight(left)`` and the higher in ``eyesight(right)``.
    * ``Gtp``, ``HDL``, ``LDL``, ``ALT``, ``AST`` and ``serum creatinine``
      are clipped to ``[0, cap]`` with a per-column cap.

    Args:
        df: DataFrame holding all of the columns named above.

    Returns:
        None. ``df`` is updated in place.
    """

    def ordered_pair(left, right):
        # Rows where `left < right` is false (ties included) take `right`
        # as the lower value and `left` as the higher one.
        left_is_lower = left < right
        lower = np.where(left_is_lower, left, right)
        higher = np.where(left_is_lower, right, left)
        return lower, higher

    # Ears: order the pair, then shift both readings down by one.
    lower, higher = ordered_pair(df['hearing(left)'], df['hearing(right)'])
    df['hearing(left)'] = lower - 1
    df['hearing(right)'] = higher - 1

    # Eyes: the scale runs worst-to-best, so readings above 9 are recoded
    # to 0 (treated as worst) before the pair is ordered.
    for eye_col in ('eyesight(left)', 'eyesight(right)'):
        df[eye_col] = np.where(df[eye_col] > 9, 0, df[eye_col])
    lower, higher = ordered_pair(df['eyesight(left)'], df['eyesight(right)'])
    df['eyesight(left)'] = lower
    df['eyesight(right)'] = higher

    # Cap lab measurements at a fixed upper bound (lower bound is 0).
    upper_caps = {
        'Gtp': 300,
        'HDL': 110,
        'LDL': 200,
        'ALT': 150,
        'AST': 100,
        'serum creatinine': 3,
    }
    for lab_col, cap in upper_caps.items():
        df[lab_col] = np.clip(df[lab_col], 0, cap)

# Apply the in-place preprocessing to the training frame, then the test frame.
for frame in (train_df, test_df):
    create_extra_features(frame)

In [None]:
# Correlation of every column with the target, sorted ascending.
train_df.corr().loc[:, 'smoking'].sort_values()

In [None]:
def bmi_(BMI):
    """Map a body-mass index to a coarse weight category.

    The ranges are contiguous and half-open, so every value lands in the
    category it belongs to:

    * ``BMI < 18.5``        -> ``'Underwieght'``
    * ``18.5 <= BMI < 25``  -> ``'Normal'``
    * ``25 <= BMI < 30``    -> ``'Overweight'``
    * ``BMI >= 30``         -> ``'Obese'``

    The previous strict ``>``/``<`` bounds left gaps (exactly 18.5, exactly
    25, anything in [24.99, 25] or [29.99, 30)) that fell through to
    ``'Obese'``.

    Args:
        BMI: Body-mass index as a number.

    Returns:
        One of the four category labels above. A value that fails every
        comparison (e.g. NaN) still returns ``'Obese'``, as before.
    """
    # NOTE: the label spelling 'Underwieght' is kept unchanged on purpose;
    # it is a runtime value that callers may depend on.
    if BMI < 18.5:
        return 'Underwieght'
    if BMI < 25:
        return 'Normal'
    if BMI < 30:
        return 'Overweight'
    return 'Obese'
# Training rows: compute BMI (kg / m^2), bucket it with bmi_, then drop the
# raw BMI column so only the category remains.
train_df['bmi'] = train_df['weight(kg)'].div(train_df['height(cm)'].div(100).pow(2))
train_df['bmi_cate'] = train_df['bmi'].map(bmi_)
train_df['bmi_cate'].value_counts()
train_df = train_df.drop(columns='bmi')

In [None]:
# Test rows: same BMI -> category derivation as for the training frame.
test_df['bmi'] = test_df['weight(kg)'].div(test_df['height(cm)'].div(100).pow(2))
test_df['bmi_cate'] = test_df['bmi'].map(bmi_)
test_df = test_df.drop(columns='bmi')

In [None]:
# One-hot encode the BMI category in both frames, dropping the first level.
train_df, test_df = (
    pd.get_dummies(frame, columns=['bmi_cate'], drop_first=True)
    for frame in (train_df, test_df)
)

In [None]:
def y_gtp(gtp):
    """Return 1 when ``gtp`` lies strictly between 5 and 40, otherwise 0."""
    return 1 if 5 < gtp < 40 else 0
# Add the Gtp in-range flag to both frames.
train_df['gtp_cate'] = train_df['Gtp'].map(y_gtp)
test_df['gtp_cate'] = test_df['Gtp'].map(y_gtp)

In [None]:
# Bar chart of how often each Gtp value occurs in the training data.
value_counts = train_df['Gtp'].value_counts().sort_index()
plt.figure(figsize=(12, 6))
plt.bar(x=value_counts.index, height=value_counts.values)

In [None]:
# Bar chart of how often each AST value occurs in the training data.
value_counts = train_df['AST'].value_counts().sort_index()
plt.figure(figsize=(12, 6))
plt.bar(x=value_counts.index, height=value_counts.values)

In [None]:
def check_ast_alt(x):
    """Return 1 when ``x`` lies strictly between 8 and 45, otherwise 0."""
    return 1 if 8 < x < 45 else 0
# Add the AST and ALT in-range flags to the training frame, then the test frame.
for frame in (train_df, test_df):
    frame['ast_cate'] = frame["AST"].apply(check_ast_alt)
    frame['alt_cate'] = frame["ALT"].apply(check_ast_alt)

In [None]:
# One-hot encode the three in-range flags in both frames, dropping the first level.
cols = ['ast_cate','alt_cate','gtp_cate']
train_df, test_df = (
    pd.get_dummies(data=frame, columns=cols, drop_first=True)
    for frame in (train_df, test_df)
)

In [None]:
# Number of distinct values per feature column (id and target excluded).
train_df.drop(columns=['id', 'smoking']).nunique()

In [None]:
# One-hot encode the remaining low-cardinality columns.
#
# Encoding train and test independently produces different dummy columns
# (and a different dropped baseline level) whenever a level is missing from
# one of the frames. Pinning both frames to the levels seen in training makes
# get_dummies emit the same columns, with the same baseline, for both.
cols = ['dental caries','Urine protein','hearing(left)','hearing(right)']
for col in cols:
    levels = np.sort(train_df[col].dropna().unique())
    train_df[col] = pd.Categorical(train_df[col], categories=levels)
    # A test value never seen in training becomes NaN here and is encoded
    # as all-zero dummies.
    test_df[col] = pd.Categorical(test_df[col], categories=levels)
train_df = pd.get_dummies(data = train_df, columns=cols, drop_first=True)
test_df = pd.get_dummies(data = test_df, columns=cols, drop_first=True)

# Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the row id and the target; 30% of the rows
# are held out for evaluation with a fixed seed.
y = train_df['smoking']
X = train_df.drop(columns=['id', 'smoking'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# log(1 + x) transform of every feature in both halves of the split.
X_train_logt, X_test_logt = np.log1p(X_train), np.log1p(X_test)

In [None]:
from sklearn.metrics import roc_auc_score

# XGBoost hyper-parameters. The dict is declared with the GPU tree method and
# then switched to the CPU histogram method before the model is built.
params = dict(
    learning_rate=0.0192025093223225293453,
    colsample_bytree=0.21329015151846925,
    colsample_bylevel=0.6748369225084079,
    subsample=0.8831564960046078,
    reg_alpha=1.1496763786731952e-05,
    reg_lambda=7.512814356733987e-07,
    max_depth=14,
    n_estimators=1500,
    min_child_weight=21,
    tree_method='gpu_hist',
    eval_metric='auc',
    booster='gbtree',
    n_jobs=-1,
    verbosity=0,
)
params.update(tree_method='hist')  # Use CPU for training
xgb_model = xgb.XGBClassifier(**params)

# Fit on the training split and score positive-class probabilities on the hold-out.
xgb_model.fit(X_train_logt, y_train)
predictions_xgb = xgb_model.predict_proba(X_test_logt)[:, 1]
roc_auc = roc_auc_score(y_test, predictions_xgb)
roc_auc # 0.870410157711284

In [None]:
# LightGBM configuration, fitted on the log-transformed training split.
# NOTE(review): LightGBM's parameter docs state that row subsampling is only
# applied when subsample_freq (bagging_freq) is > 0. It is not set here, so
# `subsample` is presumably a no-op - confirm before relying on it.
lgbm_params = dict(
    n_estimators=2000,
    max_depth=13,
    subsample=0.4796,
    num_leaves=120,
    learning_rate=0.023512,
    colsample_bytree=0.273534,
    reg_alpha=1.1496763786731952e-05,
    reg_lambda=7.512814356733987e-07,
)
lgbm_classifier = LGBMClassifier(**lgbm_params)
lgbm_classifier.fit(X_train_logt, y_train)

# Hold-out ROC AUC from the positive-class probabilities.
predictions_lgbm = lgbm_classifier.predict_proba(X_test_logt)[:, 1]
roc_auc = roc_auc_score(y_test, predictions_lgbm)
roc_auc

In [None]:
# Histogram gradient boosting with a fixed iteration count (no early stopping).
hist_gb_params = dict(
    l2_regularization=0.065,
    early_stopping=False,
    learning_rate=0.07,
    max_iter=300,
    max_depth=11,
    max_bins=255,
    min_samples_leaf=25,
    max_leaf_nodes=60,
)
hist_gb_classifier = HistGradientBoostingClassifier(**hist_gb_params)
hist_gb_classifier.fit(X_train_logt, y_train)

# Hold-out ROC AUC from the positive-class probabilities.
predictions_hist_gb = hist_gb_classifier.predict_proba(X_test_logt)[:, 1]
roc_auc = roc_auc_score(y_test, predictions_hist_gb)
roc_auc

In [None]:
# Blend the three hold-out probability vectors with fixed weights.
ensemble_predictions = (
    0.45 * predictions_lgbm
    + 0.85 * predictions_xgb
    + 0.15 * predictions_hist_gb
)

# ROC AUC of the blended scores on the hold-out split.
ensemble_roc_auc = roc_auc_score(y_test, ensemble_predictions)
print("Ensemble ROC AUC Score:", ensemble_roc_auc)

In [None]:
# Drop the row id from the test frame.
test_df = test_df.drop(columns='id')

In [None]:
# Report the current shape of both frames.
train_shape, test_shape = train_df.shape, test_df.shape
print('The shape of training', train_shape)
print('The shape of testing', test_shape)

In [None]:
# Full-data training inputs: every training row, id and target removed.
y_to_train = train_df['smoking']
X_ = train_df.drop(columns=['id', 'smoking'])

# log(1 + x) transform of the full training features and of the test features.
X_to_train, X_to_test = np.log1p(X_), np.log1p(test_df)

In [None]:
# Report the shape of the transformed training and test matrices.
train_shape, test_shape = X_to_train.shape, X_to_test.shape
print('The shape of training', train_shape)
print('The shape of testing', test_shape)

In [None]:
def all_model(X_train_all_scaled,y_all,X_test_all_scaled):
    """Refit the three module-level models and return their weighted blend.

    ``lgbm_classifier``, ``xgb_model`` and ``hist_gb_classifier`` are refit
    in that order on the given training data. Their positive-class
    probabilities for the test data are combined with relative weights
    0.45 / 0.85 / 0.15.

    The weights sum to 1.45, so the raw weighted sum could exceed 1. The
    blend is therefore divided by the weight total, which keeps every output
    in [0, 1]. Dividing by a positive constant does not change the ordering
    of the scores, so rank-based metrics such as ROC AUC are unaffected.

    Args:
        X_train_all_scaled: Training features passed to each model's ``fit``.
        y_all: Training labels passed to each model's ``fit``.
        X_test_all_scaled: Features passed to each model's ``predict_proba``.

    Returns:
        1-D array with one weighted-average positive-class probability per
        row of ``X_test_all_scaled``.
    """
    weighted_models = (
        (lgbm_classifier, 0.45),
        (xgb_model, 0.85),
        (hist_gb_classifier, 0.15),
    )
    total_weight = sum(weight for _, weight in weighted_models)

    ensemble_predictions = 0
    for model, weight in weighted_models:
        model.fit(X_train_all_scaled, y_all)
        positive_proba = model.predict_proba(X_test_all_scaled)[:, 1]
        ensemble_predictions = ensemble_predictions + weight * positive_proba

    return ensemble_predictions / total_weight

In [None]:
# Refit on all training rows and get blended predictions for the test rows.
predictions_ = all_model(
    X_train_all_scaled=X_to_train,
    y_all=y_to_train,
    X_test_all_scaled=X_to_test,
)

In [None]:
# Number of blended predictions returned by all_model.
len(predictions_)

In [None]:
# Build the submission: take the sample file, replace its 'smoking' column
# with the blended predictions (matched by row position), and write the
# resulting two-column frame to disk.
predictions = pd.DataFrame(predictions_)
submission = pd.read_csv('/Users/raekkkky/Desktop/DNA/Semester_2023_2/5주차/sample_submission.csv')
combined_df_ann = pd.concat(
    [submission.drop(columns='smoking'), predictions],
    axis=1,
)
combined_df_ann.columns=['id','smoking']
combined_df_ann.to_csv('/Users/raekkkky/Desktop/DNA/Semester_2023_2/7주차/sample_submission(1).csv',index=False)

In [None]:
# Display the frame that was written out as the submission file.
combined_df_ann