### 1. 데이터 불러오기, 라이브러리 설치 

In [1]:
# Mount Google Drive at /content/drive (Colab only) so the CSV files read
# further down from /content/drive/Shareddrives/... are reachable.

from google.colab import drive
drive.mount('/content/drive')

# 라이브러리 설치

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
import sklearn.metrics as metrics
import lightgbm as lgb

# Load the train and test CSV files from the mounted shared drive.

train = pd.read_csv("/content/drive/Shareddrives/🍕PSAT_Summer_Seminar_Team1🍕/Data/train.csv")
test = pd.read_csv("/content/drive/Shareddrives/🍕PSAT_Summer_Seminar_Team1🍕/Data/test.csv")

# Store the feature columns and the target column of the train data separately.
# The first column is skipped exactly as before (presumably an ID column --
# confirm against the CSV). 'target' is additionally removed from X so the
# label cannot leak into the features; errors="ignore" makes this a no-op if
# 'target' is not among the sliced columns.

X = train.iloc[:, 1:].drop(columns=["target"], errors="ignore")
y = train["target"]

Mounted at /content/drive




In [7]:
# 소요시간 측정을 위한 타이머 세팅 

from datetime import datetime

def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since *start_time*.

    Called with no (or a falsy) argument, returns ``datetime.now()`` to be
    used as the start mark. Called with a start mark, prints the elapsed
    hours, minutes and seconds and returns ``None``.
    """
    if start_time:
        elapsed = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None

    # No start mark supplied: hand back "now" as the new start mark.
    return datetime.now()

### 2. Custom Rebalancing Method (클래스 불균형 해결을 위한 언더샘플링 함수)

In [8]:
def numeric_cols(input_df):
    """Return a DataFrame holding only the numeric columns of *input_df*.

    Along the way this prints the numeric column index, the type of the
    extracted column-name array, and ``describe()`` of the resulting frame.
    """
    # Names of the numeric columns
    numeric_index = input_df.select_dtypes('number').columns
    print(numeric_index)

    column_names = numeric_index.values
    print(type(column_names))

    # Build the numeric-only frame by indexing the input with those names
    numeric_df = input_df[column_names]
    print(numeric_df.describe())
    return numeric_df

def balanced_sampling(input_df, factor, test_size=0.25, random_state=1):
    """Undersample the target == 0 rows and split the result into train/validation sets.

    Args:
        input_df: DataFrame containing a ``target`` column with values 0 and 1.
            Only its numeric columns are used (see ``numeric_cols``).
        factor: How many target == 0 rows to keep per target == 1 row.
            If ``factor * (number of target == 1 rows)`` exceeds the number of
            available target == 0 rows, all target == 0 rows are kept.
        test_size: Fraction of the rebalanced data held out for validation.
        random_state: Seed used for both the undersampling and the split.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` from a stratified
        ``train_test_split`` of the rebalanced data, with ``target`` removed
        from the feature frames.
    """
    # Keep only the numeric columns (helper defined earlier in this file)
    numeric_df = numeric_cols(input_df)

    # Separate the target == 1 rows from the target == 0 rows
    positives = numeric_df[numeric_df.target == 1]
    negatives = numeric_df[numeric_df.target == 0]
    total_target = positives.shape
    print("Target Size : ", total_target[1], total_target[0])

    # Sample factor * (#positives) rows from the negatives. The count is cast
    # to int (DataFrame.sample needs an integer n) and capped at the number of
    # available negatives so an over-large factor does not raise.
    n_requested = int(factor * total_target[0])
    n_sampled = min(n_requested, len(negatives))
    sampled_negatives = negatives.sample(n_sampled, random_state=random_state)

    # Combine the positives with the sampled negatives into one frame
    X = pd.concat([positives, sampled_negatives], ignore_index=True)
    y = X['target']
    print(X.shape)
    # Preview at most 10 random rows (fewer if the frame is smaller)
    print(X.sample(min(10, len(X))))

    # The label must not remain among the features
    X = X.drop(columns=["target"])

    # Stratified split of the rebalanced data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

### 3. 모델링 

In [9]:
# Data rebalancing: factor 3 keeps three target == 0 rows per target == 1 row.
# NOTE: X_test / y_test here are the held-out validation split returned by
# balanced_sampling, not the `test` DataFrame loaded from test.csv.

X_train, X_test, y_train, y_test = balanced_sampling(train,3)

# Build the model with the best parameters found by a randomized search
# (the original note says "RandomSearchCV" -- presumably sklearn's
# RandomizedSearchCV; the search itself is not part of this section).
# NOTE(review): this assignment rebinds `lgb`, which the import section binds
# to the lightgbm module (`import lightgbm as lgb`); after this line `lgb` is
# the classifier instance.
# NOTE(review): several arguments look like LightGBM aliases of one another
# with differing values (bagging_fraction=0.85 vs subsample=0.8,
# bagging_freq=1 vs subsample_freq=0, seed=500 vs random_state=1,
# num_threads=8 vs n_jobs=-1) -- TODO confirm which value takes effect.
# `gamma` and `silent` may not be recognized by the installed LightGBM
# version -- verify.

lgb = LGBMClassifier(bagging_fraction=0.85, bagging_freq=1, boost='gbdt',
               boosting_type='gbdt', class_weight=None, colsample_bytree=1,
               feature_fraction=1, gamma=1, importance_type='split',
               learning_rate=0.1, max_bin=256, max_depth=1,
               min_child_samples=153, min_child_weight=0.1, min_split_gain=0.0,
               n_estimators=4000, n_jobs=-1, num_leaves=4, num_threads=8,
               objective='binary', random_state=1, reg_alpha=0.1,
               reg_lambda=0, seed=500, silent=True, subsample=0.8,
               subsample_for_bin=200000, subsample_freq=0)

# Fit the model; the timer brackets both fitting and prediction on the
# validation split, and the second timer call prints the elapsed time.

start_time = timer(None)
lgb.fit(X_train, y_train)
y_pred = lgb.predict(X_test)
timer(start_time)

# Accuracy and F1 score computed on the validation split

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print('F1_score:', metrics.f1_score(y_test, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn'))

Index(['target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6',
       'var_7', 'var_8',
       ...
       'var_190', 'var_191', 'var_192', 'var_193', 'var_194', 'var_195',
       'var_196', 'var_197', 'var_198', 'var_199'],
      dtype='object', length=201)
<class 'numpy.ndarray'>
             target         var_0         var_1         var_2         var_3  \
count  28000.000000  28000.000000  28000.000000  28000.000000  28000.000000   
mean       0.100321     10.685383     -1.610482     10.737660      6.786323   
std        0.300433      3.065415      4.046000      2.632142      2.029132   
min        0.000000      1.283200    -13.688700      2.615000      0.255000   
25%        0.000000      8.451800     -4.713100      8.761400      5.269275   
50%        0.000000     10.525650     -1.570650     10.591150      6.816400   
75%        0.000000     12.763600      1.364000     12.527875      8.289725   
max        1.000000     19.701100     10.335600     18.412700     12.