In [None]:
import pandas as pd
import numpy as np
import datetime
import random
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.feature_selection import f_regression, SelectKBest
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Lasso, LogisticRegression
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Report the interpreter and core library versions this notebook runs on.
version_report = [
    f"파이썬 버전 : {sys.version}",
    f"pandas 버전 : {pd.__version__}",
    f"numpy 버전 : {np.__version__}",
    f"sklearn 버전 : {sklearn.__version__}",
]
print(*version_report, sep="\n")

In [None]:
def seed_everything(seed: int = 2024):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(2024)

In [None]:
# Load the training and test sets from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv"))

In [None]:
# Replace missing `keyword` / `referral_path` entries with a fixed placeholder
# category (the original author notes this step is not strictly required).
#
# The filled column is assigned back instead of calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place form raises a
# FutureWarning on pandas 2.x (silenced here by warnings.filterwarnings) and
# leaves the frame unchanged once copy-on-write is active.
fill_values = {
    'keyword': 'Category4_Keyword_618',
    'referral_path': 'Category6_Path_1577',
}
for frame in (train, test):
    for col, placeholder in fill_values.items():
        frame[col] = frame[col].fillna(placeholder)

In [None]:
# Some browser values share a prefix but end in a random suffix. They are
# collapsed into two buckets based on the length of the string.
for frame in (train, test):
    # .str.len() yields NaN for missing values (comparisons are then False),
    # whereas map(len) would raise a TypeError on them.
    name_len = frame['browser'].str.len()
    frame.loc[name_len.between(25, 28), 'browser'] = 'case1'
    frame.loc[name_len >= 29, 'browser'] = 'CT_JOB_ID'

# Browsers whose mean TARGET in the training set is exactly 1 are merged into a
# single 'set_1' category instead of being kept as separate levels.
bro = pd.DataFrame(train.groupby('browser', as_index=True)['TARGET'].mean())
bro_1 = bro[bro['TARGET'] == 1.0].index

# Vectorised membership test; replaces the row-by-row .loc loops, which also
# silently relied on both frames still carrying a default 0..n-1 index.
train.loc[train['browser'].isin(bro_1), 'browser'] = 'set_1'
test.loc[test['browser'].isin(bro_1), 'browser'] = 'set_1'

In [None]:
# Per the author's analysis, every row with bounced == 1 has TARGET == 1, so
# those rows are removed here and handled separately at submission time.
for frame in (train, test):
    bounced_rows = frame.index[frame['bounced'] == 1]
    frame.drop(index=bounced_rows, inplace=True)

In [None]:
# Outlier removal was tried, but accuracy was higher without it, so the filters below are left disabled.

# train.drop(train[train['transaction']==12].index, inplace = True)

# train.drop(train[train['transaction_revenue']>=1*(10**9)].index, inplace = True)

# train.loc[(25<=train['browser'].map(len))&(train['browser'].map(len)<=28),'browser']='case1'

# train.loc[train['browser'].map(len)>=29,'browser']='CT_JOB_ID'

# test.loc[(25<=test['browser'].map(len))&(test['browser'].map(len)<=28),'browser']='case1'

# test.loc[test['browser'].map(len)>=29,'browser']='CT_JOB_ID'

# train.drop(train[train['TARGET']>=350].index, inplace = True)

# train.drop(train[train['duration']>=6000].index, inplace = True)
# train.drop(train[train['TARGET']>=100].index, inplace = True)
# train.drop(train[(train['new']==0) & (train['TARGET']>=120)].index,inplace=True)
# train.drop(train[(train['new']==1) & (train['TARGET']>=130)].index,inplace=True)
# train.drop(train[(train['OS']=='Windows') & (train['TARGET']>=100)].index,inplace=True)
# train.drop(train[(train['OS']=='Chrome OS') & (train['TARGET']>=80)].index,inplace=True)
# train.drop(train[(train['OS']=='Xbox') & (train['TARGET']>=15)].index,inplace=True)
# train.drop(train[(train['OS']=='Playstation Vita') & (train['TARGET']>=3)].index,inplace=True)
# train.drop(train[(train['subcontinent']=='Southern Asia') & (train['TARGET']>=100)].index,inplace=True)
# train.drop(train[(train['subcontinent']=='Southern Africa') & (train['TARGET']>=100)].index,inplace=True)

In [None]:
# train.drop(train[train['bounced']==1].index,inplace=True)
# Identifier columns and the bounce flag are not used as model features.
id_columns = ['sessionID', 'userID', 'bounced']
train = train.drop(columns=id_columns)

# test_bounced = test[test['bounced']==1]
# test.drop(test[test['bounced']==1].index,inplace=True)
# test_ID=test['sessionID']
test = test.drop(columns=id_columns)


# Untouched copy of the raw test file.
test1 = pd.read_csv('./test.csv')

In [None]:
#browser가 (not set)인 경우 처리
# test_ns = test[test['browser']=='(not set)']
# test.drop(test[test['browser']=='(not set)'].index,inplace=True)
# test_ns2 = pd.concat([test_ns,test[(test['country']=='Turks & Caicos Islands') | (test['country']=='Gambia')]],axis=1)
# test.drop(test[(test['country']=='Turks & Caicos Islands') | (test['country']=='Gambia')].index,inplace=True)

In [None]:
# Replace test values that never appear in train with the column's most frequent test value (disabled).
# lst_col=['browser','country','traffic_source','keyword','referral_path']
# for col in lst_col:
#     test
#     mx = test.value_counts(col).idxmax()
#     lst=train[col].unique()
#     for k in range(len(test)):
#         if test.loc[k,col] not in lst:
#             test.loc[k,col]=mx
    

In [None]:
# encoding_target = list(train.dtypes[train.dtypes == "object"].index)

#너무 다양한 독립변수 제거
# train.drop(columns=['keyword','traffic_source','referral_path'],inplace=True)
# test.drop(columns=['keyword','traffic_source','referral_path'],inplace=True)

In [None]:
# Rebuild a contiguous 0..n-1 index on both frames after the row drops.
train, test = (frame.reset_index(drop=True) for frame in (train, test))

In [None]:
# One-hot encode every object-dtype column. Each encoder is fitted on the
# training column only; categories that appear only in test are encoded as
# all-zero rows (handle_unknown='ignore').
encoding_target = list(train.dtypes[train.dtypes == "object"].index)
train_encoded, test_encoded = [], []
for obj in encoding_target:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    ohe.fit(np.array(train[obj]).reshape(-1, 1))

    # Name the output columns from the encoder's own category order.
    # OneHotEncoder emits columns in sorted category order, while
    # Series.unique() returns first-appearance order, so labelling with
    # unique() attached the wrong category name to each column.
    cols = [obj + str(category) for category in ohe.categories_[0]]

    train_encoded.append(
        pd.DataFrame(
            ohe.transform(np.array(train[obj]).reshape(-1, 1)),
            index=list(range(len(train))),
            columns=cols,
        )
    )
    test_encoded.append(
        pd.DataFrame(
            ohe.transform(np.array(test[obj]).reshape(-1, 1)),
            index=list(range(len(test))),
            columns=cols,
        )
    )

# Concatenate once instead of once per column to avoid repeatedly copying
# (and fragmenting) the growing frames.
train = pd.concat([train, *train_encoded], axis=1)
test = pd.concat([test, *test_encoded], axis=1)

In [None]:
# Remove the original categorical columns now that their one-hot columns exist.
for frame in (train, test):
    frame.drop(columns=encoding_target, inplace=True)

In [None]:
# Separate the regression target from the feature matrix.
train_y = train['TARGET']
train_x = train.drop('TARGET', axis=1)

In [None]:
# Keep copies of the full feature matrix and target, then hold out 10% of the
# rows as a validation split.
Train, Y = train_x.copy(), train_y.copy()
train_x, val_x, train_y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.1,
    shuffle=True,
    random_state=2024,
)

In [None]:
# Univariate feature selection: keep the 20 features ranked highest by
# f_regression, fitted on the training split and applied to the validation
# split.
selector = SelectKBest(score_func=f_regression, k=20)
# The original cell referenced X_train / y_train / X_test, which are never
# defined in this notebook (NameError on a fresh kernel); the split produced
# by train_test_split is train_x / train_y / val_x.
X_train_selected = selector.fit_transform(train_x, train_y)
# Only transform the held-out data so the selection is learned from train alone.
# NOTE: the name X_test_selected is kept from the original cell, but it holds
# the validation split.
X_test_selected = selector.transform(val_x)
X_train_selected.shape, X_test_selected.shape

In [None]:
# Hyper-parameter search space for the LightGBM regressor.
params = dict(
    n_estimators=[200, 500, 1000, 2000],
    learning_rate=[0.1, 0.05, 0.01],
    max_depth=[9, 10, 11],
    colsample_bytree=[0.8, 0.9, 1.0],
    subsample=[0.8, 0.9, 1.0],
)

# n_iter sets how many parameter combinations are sampled and evaluated.
clf = RandomizedSearchCV(
    estimator=LGBMRegressor(),
    param_distributions=params,
    n_iter=25,
    scoring='neg_mean_squared_error',
    cv=3,
    random_state=2024,
)
clf.fit(train_x, train_y)

# Best mean cross-validated score (negative MSE, so closer to 0 is better).
clf.best_score_

In [None]:
# Parameter combination that achieved the best score in the randomized search.
clf.best_params_

## LGBM

In [None]:
# LightGBM regressor with fixed hyper-parameters, fitted on the training split.
lgbm_settings = {
    'subsample': 1.0,
    'n_estimators': 1000,
    'max_depth': 9,
    'learning_rate': 0.01,
    'colsample_bytree': 0.8,
}
lgbm = LGBMRegressor(**lgbm_settings)
lgbm.fit(train_x, train_y)

# Validation predictions, floored at 1.
pred_lgbm = lgbm.predict(val_x)
pred_lgbm[pred_lgbm < 1] = 1
pred_lgbm

## XGBoost

In [None]:
# XGBoost regressor with fixed hyper-parameters, fitted on the training split.
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=10,
    random_state=2024,
)
xgb.fit(train_x, train_y)

# Validation predictions, floored at 1.
pred_xgb = xgb.predict(val_x)
pred_xgb[pred_xgb < 1] = 1
pred_xgb

In [None]:
# Validation RMSE of each single model, with raw and rounded predictions.
#
# * The square root is taken explicitly: the `squared=False` argument of
#   mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
# * The four scores are gathered into one Series because a notebook cell only
#   displays its last expression; the first three values used to be discarded.
pd.Series(
    {
        'lgbm': np.sqrt(mean_squared_error(val_y, pred_lgbm)),
        'lgbm (rounded)': np.sqrt(mean_squared_error(val_y, np.round(pred_lgbm))),
        'xgb': np.sqrt(mean_squared_error(val_y, pred_xgb)),
        'xgb (rounded)': np.sqrt(mean_squared_error(val_y, np.round(pred_xgb))),
    },
    name='val_rmse',
)

## Voting

In [None]:
# Voting ensemble: averages the predictions of a LightGBM and an XGBoost model.
#
# The base estimators get their own names (vote_xgb / vote_lgbm) instead of
# rebinding `xgb` / `lgbm`. VotingRegressor fits *clones* of the estimators it
# is given, so the objects passed in stay unfitted; reusing the names replaced
# the already-fitted single models with unfitted ones and made the later
# `lgbm.predict(test)` / `xgb.predict(test)` calls fail.
vote_xgb = XGBRegressor(n_estimators=1000, random_state=2024, learning_rate=0.01, max_depth=10)

vote_lgbm = LGBMRegressor(subsample=1.0,
                          n_estimators=1000,
                          max_depth=10,
                          learning_rate=0.01,
                          colsample_bytree=0.8)

vote_model = VotingRegressor(
    estimators=[("lgbm", vote_lgbm), ("xgb", vote_xgb)]
)

vote_model.fit(train_x, train_y)

# Validation predictions, floored at 1 like the single-model predictions.
pred_vote = vote_model.predict(val_x)
pred_vote[pred_vote < 1] = 1
pred_vote

In [None]:
# Validation RMSE of the voting ensemble, with raw and rounded predictions.
# The square root is taken explicitly because `squared=False` is deprecated in
# scikit-learn 1.4 and removed in 1.6; both scores are returned together since
# a cell only displays its last expression.
pd.Series(
    {
        'vote': np.sqrt(mean_squared_error(val_y, pred_vote)),
        'vote (rounded)': np.sqrt(mean_squared_error(val_y, np.round(pred_vote))),
    },
    name='val_rmse',
)

## Stacking

In [None]:
# Stacking: build the meta-feature table from the rounded validation
# predictions of the two base models.
new_train = pd.DataFrame(
    {
        'pred_lgbm': np.round(pred_lgbm),
        'pred_xgb': np.round(pred_xgb),
    }
)

In [None]:
# The meta-features in new_train are predictions for the validation split, so
# the matching labels are val_y. `Y` is the full pre-split target; assigning it
# aligned on index and attached the labels of unrelated rows. to_numpy() drops
# val_y's shuffled index so the values are assigned positionally.
new_train['TARGET'] = val_y.to_numpy()

In [None]:
# Fit the meta-model on the base-model predictions. The original referenced a
# non-existent 'best_quality' column (KeyError) and called drop() without
# `columns=`, which targets row labels; the target column here is 'TARGET'.
stacking = RandomForestRegressor().fit(new_train.drop(columns=['TARGET']), new_train['TARGET'])

In [None]:
# The meta-model was fitted on the two base-model prediction columns, so it must
# be fed those columns; passing val_x (the full feature matrix) fails with a
# feature-count mismatch.
# NOTE(review): these are the same rows the meta-model was trained on, so the
# resulting predictions are in-sample and optimistic.
pred_stacking = stacking.predict(new_train[['pred_lgbm', 'pred_xgb']])
# Floor predictions at 1, as for the other models.
pred_stacking[pred_stacking < 1] = 1
pred_stacking

In [None]:
# Base-model predictions on the test set, used as input to the stacking model.
# They are post-processed exactly like the meta-features the stacker was fitted
# on (floored at 1, then rounded). The original floored at 0 and skipped the
# rounding, so the stacker saw a different feature distribution at inference.
pred_lgbm_test = lgbm.predict(test)
pred_lgbm_test[pred_lgbm_test < 1] = 1

pred_xgb_test = xgb.predict(test)
pred_xgb_test[pred_xgb_test < 1] = 1

new_test = pd.DataFrame(
    {
        'pred_lgbm': np.round(pred_lgbm_test),
        'pred_xgb': np.round(pred_xgb_test),
    }
)

In [None]:
# Final test-set predictions from the stacking model, one per row of new_test.
Pred = stacking.predict(new_test)

In [None]:
# Build the submission from the raw test file.
test1 = pd.read_csv('./test.csv')

# Rows with bounced == 1 were dropped from `test` before modelling, so Pred has
# one value per *non-bounced* row, in file order. Assigning Pred to the whole
# frame fails on length mismatch.
not_bounced = ~(test1['bounced'] == 1)
if int(not_bounced.sum()) != len(Pred):
    raise ValueError(
        f"Pred has {len(Pred)} rows but test.csv has {int(not_bounced.sum())} non-bounced rows"
    )

# Bounced sessions get the constant TARGET of 1; every other row gets its model
# prediction. .loc is used because `test1[mask]['TARGET'] = 1` is chained
# assignment on a temporary copy and never modifies test1.
test1['TARGET'] = 1.0
test1.loc[not_bounced, 'TARGET'] = Pred

result = test1[["sessionID", "TARGET"]]
result.to_csv('./lgbm_xgb.csv', index=False)