In [37]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.under_sampling import TomekLinks

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
train_df = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv')
other_df = pd.read_csv('/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv')
submission_df = pd.read_csv('/kaggle/input/playground-series-s4e1/sample_submission.csv')
train_df.head()

test_ids = test_df['id']

dfs = [train_df, test_df, other_df]
for df in dfs:
    df.drop(columns=['CustomerId', 'Surname',], inplace=True)

train_df.drop(columns=['id'], inplace=True)
test_df.drop(columns=['id'], inplace=True)
other_df.drop(columns=['RowNumber'], inplace=True)
other_df.dropna(inplace=True)

/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv
/kaggle/input/playground-series-s4e1/sample_submission.csv
/kaggle/input/playground-series-s4e1/train.csv
/kaggle/input/playground-series-s4e1/test.csv


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
5,645,Spain,Male,44.0,8,113755.78,2,1.0,0.0,149756.71,1


In [38]:
train_df = pd.concat([train_df, other_df])
train_df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...
9997,709,France,Female,36.0,7,0.00,1,0.0,1.0,42085.58,1
9998,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
9999,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
10000,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0


In [39]:
train_df['Gender'] = np.where(train_df['Gender'] == 'Female', 1, 0)
test_df['Gender'] = np.where(test_df['Gender'] == 'Female', 1, 0)

cols_to_scale = ['CreditScore', 'Balance', 'EstimatedSalary']

train_df = pd.get_dummies(train_df, columns=['Geography',], dtype=int,)
test_df = pd.get_dummies(test_df, columns=['Geography', ], dtype=int,)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,668,0,33.0,3,0.00,2,1.0,0.0,181449.97,0,1,0,0
1,627,0,33.0,1,0.00,2,1.0,1.0,49503.50,0,1,0,0
2,678,0,40.0,10,0.00,2,1.0,0.0,184866.69,0,1,0,0
3,581,0,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,0,0
4,716,0,33.0,5,0.00,2,1.0,1.0,15068.83,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9997,709,1,36.0,7,0.00,1,0.0,1.0,42085.58,1,1,0,0
9998,772,0,42.0,3,75075.31,2,1.0,0.0,92888.52,1,0,1,0
9999,772,0,42.0,3,75075.31,2,1.0,0.0,92888.52,1,0,1,0
10000,792,1,28.0,4,130142.79,1,1.0,0.0,38190.78,0,1,0,0


In [40]:
seed = 5

In [41]:

X = train_df.drop(columns='Exited')
y = train_df['Exited']

#tl = TomekLinks()
#X, y = tl.fit_resample(X, y)
#kmean = KMeans(n_clusters=2)
#X['Cluster_labels'] = kmean.fit_predict(X)

kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)

In [42]:
import optuna
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

In [43]:
xgb_params = {'lambda': 0.4013321226005979, 'alpha': 0.2083702010712507, 'max_depth': 6,
              'eta': 0.4963877014650146, 'gamma': 0.31692957503691443,
              'colsample_bytree': 0.7961833027642351, 'subsample': 0.9204737745273246,
              'min_child_weight': 8.579438365295818, 'objective': 'binary:logistic',
              'eval_metric': 'auc', 'booster': 'gbtree', }

catboost_params = {'iterations': 865, 'learning_rate': 0.10633938642544231, 'depth': 5,
                   'l2_leaf_reg': 0.24462536512236643, 'border_count': 164,
                   'bagging_temperature': 7.399499580237206, 'objective': 'Logloss',
                   'eval_metric': 'AUC', 'verbose': False}

lgb_params = {'lambda_l1': 8.383333330514086, 'lambda_l2': 0.7332939315361802,
              'num_leaves': 40, 'feature_fraction': 0.7609143584181806,
              'bagging_fraction': 0.9637290384155398, 'bagging_freq': 4,
              'min_child_samples': 90, 'objective': 'binary','metric': 'auc',
              'verbosity': -1,'boosting_type': 'gbdt',}

In [44]:
lgbm = lgb.LGBMClassifier(**lgb_params)
catb = CatBoostClassifier(**catboost_params)
xgbc = xgb.XGBClassifier(**xgb_params)
voter = VotingClassifier([('lgb', lgbm), ('cat', catb), ('xgb', xgbc)], voting='soft')

In [45]:
voter.fit(X, y)
results = voter.predict_proba(test_df)
results_to_sub = results[:, 1]
submission_df['Exited'] = results_to_sub
submission_df.head()
submission_df.to_csv('/kaggle/working/bank_churn_submission_5.csv', index=False)