In [2]:
import os
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb


In [4]:
import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)


In [70]:
# read datasets

train = pd.read_csv(r"E:\1. Projects\ML-internship\customer-churn-prediction\Research\data\train.csv", index_col= False)

In [72]:
train.head()

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,9470,15674573,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98,0
1,4798,15666173,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81,0
2,943,15804586,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69,1
3,9466,15815259,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13,0
4,4147,15698246,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49,0


In [74]:
train = train.drop(columns = ['rownumber','customerid'] )

In [76]:
train.head()

Unnamed: 0,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,Gearhart,713,France,Female,25.0,4,121172.97,1,1.0,1.0,56268.98,0
1,Chidumaga,793,Germany,Female,32.0,1,96408.98,1,1.0,1.0,138191.81,0
2,Lin,376,France,Female,46.0,6,0.0,1,1.0,0.0,157333.69,1
3,Fang,835,France,Female,56.0,2,0.0,2,1.0,1.0,39820.13,0
4,Gordon,658,France,Female,24.0,2,0.0,2,1.0,1.0,84694.49,0


In [78]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

In [80]:
train.gender = encoder.fit_transform(train[['gender']])

In [81]:
rare_label_encoder = RareLabelEncoder(tol = 0.0026, replace_with="others", n_categories=70)

In [84]:
train.surname = rare_label_encoder.fit_transform(train[['surname']])

In [86]:
train.surname = encoder.fit_transform(train[['surname']])

In [88]:
train.geography = encoder.fit_transform(train[['geography']])

In [90]:
min_max_scaler = MinMaxScaler() 

In [92]:
train.balance = min_max_scaler.fit_transform(train[['balance']])

In [94]:
train.estimatedsalary = min_max_scaler.fit_transform(train[['estimatedsalary']])

In [96]:
train.sample(10)

Unnamed: 0,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
3822,0.0,753,0.0,0.0,32.0,5,0.670777,1,1.0,0.0,0.743947,0
5892,0.0,685,0.0,0.0,38.0,4,0.468976,2,1.0,1.0,0.510682,0
1072,0.0,765,1.0,0.0,34.0,9,0.385235,1,0.0,0.0,0.691262,0
3028,0.0,835,0.0,0.0,38.0,7,0.364214,1,0.0,0.0,0.879499,0
2682,0.0,563,0.0,0.0,24.0,7,0.0,2,0.0,0.0,0.081111,0
406,0.0,751,1.0,0.0,42.0,4,0.0,2,1.0,1.0,0.406912,0
2296,0.0,597,1.0,1.0,33.0,6,0.569256,2,0.0,0.0,0.373934,0
2457,0.0,581,0.0,0.0,34.0,1,0.0,2,0.0,1.0,0.405574,0
5062,0.0,639,0.0,0.0,38.0,9,0.546308,1,1.0,1.0,0.409006,0
3868,0.0,613,0.0,0.0,37.0,3,0.720059,1,0.0,1.0,0.026247,0


In [100]:
from sklearn.metrics import accuracy_score

In [102]:
train.head()

Unnamed: 0,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,0.0,713,1.0,1.0,25.0,4,0.508302,1,1.0,1.0,0.280972,0
1,0.0,793,0.0,1.0,32.0,1,0.404421,1,1.0,1.0,0.69082,0
2,0.0,376,1.0,1.0,46.0,6,0.0,1,1.0,0.0,0.786584,1
3,0.0,835,1.0,1.0,56.0,2,0.0,2,1.0,1.0,0.198681,0
4,0.0,658,1.0,1.0,24.0,2,0.0,2,1.0,1.0,0.423181,0


In [112]:
x_train = train.iloc[:, 0:-1]
y_train = train.iloc[:, -1]

In [110]:
x_train.head()

Unnamed: 0,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,0.0,713,1.0,1.0,25.0,4,0.508302,1,1.0,1.0,0.280972
1,0.0,793,0.0,1.0,32.0,1,0.404421,1,1.0,1.0,0.69082
2,0.0,376,1.0,1.0,46.0,6,0.0,1,1.0,0.0,0.786584
3,0.0,835,1.0,1.0,56.0,2,0.0,2,1.0,1.0,0.198681
4,0.0,658,1.0,1.0,24.0,2,0.0,2,1.0,1.0,0.423181


In [114]:
y_train

0       0
1       0
2       1
3       0
4       0
       ..
6391    0
6392    0
6393    0
6394    0
6395    1
Name: exited, Length: 6396, dtype: int64

In [116]:
model = xgb.XGBClassifier(use_label_encoder = False, eval_metric ='logloss',
                         n_estimators = 100, max_depth = 3, learning_rate = 0.1)

In [118]:
model.fit(x_train, y_train)

Parameters: { "use_label_encoder" } are not used.

