In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import xgboost as xgb
import pickle

In [13]:
# Load the raw dataset from CSV into a DataFrame and preview the first rows.
# NOTE(review): the path is absolute ('/content/...'), presumably a Colab
# session directory; confirm or parameterize before running elsewhere.
df = pd.read_csv('/content/data_D.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,churn
0,0,106473,15639576,Sarratt,652.0,France,Female,65.0,3,0.0,2,1,1,136592.24,0
1,1,62345,15769582,Hanson,464.0,France,Male,35.0,4,0.0,1,0,0,99505.75,1
2,2,126615,15675888,Austin,620.0,Germany,Female,39.0,6,129401.87,2,1,1,102681.32,1
3,3,35909,15786617,Tuan,598.0,France,Female,30.0,7,0.0,2,1,0,141210.18,0
4,4,45175,15757310,Li Fonti,682.0,Germany,Female,46.0,4,107720.57,1,0,0,93832.33,1


In [14]:
# Show each column's dtype and non-null count to spot missing values early.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41259 entries, 0 to 41258
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       41259 non-null  int64  
 1   id               41259 non-null  int64  
 2   CustomerId       41259 non-null  int64  
 3   Surname          41259 non-null  object 
 4   CreditScore      41251 non-null  float64
 5   Geography        41259 non-null  object 
 6   Gender           41259 non-null  object 
 7   Age              41259 non-null  float64
 8   Tenure           41259 non-null  int64  
 9   Balance          41259 non-null  float64
 10  NumOfProducts    41259 non-null  int64  
 11  HasCrCard        41259 non-null  int64  
 12  IsActiveMember   41259 non-null  int64  
 13  EstimatedSalary  41259 non-null  float64
 14  churn            41259 non-null  int64  
dtypes: float64(4), int64(8), object(3)
memory usage: 4.7+ MB


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,id,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,churn
count,41259.0,41259.0,41259.0,41251.0,41259.0,41259.0,41259.0,41259.0,41259.0,41259.0,41259.0,41259.0
mean,20629.0,82171.693231,15692210.0,656.278733,38.060254,4.999127,55712.496615,1.554328,0.754405,0.496667,112308.502138,0.211324
std,11910.591715,47498.739183,71379.74,80.062353,8.801094,2.80293,62858.019091,0.545629,0.430445,0.499995,50367.87451,0.408252
min,0.0,3.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,10314.5,41098.5,15633470.0,598.0,32.0,3.0,0.0,1.0,1.0,0.0,74580.8,0.0
50%,20629.0,81851.0,15690130.0,659.0,37.0,5.0,0.0,2.0,1.0,0.0,117036.38,0.0
75%,30943.5,123064.5,15757140.0,710.0,42.0,7.0,120165.3,2.0,1.0,1.0,154811.29,0.0
max,41258.0,165033.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [16]:
# Count the missing values in every column.
df.isna().sum()

Unnamed: 0         0
id                 0
CustomerId         0
Surname            0
CreditScore        8
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
churn              0
dtype: int64

In [17]:
# Remove the leftover index column and the identifier-style columns;
# none of them is listed among the model features further down.
data_cleaned = df.drop(
    labels=['Unnamed: 0', 'id', 'CustomerId', 'Surname'],
    axis='columns',
)

In [18]:
# Replace the missing CreditScore values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: the in-place form is chained
# assignment, which raises a FutureWarning in pandas 2.x and leaves
# data_cleaned unchanged once Copy-on-Write is enabled.
# NOTE(review): the mean is computed over every row, before the train/test
# split below, so the imputed value also reflects test rows.
data_cleaned['CreditScore'] = data_cleaned['CreditScore'].fillna(
    data_cleaned['CreditScore'].mean()
)

In [19]:
# Columns handled as numeric features.
numerical_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]
# Columns handled as categorical features.
categorical_cols = [
    'Geography',
    'Gender',
]

In [20]:
# Column-wise preprocessing: standardize the numeric columns and one-hot
# encode the categorical ones. Each entry is (name, transformer, columns).
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(), categorical_cols),
])

In [21]:
# Separate the 'churn' target from the features, then hold out 20% of the
# rows for evaluation; the fixed seed keeps the split reproducible.
y = data_cleaned['churn']
X = data_cleaned.drop(columns=['churn'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Wrap the column preprocessor in a single-step Pipeline.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
# Fit the preprocessing on the training split only, then apply the
# already-fitted transform to the test split, so nothing is fitted on test rows.
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

# Display both shapes to confirm the splits have the same number of columns.
X_train_transformed.shape, X_test_transformed.shape

((33007, 13), (8252, 13))

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from xgboost import XGBClassifier

# Two candidate classifiers, both seeded with the same value for reproducibility.
rf_model = RandomForestClassifier(
    random_state=42,
)
xgb_model = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)

In [24]:
# Train the Random Forest on the preprocessed training split,
# then predict labels for the preprocessed test split.
rf_model.fit(X=X_train_transformed, y=y_train)
rf_predictions = rf_model.predict(X=X_test_transformed)

In [25]:
# Train the XGBoost classifier on the preprocessed training split,
# then predict labels for the preprocessed test split.
xgb_model.fit(X=X_train_transformed, y=y_train)
xgb_predictions = xgb_model.predict(X=X_test_transformed)

In [29]:
def _binary_classification_scores(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels for the same rows.

    Returns:
        dict mapping 'Accuracy', 'Precision', 'Recall' and 'F1 Score'
        (in that order) to the corresponding score.
    """
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}


# Score both models against the same held-out labels.
metrics_rf = _binary_classification_scores(y_test, rf_predictions)
metrics_xgb = _binary_classification_scores(y_test, xgb_predictions)

print("Random Forest Model : ", metrics_rf)
print("XGBoost Model : ", metrics_xgb)

Random Forest Model :  {'Accuracy': 0.8540959767329133, 'Precision': 0.7198027937551356, 'Recall': 0.503737780333525, 'F1 Score': 0.5926928281461434}
XGBoost Model :  {'Accuracy': 0.8622152205525934, 'Precision': 0.7344236760124611, 'Recall': 0.5422656699252444, 'F1 Score': 0.6238835593781012}


In [30]:
import pickle

# Save the trained XGBoost model to disk in pickle format.
xgb_model_filename = 'best_data.pkl'
with open(xgb_model_filename, 'wb') as file:
    # Serialize the fitted estimator itself. Passing the filename here would
    # only store the string 'best_data.pkl', and the model would be lost.
    pickle.dump(xgb_model, file)
# NOTE(review): xgb_model was trained on X_train_transformed, so anyone loading
# this file also needs the fitted `pipeline` to preprocess new rows; consider
# persisting it alongside the model.

xgb_model_filename


'best_data.pkl'