# Data Loader

In [1]:
import pandas as pd
import argparse
from data_loader import load_data

DATA_PATH = '../data/WA_Fn-UseC_-Telco-Customer-Churn.csv'
x, y, num_features, cat_features, cat_enc_features, cat_orders = load_data()

✅ Connected to BigQuery. Target table: axial-entropy-351511.telco_customers_churn.customers_summary
✅ Successfully loaded 7063 rows from BigQuery.


# Preprocessing Pipeline

In [2]:
from preprocessor_pipeline import build_preprocessor
from sklearn.model_selection import train_test_split

# Load data
X, y, num_features, cat_features, cat_enc_features, cat_orders = load_data()

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Build preprocessor
preprocessor = build_preprocessor(
    num_features=num_features,
    cat_features=cat_features,
    cat_enc_features=cat_enc_features,
    cat_orders=cat_orders,
    remainder='drop'
)

# Fit and transform training data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

✅ Connected to BigQuery. Target table: axial-entropy-351511.telco_customers_churn.customers_summary
✅ Successfully loaded 7063 rows from BigQuery.
[ColumnTransformer] ........... (1 of 3) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 3) Processing cat, total=   0.0s
[ColumnTransformer] ....... (3 of 3) Processing cat_enc, total=   0.0s


In [3]:
# Get feature names from each pipeline
num_cols = num_features  # numerical features
cat_cols = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(cat_features)  # one-hot encoded categorical features
cat_enc_cols = cat_enc_features  # already encoded categorical features

# Combine all feature names
all_feature_names= list(num_cols) + list(cat_cols) + list(cat_enc_cols)

# Create DataFrame from the transformed arrays
X_train_df = pd.DataFrame(X_train_processed, columns=all_feature_names)
X_test_df= pd.DataFrame(X_test_processed, columns=all_feature_names)

print('Shape:',X_train_df.shape)
X_train_df.head()

Shape: (5650, 16)


Unnamed: 0,tenure,TotalCharges,MonthlyCharges,Contract,OnlineSecurity,TechSupport,InternetService,PaymentMethod,OnlineBackup,DeviceProtection,StreamingMovies,StreamingTV,PaperlessBilling,Dependents,Partner,SeniorCitizen
0,0.638889,0.134437,0.066667,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.263889,0.193994,0.713433,0.0,1.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0
2,0.180556,0.151428,0.820896,0.0,1.0,2.0,2.0,0.0,1.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0
3,0.027778,0.016034,0.518408,0.0,1.0,1.0,2.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
4,0.875,0.75997,0.858209,1.0,2.0,2.0,2.0,3.0,2.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0


# train model

`data_loader` and `preprocessing_pipeline` are already in the train model function so we just need to call train model

In [4]:
from train import train_model

train_model()

TotalCharges successfully converted to numeric.
[ColumnTransformer] ........... (1 of 3) Processing num, total=   0.0s
[ColumnTransformer] ........... (2 of 3) Processing cat, total=   0.0s
[ColumnTransformer] ....... (3 of 3) Processing cat_enc, total=   0.0s
Training completed. Model saved to 'models/best_pipeline.pkl'
Best hyperparameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.3, 'model__max_depth': 3, 'model__min_child_weight': 1, 'model__n_estimators': 9, 'model__subsample': 1.0}
Best recall score: 0.8244942724835486


In [7]:
# !pip install dill

Collecting dill
  Using cached dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.4.0-py3-none-any.whl (119 kB)
Installing collected packages: dill
Successfully installed dill-0.4.0


In [None]:
# load model
import joblib

model= joblib.load('../models/best_model_20251016.joblib')

# prediksi train dan test data kemudian evaluasi

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# cetak best param


from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred_train,zero_division=0))
print(classification_report(y_test, y_pred_test,zero_division=0))

              precision    recall  f1-score   support

           0       0.92      0.68      0.78      4147
           1       0.49      0.83      0.62      1503

    accuracy                           0.72      5650
   macro avg       0.70      0.76      0.70      5650
weighted avg       0.80      0.72      0.74      5650

              precision    recall  f1-score   support

           0       0.92      0.69      0.79      1037
           1       0.49      0.84      0.62       376

    accuracy                           0.73      1413
   macro avg       0.71      0.76      0.70      1413
weighted avg       0.81      0.73      0.74      1413

