In [None]:
import pandas as pd

In [20]:
c_df = pd.read_csv('../data/Telco_customer_churn_status.csv')
s_df = pd.read_csv('../data/Telco_customer_churn_services.csv')
d_df = pd.read_csv('../data/Telco_customer_churn_demographics.csv')

In [23]:
s_df = s_df[['Customer ID', 'Internet Service', 'Internet Type', 'Phone Service', 'Multiple Lines', 'Online Security', 'Device Protection Plan', 'Premium Tech Support', 'Contract', 'Avg Monthly GB Download', 'Total Long Distance Charges']]
d_df = d_df[['Customer ID', 'Age', 'Gender', 'Married', 'Dependents']]
c_df = c_df[['Customer ID', 'Churn Value']]

In [25]:
findf = pd.merge(s_df, d_df, how='inner', left_on='Customer ID', right_on='Customer ID')
findf = pd.merge(findf, c_df, how='inner', left_on='Customer ID', right_on='Customer ID')
findf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Customer ID                  7043 non-null   object 
 1   Internet Service             7043 non-null   object 
 2   Internet Type                5517 non-null   object 
 3   Phone Service                7043 non-null   object 
 4   Multiple Lines               7043 non-null   object 
 5   Online Security              7043 non-null   object 
 6   Device Protection Plan       7043 non-null   object 
 7   Premium Tech Support         7043 non-null   object 
 8   Contract                     7043 non-null   object 
 9   Avg Monthly GB Download      7043 non-null   int64  
 10  Total Long Distance Charges  7043 non-null   float64
 11  Age                          7043 non-null   int64  
 12  Gender                       7043 non-null   object 
 13  Married           

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

target = "Churn Value"

X = findf.drop(columns=[target, "Customer ID"])
y = findf[target]

cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    random_state=42
)

clf = Pipeline(steps=[("preprocess", preprocess),
                     ("model", model)])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf.fit(X_train, y_train)

proba = clf.predict_proba(X_test)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, proba))


ROC AUC: 0.8759254953628356
